diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,241369 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 34477, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 10.902585760650759, + "learning_rate": 9.66183574879227e-09, + "loss": 1.3826, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 11.334472300433701, + "learning_rate": 1.932367149758454e-08, + "loss": 1.4349, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 32.582421525759806, + "learning_rate": 2.8985507246376815e-08, + "loss": 1.4612, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 10.446184549629798, + "learning_rate": 3.864734299516908e-08, + "loss": 1.422, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 13.263614152087957, + "learning_rate": 4.8309178743961356e-08, + "loss": 1.4392, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 2.7562363144839326, + "learning_rate": 5.797101449275363e-08, + "loss": 0.5817, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 11.124312222492104, + "learning_rate": 6.76328502415459e-08, + "loss": 1.4716, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 10.210958443494112, + "learning_rate": 7.729468599033816e-08, + "loss": 1.4321, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 12.672495401240727, + "learning_rate": 8.695652173913044e-08, + "loss": 1.3483, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 12.116412998372667, + "learning_rate": 9.661835748792271e-08, + "loss": 1.4066, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 12.982203215947148, + "learning_rate": 1.0628019323671499e-07, + "loss": 1.3687, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 11.710384644286911, + "learning_rate": 1.1594202898550726e-07, + "loss": 1.3987, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 10.874063535027856, + "learning_rate": 1.2560386473429953e-07, + "loss": 1.4071, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 12.761928136279126, + "learning_rate": 1.352657004830918e-07, + "loss": 1.4515, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 12.996175363203852, + "learning_rate": 1.4492753623188408e-07, + "loss": 1.4064, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 11.322090637600077, + "learning_rate": 1.5458937198067633e-07, + "loss": 1.4511, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 10.49364809721525, + "learning_rate": 1.6425120772946863e-07, + "loss": 1.4309, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 10.52341544957961, + "learning_rate": 1.7391304347826088e-07, + "loss": 1.3539, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 9.874116008072514, + "learning_rate": 1.8357487922705315e-07, + "loss": 1.4114, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 10.742639079204595, + "learning_rate": 1.9323671497584542e-07, + "loss": 1.4801, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 9.90729082514578, + "learning_rate": 2.0289855072463767e-07, + "loss": 1.3878, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 10.942852117150515, + "learning_rate": 2.1256038647342997e-07, + "loss": 1.4661, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 10.005743703222267, + "learning_rate": 2.2222222222222224e-07, + "loss": 1.4229, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 9.761815997765936, + "learning_rate": 2.3188405797101452e-07, + "loss": 1.4014, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 8.946146514945893, + "learning_rate": 2.4154589371980677e-07, + "loss": 1.4634, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 9.398585606703989, + "learning_rate": 2.5120772946859907e-07, + "loss": 1.4026, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 10.68936991748351, + "learning_rate": 2.608695652173913e-07, + "loss": 1.4067, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 9.406194909092136, + "learning_rate": 2.705314009661836e-07, + "loss": 1.4122, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 9.065522206431622, + "learning_rate": 2.8019323671497586e-07, + "loss": 1.4059, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 7.807723580610007, + "learning_rate": 2.8985507246376816e-07, + "loss": 1.3636, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 8.631919950816048, + "learning_rate": 2.995169082125604e-07, + "loss": 1.3489, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 8.17244753304758, + "learning_rate": 3.0917874396135266e-07, + "loss": 1.3182, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 7.6643688884801495, + "learning_rate": 3.1884057971014496e-07, + "loss": 1.2818, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 8.410898922067737, + "learning_rate": 3.2850241545893726e-07, + "loss": 1.2887, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 6.935457651498026, + "learning_rate": 3.3816425120772945e-07, + "loss": 1.3182, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 7.6127326218164315, + "learning_rate": 3.4782608695652175e-07, + "loss": 1.3334, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 6.907476689475056, + "learning_rate": 3.5748792270531405e-07, + "loss": 1.2999, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 2.2110830438514313, + "learning_rate": 3.671497584541063e-07, + "loss": 0.5416, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 6.776959505318949, + "learning_rate": 3.768115942028986e-07, + "loss": 1.314, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 6.4780721889402795, + "learning_rate": 3.8647342995169085e-07, + "loss": 1.303, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 6.045130969379203, + "learning_rate": 3.9613526570048315e-07, + "loss": 1.2868, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 5.161201960870711, + "learning_rate": 4.0579710144927534e-07, + "loss": 1.2769, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 5.221918419647352, + "learning_rate": 4.1545893719806764e-07, + "loss": 1.1963, + "step": 43 + }, + { + "epoch": 0.0, + "grad_norm": 5.000572041254113, + "learning_rate": 4.2512077294685994e-07, + "loss": 1.162, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 5.251171090907864, + "learning_rate": 4.347826086956522e-07, + "loss": 1.2013, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 4.736477820453483, + "learning_rate": 4.444444444444445e-07, + "loss": 1.2328, + "step": 46 + }, + { + "epoch": 0.0, + "grad_norm": 4.5723320856171625, + "learning_rate": 4.5410628019323674e-07, + "loss": 1.191, + "step": 47 + }, + { + "epoch": 0.0, + "grad_norm": 9.852973300666672, + "learning_rate": 4.6376811594202904e-07, + "loss": 1.1809, + "step": 48 + }, + { + "epoch": 0.0, + "grad_norm": 5.032628525468694, + "learning_rate": 4.7342995169082134e-07, + "loss": 1.2258, + "step": 49 + }, + { + "epoch": 0.0, + "grad_norm": 4.353624092804532, + "learning_rate": 4.830917874396135e-07, + "loss": 1.1636, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 4.848875128595678, + "learning_rate": 4.927536231884058e-07, + "loss": 1.1713, + "step": 51 + }, + { + "epoch": 0.0, + "grad_norm": 5.112385707140277, + "learning_rate": 5.024154589371981e-07, + "loss": 1.1563, + "step": 52 + }, + { + "epoch": 0.0, + "grad_norm": 3.759387774304104, + "learning_rate": 5.120772946859904e-07, + "loss": 1.0965, + "step": 53 + }, + { + "epoch": 0.0, + "grad_norm": 4.2198450674567765, + "learning_rate": 5.217391304347826e-07, + "loss": 1.16, + "step": 54 + }, + { + "epoch": 0.0, + "grad_norm": 7.435933948784844, + "learning_rate": 5.314009661835749e-07, + "loss": 1.1361, + "step": 55 + }, + { + "epoch": 0.0, + "grad_norm": 3.8628198569717718, + "learning_rate": 5.410628019323672e-07, + "loss": 1.1206, + "step": 56 + }, + { + "epoch": 0.0, + "grad_norm": 6.785969101685286, + "learning_rate": 5.507246376811594e-07, + "loss": 1.0852, + "step": 57 + }, + { + "epoch": 0.0, + "grad_norm": 6.533497749036434, + "learning_rate": 5.603864734299517e-07, + "loss": 1.0724, + "step": 58 + }, + { + "epoch": 0.0, + "grad_norm": 4.2410935002604075, + "learning_rate": 5.70048309178744e-07, + "loss": 1.1244, + "step": 59 + }, + { + "epoch": 0.0, + "grad_norm": 4.557337779581996, + "learning_rate": 5.797101449275363e-07, + "loss": 1.1023, + "step": 60 + }, + { + "epoch": 0.0, + "grad_norm": 3.7410737450426583, + "learning_rate": 5.893719806763285e-07, + "loss": 1.0745, + "step": 61 + }, + { + "epoch": 0.0, + "grad_norm": 5.688350034795917, + "learning_rate": 5.990338164251208e-07, + "loss": 1.1123, + "step": 62 + }, + { + "epoch": 0.0, + "grad_norm": 4.328605564330714, + "learning_rate": 6.086956521739131e-07, + "loss": 1.1644, + "step": 63 + }, + { + "epoch": 0.0, + "grad_norm": 3.9086608112631827, + "learning_rate": 6.183574879227053e-07, + "loss": 1.0614, + "step": 64 + }, + { + "epoch": 0.0, + "grad_norm": 3.809100408732758, + "learning_rate": 6.280193236714976e-07, + "loss": 1.0645, + "step": 65 + }, + { + "epoch": 0.0, + "grad_norm": 12.931131473524957, + "learning_rate": 6.376811594202899e-07, + "loss": 1.0693, + "step": 66 + }, + { + "epoch": 0.0, + "grad_norm": 5.402187274049444, + "learning_rate": 6.473429951690821e-07, + "loss": 1.0492, + "step": 67 + }, + { + "epoch": 0.0, + "grad_norm": 3.6933473807732375, + "learning_rate": 6.570048309178745e-07, + "loss": 1.0684, + "step": 68 + }, + { + "epoch": 0.0, + "grad_norm": 4.519167874113205, + "learning_rate": 6.666666666666667e-07, + "loss": 1.1003, + "step": 69 + }, + { + "epoch": 0.0, + "grad_norm": 2.844312772418968, + "learning_rate": 6.763285024154589e-07, + "loss": 0.9941, + "step": 70 + }, + { + "epoch": 0.0, + "grad_norm": 3.488300131291408, + "learning_rate": 6.859903381642513e-07, + "loss": 1.0327, + "step": 71 + }, + { + "epoch": 0.0, + "grad_norm": 3.9793301170637476, + "learning_rate": 6.956521739130435e-07, + "loss": 1.0361, + "step": 72 + }, + { + "epoch": 0.0, + "grad_norm": 3.3497348081867435, + "learning_rate": 7.053140096618358e-07, + "loss": 1.0637, + "step": 73 + }, + { + "epoch": 0.0, + "grad_norm": 2.692154789172541, + "learning_rate": 7.149758454106281e-07, + "loss": 1.0553, + "step": 74 + }, + { + "epoch": 0.0, + "grad_norm": 3.6527631076513054, + "learning_rate": 7.246376811594204e-07, + "loss": 1.0331, + "step": 75 + }, + { + "epoch": 0.0, + "grad_norm": 2.1652343765232613, + "learning_rate": 7.342995169082126e-07, + "loss": 0.5513, + "step": 76 + }, + { + "epoch": 0.0, + "grad_norm": 3.6505288977537846, + "learning_rate": 7.439613526570048e-07, + "loss": 0.9955, + "step": 77 + }, + { + "epoch": 0.0, + "grad_norm": 5.15875599445925, + "learning_rate": 7.536231884057972e-07, + "loss": 0.9819, + "step": 78 + }, + { + "epoch": 0.0, + "grad_norm": 3.249338846465422, + "learning_rate": 7.632850241545894e-07, + "loss": 0.9516, + "step": 79 + }, + { + "epoch": 0.0, + "grad_norm": 8.543051147335913, + "learning_rate": 7.729468599033817e-07, + "loss": 1.0343, + "step": 80 + }, + { + "epoch": 0.0, + "grad_norm": 3.2400652799998553, + "learning_rate": 7.82608695652174e-07, + "loss": 0.9879, + "step": 81 + }, + { + "epoch": 0.0, + "grad_norm": 3.77238609248563, + "learning_rate": 7.922705314009663e-07, + "loss": 0.9595, + "step": 82 + }, + { + "epoch": 0.0, + "grad_norm": 4.498127374678085, + "learning_rate": 8.019323671497585e-07, + "loss": 0.9528, + "step": 83 + }, + { + "epoch": 0.0, + "grad_norm": 4.436921656942298, + "learning_rate": 8.115942028985507e-07, + "loss": 1.0013, + "step": 84 + }, + { + "epoch": 0.0, + "grad_norm": 3.5671775962958403, + "learning_rate": 8.212560386473431e-07, + "loss": 0.9162, + "step": 85 + }, + { + "epoch": 0.0, + "grad_norm": 1.701524271732804, + "learning_rate": 8.309178743961353e-07, + "loss": 0.5166, + "step": 86 + }, + { + "epoch": 0.0, + "grad_norm": 3.1916334396269073, + "learning_rate": 8.405797101449276e-07, + "loss": 0.9655, + "step": 87 + }, + { + "epoch": 0.0, + "grad_norm": 3.6093376671619715, + "learning_rate": 8.502415458937199e-07, + "loss": 0.9476, + "step": 88 + }, + { + "epoch": 0.0, + "grad_norm": 2.6792268990273063, + "learning_rate": 8.599033816425122e-07, + "loss": 0.9307, + "step": 89 + }, + { + "epoch": 0.0, + "grad_norm": 3.984287859995814, + "learning_rate": 8.695652173913044e-07, + "loss": 0.9134, + "step": 90 + }, + { + "epoch": 0.0, + "grad_norm": 3.7175176892584827, + "learning_rate": 8.792270531400967e-07, + "loss": 0.9664, + "step": 91 + }, + { + "epoch": 0.0, + "grad_norm": 2.421911573866612, + "learning_rate": 8.88888888888889e-07, + "loss": 0.922, + "step": 92 + }, + { + "epoch": 0.0, + "grad_norm": 4.021780569193719, + "learning_rate": 8.985507246376813e-07, + "loss": 0.9462, + "step": 93 + }, + { + "epoch": 0.0, + "grad_norm": 3.202046222531195, + "learning_rate": 9.082125603864735e-07, + "loss": 0.9803, + "step": 94 + }, + { + "epoch": 0.0, + "grad_norm": 4.198201555893774, + "learning_rate": 9.178743961352659e-07, + "loss": 0.8845, + "step": 95 + }, + { + "epoch": 0.0, + "grad_norm": 8.54787372795916, + "learning_rate": 9.275362318840581e-07, + "loss": 0.967, + "step": 96 + }, + { + "epoch": 0.0, + "grad_norm": 3.0288153846756427, + "learning_rate": 9.371980676328503e-07, + "loss": 0.9805, + "step": 97 + }, + { + "epoch": 0.0, + "grad_norm": 3.2204757970383207, + "learning_rate": 9.468599033816427e-07, + "loss": 0.9108, + "step": 98 + }, + { + "epoch": 0.0, + "grad_norm": 3.310814060577355, + "learning_rate": 9.565217391304349e-07, + "loss": 0.9437, + "step": 99 + }, + { + "epoch": 0.0, + "grad_norm": 2.439087716916908, + "learning_rate": 9.66183574879227e-07, + "loss": 0.9227, + "step": 100 + }, + { + "epoch": 0.0, + "grad_norm": 5.273514921879792, + "learning_rate": 9.758454106280193e-07, + "loss": 0.9104, + "step": 101 + }, + { + "epoch": 0.0, + "grad_norm": 2.615495919183641, + "learning_rate": 9.855072463768117e-07, + "loss": 0.8825, + "step": 102 + }, + { + "epoch": 0.0, + "grad_norm": 2.6253087215239117, + "learning_rate": 9.951690821256039e-07, + "loss": 0.8849, + "step": 103 + }, + { + "epoch": 0.0, + "grad_norm": 3.6807880764731515, + "learning_rate": 1.0048309178743963e-06, + "loss": 0.9651, + "step": 104 + }, + { + "epoch": 0.0, + "grad_norm": 3.4683128430364603, + "learning_rate": 1.0144927536231885e-06, + "loss": 0.9961, + "step": 105 + }, + { + "epoch": 0.0, + "grad_norm": 3.569244596421863, + "learning_rate": 1.0241545893719809e-06, + "loss": 0.9385, + "step": 106 + }, + { + "epoch": 0.0, + "grad_norm": 3.706647104423181, + "learning_rate": 1.033816425120773e-06, + "loss": 0.8958, + "step": 107 + }, + { + "epoch": 0.0, + "grad_norm": 2.7371821778512246, + "learning_rate": 1.0434782608695653e-06, + "loss": 0.9243, + "step": 108 + }, + { + "epoch": 0.0, + "grad_norm": 4.183327667145081, + "learning_rate": 1.0531400966183577e-06, + "loss": 0.9134, + "step": 109 + }, + { + "epoch": 0.0, + "grad_norm": 2.6491573104224373, + "learning_rate": 1.0628019323671499e-06, + "loss": 0.8815, + "step": 110 + }, + { + "epoch": 0.0, + "grad_norm": 12.449979502359554, + "learning_rate": 1.072463768115942e-06, + "loss": 0.8931, + "step": 111 + }, + { + "epoch": 0.0, + "grad_norm": 3.5063077726530567, + "learning_rate": 1.0821256038647345e-06, + "loss": 0.9002, + "step": 112 + }, + { + "epoch": 0.0, + "grad_norm": 2.985428794914929, + "learning_rate": 1.0917874396135266e-06, + "loss": 0.8787, + "step": 113 + }, + { + "epoch": 0.0, + "grad_norm": 3.0477068997368346, + "learning_rate": 1.1014492753623188e-06, + "loss": 0.9474, + "step": 114 + }, + { + "epoch": 0.0, + "grad_norm": 2.798320882044652, + "learning_rate": 1.111111111111111e-06, + "loss": 0.846, + "step": 115 + }, + { + "epoch": 0.0, + "grad_norm": 3.1516189112907624, + "learning_rate": 1.1207729468599034e-06, + "loss": 0.9302, + "step": 116 + }, + { + "epoch": 0.0, + "grad_norm": 6.598346890128015, + "learning_rate": 1.1304347826086956e-06, + "loss": 0.867, + "step": 117 + }, + { + "epoch": 0.0, + "grad_norm": 3.043149379454441, + "learning_rate": 1.140096618357488e-06, + "loss": 0.8967, + "step": 118 + }, + { + "epoch": 0.0, + "grad_norm": 3.764701110356925, + "learning_rate": 1.1497584541062802e-06, + "loss": 0.8568, + "step": 119 + }, + { + "epoch": 0.0, + "grad_norm": 10.505158644598149, + "learning_rate": 1.1594202898550726e-06, + "loss": 0.8762, + "step": 120 + }, + { + "epoch": 0.0, + "grad_norm": 3.2359925472124185, + "learning_rate": 1.1690821256038648e-06, + "loss": 0.8627, + "step": 121 + }, + { + "epoch": 0.0, + "grad_norm": 3.1930381298753594, + "learning_rate": 1.178743961352657e-06, + "loss": 0.8855, + "step": 122 + }, + { + "epoch": 0.0, + "grad_norm": 2.3717513899657003, + "learning_rate": 1.1884057971014494e-06, + "loss": 0.8141, + "step": 123 + }, + { + "epoch": 0.0, + "grad_norm": 2.861869232131268, + "learning_rate": 1.1980676328502416e-06, + "loss": 0.8336, + "step": 124 + }, + { + "epoch": 0.0, + "grad_norm": 1.506906515790977, + "learning_rate": 1.2077294685990338e-06, + "loss": 0.5685, + "step": 125 + }, + { + "epoch": 0.0, + "grad_norm": 2.6045709514403255, + "learning_rate": 1.2173913043478262e-06, + "loss": 0.8942, + "step": 126 + }, + { + "epoch": 0.0, + "grad_norm": 2.633244006220314, + "learning_rate": 1.2270531400966184e-06, + "loss": 0.847, + "step": 127 + }, + { + "epoch": 0.0, + "grad_norm": 3.4243033985240174, + "learning_rate": 1.2367149758454106e-06, + "loss": 0.8279, + "step": 128 + }, + { + "epoch": 0.0, + "grad_norm": 7.789725198991392, + "learning_rate": 1.246376811594203e-06, + "loss": 0.9009, + "step": 129 + }, + { + "epoch": 0.0, + "grad_norm": 4.072371836841086, + "learning_rate": 1.2560386473429952e-06, + "loss": 0.8435, + "step": 130 + }, + { + "epoch": 0.0, + "grad_norm": 3.0649779232226595, + "learning_rate": 1.2657004830917876e-06, + "loss": 0.8639, + "step": 131 + }, + { + "epoch": 0.0, + "grad_norm": 3.0700587567698734, + "learning_rate": 1.2753623188405798e-06, + "loss": 0.7963, + "step": 132 + }, + { + "epoch": 0.0, + "grad_norm": 2.4193924026828966, + "learning_rate": 1.2850241545893722e-06, + "loss": 0.8605, + "step": 133 + }, + { + "epoch": 0.0, + "grad_norm": 3.202817937925971, + "learning_rate": 1.2946859903381642e-06, + "loss": 0.8311, + "step": 134 + }, + { + "epoch": 0.0, + "grad_norm": 6.422881435168387, + "learning_rate": 1.3043478260869566e-06, + "loss": 0.8437, + "step": 135 + }, + { + "epoch": 0.0, + "grad_norm": 3.664961525551387, + "learning_rate": 1.314009661835749e-06, + "loss": 0.8979, + "step": 136 + }, + { + "epoch": 0.0, + "grad_norm": 2.940602747359213, + "learning_rate": 1.323671497584541e-06, + "loss": 0.8119, + "step": 137 + }, + { + "epoch": 0.0, + "grad_norm": 2.8993137438288454, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.7771, + "step": 138 + }, + { + "epoch": 0.0, + "grad_norm": 2.7301275823863396, + "learning_rate": 1.3429951690821258e-06, + "loss": 0.8398, + "step": 139 + }, + { + "epoch": 0.0, + "grad_norm": 7.065443446461518, + "learning_rate": 1.3526570048309178e-06, + "loss": 0.8077, + "step": 140 + }, + { + "epoch": 0.0, + "grad_norm": 3.337869669238539, + "learning_rate": 1.3623188405797102e-06, + "loss": 0.8258, + "step": 141 + }, + { + "epoch": 0.0, + "grad_norm": 6.805132243399703, + "learning_rate": 1.3719806763285026e-06, + "loss": 0.8174, + "step": 142 + }, + { + "epoch": 0.0, + "grad_norm": 2.781684644333002, + "learning_rate": 1.3816425120772948e-06, + "loss": 0.8397, + "step": 143 + }, + { + "epoch": 0.0, + "grad_norm": 2.9496515986124017, + "learning_rate": 1.391304347826087e-06, + "loss": 0.7938, + "step": 144 + }, + { + "epoch": 0.0, + "grad_norm": 2.916239563766088, + "learning_rate": 1.4009661835748794e-06, + "loss": 0.8296, + "step": 145 + }, + { + "epoch": 0.0, + "grad_norm": 5.62163464532523, + "learning_rate": 1.4106280193236716e-06, + "loss": 0.8806, + "step": 146 + }, + { + "epoch": 0.0, + "grad_norm": 2.8153001177734907, + "learning_rate": 1.420289855072464e-06, + "loss": 0.8553, + "step": 147 + }, + { + "epoch": 0.0, + "grad_norm": 3.4221082674519856, + "learning_rate": 1.4299516908212562e-06, + "loss": 0.8199, + "step": 148 + }, + { + "epoch": 0.0, + "grad_norm": 3.551684982386612, + "learning_rate": 1.4396135265700484e-06, + "loss": 0.8063, + "step": 149 + }, + { + "epoch": 0.0, + "grad_norm": 3.535427019192855, + "learning_rate": 1.4492753623188408e-06, + "loss": 0.8039, + "step": 150 + }, + { + "epoch": 0.0, + "grad_norm": 3.1511241652929898, + "learning_rate": 1.4589371980676328e-06, + "loss": 0.8181, + "step": 151 + }, + { + "epoch": 0.0, + "grad_norm": 3.5502460738922115, + "learning_rate": 1.4685990338164252e-06, + "loss": 0.837, + "step": 152 + }, + { + "epoch": 0.0, + "grad_norm": 2.715414201401706, + "learning_rate": 1.4782608695652176e-06, + "loss": 0.8426, + "step": 153 + }, + { + "epoch": 0.0, + "grad_norm": 3.0356389235417254, + "learning_rate": 1.4879227053140096e-06, + "loss": 0.8409, + "step": 154 + }, + { + "epoch": 0.0, + "grad_norm": 3.677243849107699, + "learning_rate": 1.497584541062802e-06, + "loss": 0.8443, + "step": 155 + }, + { + "epoch": 0.0, + "grad_norm": 2.6572234911901322, + "learning_rate": 1.5072463768115944e-06, + "loss": 0.7816, + "step": 156 + }, + { + "epoch": 0.0, + "grad_norm": 2.824456592650191, + "learning_rate": 1.5169082125603866e-06, + "loss": 0.783, + "step": 157 + }, + { + "epoch": 0.0, + "grad_norm": 3.321298983797876, + "learning_rate": 1.5265700483091788e-06, + "loss": 0.7944, + "step": 158 + }, + { + "epoch": 0.0, + "grad_norm": 3.785578794585339, + "learning_rate": 1.5362318840579712e-06, + "loss": 0.8407, + "step": 159 + }, + { + "epoch": 0.0, + "grad_norm": 3.9500134478355853, + "learning_rate": 1.5458937198067634e-06, + "loss": 0.767, + "step": 160 + }, + { + "epoch": 0.0, + "grad_norm": 3.105387119233999, + "learning_rate": 1.5555555555555558e-06, + "loss": 0.8698, + "step": 161 + }, + { + "epoch": 0.0, + "grad_norm": 3.808815292868604, + "learning_rate": 1.565217391304348e-06, + "loss": 0.8123, + "step": 162 + }, + { + "epoch": 0.0, + "grad_norm": 2.9054045037805203, + "learning_rate": 1.5748792270531402e-06, + "loss": 0.7828, + "step": 163 + }, + { + "epoch": 0.0, + "grad_norm": 2.732628287818472, + "learning_rate": 1.5845410628019326e-06, + "loss": 0.8132, + "step": 164 + }, + { + "epoch": 0.0, + "grad_norm": 2.4747987712795037, + "learning_rate": 1.5942028985507246e-06, + "loss": 0.7928, + "step": 165 + }, + { + "epoch": 0.0, + "grad_norm": 2.7305043879648694, + "learning_rate": 1.603864734299517e-06, + "loss": 0.7574, + "step": 166 + }, + { + "epoch": 0.0, + "grad_norm": 5.521469446211857, + "learning_rate": 1.6135265700483094e-06, + "loss": 0.8061, + "step": 167 + }, + { + "epoch": 0.0, + "grad_norm": 3.555462600336474, + "learning_rate": 1.6231884057971014e-06, + "loss": 0.8195, + "step": 168 + }, + { + "epoch": 0.0, + "grad_norm": 3.540046293803978, + "learning_rate": 1.6328502415458938e-06, + "loss": 0.8071, + "step": 169 + }, + { + "epoch": 0.0, + "grad_norm": 3.2441231963671764, + "learning_rate": 1.6425120772946862e-06, + "loss": 0.7611, + "step": 170 + }, + { + "epoch": 0.0, + "grad_norm": 3.2400946419619294, + "learning_rate": 1.6521739130434784e-06, + "loss": 0.8144, + "step": 171 + }, + { + "epoch": 0.0, + "grad_norm": 3.672911076101772, + "learning_rate": 1.6618357487922706e-06, + "loss": 0.8251, + "step": 172 + }, + { + "epoch": 0.01, + "grad_norm": 4.80262733296641, + "learning_rate": 1.671497584541063e-06, + "loss": 0.8086, + "step": 173 + }, + { + "epoch": 0.01, + "grad_norm": 2.809214401551259, + "learning_rate": 1.6811594202898552e-06, + "loss": 0.7797, + "step": 174 + }, + { + "epoch": 0.01, + "grad_norm": 3.0722456945365417, + "learning_rate": 1.6908212560386476e-06, + "loss": 0.8099, + "step": 175 + }, + { + "epoch": 0.01, + "grad_norm": 3.6410031771730607, + "learning_rate": 1.7004830917874398e-06, + "loss": 0.7721, + "step": 176 + }, + { + "epoch": 0.01, + "grad_norm": 12.53510907252986, + "learning_rate": 1.710144927536232e-06, + "loss": 0.7951, + "step": 177 + }, + { + "epoch": 0.01, + "grad_norm": 3.4712479463997976, + "learning_rate": 1.7198067632850244e-06, + "loss": 0.8076, + "step": 178 + }, + { + "epoch": 0.01, + "grad_norm": 4.240600184537767, + "learning_rate": 1.7294685990338168e-06, + "loss": 0.8238, + "step": 179 + }, + { + "epoch": 0.01, + "grad_norm": 12.122250009394799, + "learning_rate": 1.7391304347826088e-06, + "loss": 0.7594, + "step": 180 + }, + { + "epoch": 0.01, + "grad_norm": 3.029509500591487, + "learning_rate": 1.7487922705314012e-06, + "loss": 0.8398, + "step": 181 + }, + { + "epoch": 0.01, + "grad_norm": 2.8709903406735218, + "learning_rate": 1.7584541062801934e-06, + "loss": 0.749, + "step": 182 + }, + { + "epoch": 0.01, + "grad_norm": 2.776303034189679, + "learning_rate": 1.7681159420289855e-06, + "loss": 0.7377, + "step": 183 + }, + { + "epoch": 0.01, + "grad_norm": 4.130139876574884, + "learning_rate": 1.777777777777778e-06, + "loss": 0.7271, + "step": 184 + }, + { + "epoch": 0.01, + "grad_norm": 2.590200696970771, + "learning_rate": 1.7874396135265702e-06, + "loss": 0.7459, + "step": 185 + }, + { + "epoch": 0.01, + "grad_norm": 4.2132477077976835, + "learning_rate": 1.7971014492753626e-06, + "loss": 0.7904, + "step": 186 + }, + { + "epoch": 0.01, + "grad_norm": 2.7060108007201715, + "learning_rate": 1.8067632850241548e-06, + "loss": 0.7452, + "step": 187 + }, + { + "epoch": 0.01, + "grad_norm": 4.441692690167807, + "learning_rate": 1.816425120772947e-06, + "loss": 0.7501, + "step": 188 + }, + { + "epoch": 0.01, + "grad_norm": 3.332735234268166, + "learning_rate": 1.8260869565217394e-06, + "loss": 0.8015, + "step": 189 + }, + { + "epoch": 0.01, + "grad_norm": 17.563537317412962, + "learning_rate": 1.8357487922705318e-06, + "loss": 0.7719, + "step": 190 + }, + { + "epoch": 0.01, + "grad_norm": 2.45019968215305, + "learning_rate": 1.8454106280193237e-06, + "loss": 0.7713, + "step": 191 + }, + { + "epoch": 0.01, + "grad_norm": 2.966531660864752, + "learning_rate": 1.8550724637681161e-06, + "loss": 0.7553, + "step": 192 + }, + { + "epoch": 0.01, + "grad_norm": 2.479765334957802, + "learning_rate": 1.8647342995169086e-06, + "loss": 0.7277, + "step": 193 + }, + { + "epoch": 0.01, + "grad_norm": 2.4729561766929566, + "learning_rate": 1.8743961352657005e-06, + "loss": 0.7584, + "step": 194 + }, + { + "epoch": 0.01, + "grad_norm": 2.5457901292522704, + "learning_rate": 1.884057971014493e-06, + "loss": 0.8221, + "step": 195 + }, + { + "epoch": 0.01, + "grad_norm": 3.0050316876268854, + "learning_rate": 1.8937198067632853e-06, + "loss": 0.7684, + "step": 196 + }, + { + "epoch": 0.01, + "grad_norm": 23.533278140750923, + "learning_rate": 1.9033816425120773e-06, + "loss": 0.7595, + "step": 197 + }, + { + "epoch": 0.01, + "grad_norm": 2.451193325641964, + "learning_rate": 1.9130434782608697e-06, + "loss": 0.7668, + "step": 198 + }, + { + "epoch": 0.01, + "grad_norm": 3.411131241887189, + "learning_rate": 1.922705314009662e-06, + "loss": 0.7484, + "step": 199 + }, + { + "epoch": 0.01, + "grad_norm": 40.38991480918015, + "learning_rate": 1.932367149758454e-06, + "loss": 0.7305, + "step": 200 + }, + { + "epoch": 0.01, + "grad_norm": 3.375415531976911, + "learning_rate": 1.9420289855072467e-06, + "loss": 0.736, + "step": 201 + }, + { + "epoch": 0.01, + "grad_norm": 1.2643090943854034, + "learning_rate": 1.9516908212560385e-06, + "loss": 0.5819, + "step": 202 + }, + { + "epoch": 0.01, + "grad_norm": 2.5467020326881276, + "learning_rate": 1.961352657004831e-06, + "loss": 0.6981, + "step": 203 + }, + { + "epoch": 0.01, + "grad_norm": 11.848219041394907, + "learning_rate": 1.9710144927536233e-06, + "loss": 0.7013, + "step": 204 + }, + { + "epoch": 0.01, + "grad_norm": 3.270627567486081, + "learning_rate": 1.9806763285024155e-06, + "loss": 0.7402, + "step": 205 + }, + { + "epoch": 0.01, + "grad_norm": 3.638087658109634, + "learning_rate": 1.9903381642512077e-06, + "loss": 0.7958, + "step": 206 + }, + { + "epoch": 0.01, + "grad_norm": 7.364139010830503, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7631, + "step": 207 + }, + { + "epoch": 0.01, + "grad_norm": 4.582908499621389, + "learning_rate": 2.0096618357487925e-06, + "loss": 0.7411, + "step": 208 + }, + { + "epoch": 0.01, + "grad_norm": 2.812306455606928, + "learning_rate": 2.0193236714975847e-06, + "loss": 0.7389, + "step": 209 + }, + { + "epoch": 0.01, + "grad_norm": 2.6071803763229062, + "learning_rate": 2.028985507246377e-06, + "loss": 0.7231, + "step": 210 + }, + { + "epoch": 0.01, + "grad_norm": 2.347004938554119, + "learning_rate": 2.038647342995169e-06, + "loss": 0.7152, + "step": 211 + }, + { + "epoch": 0.01, + "grad_norm": 2.8605222820216474, + "learning_rate": 2.0483091787439617e-06, + "loss": 0.8354, + "step": 212 + }, + { + "epoch": 0.01, + "grad_norm": 3.8510908622602593, + "learning_rate": 2.0579710144927535e-06, + "loss": 0.7478, + "step": 213 + }, + { + "epoch": 0.01, + "grad_norm": 5.985579876968309, + "learning_rate": 2.067632850241546e-06, + "loss": 0.7797, + "step": 214 + }, + { + "epoch": 0.01, + "grad_norm": 4.067726695348538, + "learning_rate": 2.0772946859903383e-06, + "loss": 0.7255, + "step": 215 + }, + { + "epoch": 0.01, + "grad_norm": 3.2575397993558317, + "learning_rate": 2.0869565217391305e-06, + "loss": 0.7389, + "step": 216 + }, + { + "epoch": 0.01, + "grad_norm": 3.1617909764639966, + "learning_rate": 2.0966183574879227e-06, + "loss": 0.7713, + "step": 217 + }, + { + "epoch": 0.01, + "grad_norm": 3.3547206419031186, + "learning_rate": 2.1062801932367153e-06, + "loss": 0.7317, + "step": 218 + }, + { + "epoch": 0.01, + "grad_norm": 3.648614340603927, + "learning_rate": 2.1159420289855075e-06, + "loss": 0.7037, + "step": 219 + }, + { + "epoch": 0.01, + "grad_norm": 9.999588781236714, + "learning_rate": 2.1256038647342997e-06, + "loss": 0.7133, + "step": 220 + }, + { + "epoch": 0.01, + "grad_norm": 2.7898462015800454, + "learning_rate": 2.135265700483092e-06, + "loss": 0.7464, + "step": 221 + }, + { + "epoch": 0.01, + "grad_norm": 4.161152292102214, + "learning_rate": 2.144927536231884e-06, + "loss": 0.7253, + "step": 222 + }, + { + "epoch": 0.01, + "grad_norm": 2.7846394473225624, + "learning_rate": 2.1545893719806767e-06, + "loss": 0.7235, + "step": 223 + }, + { + "epoch": 0.01, + "grad_norm": 3.9304068534492322, + "learning_rate": 2.164251207729469e-06, + "loss": 0.7773, + "step": 224 + }, + { + "epoch": 0.01, + "grad_norm": 2.575624491313094, + "learning_rate": 2.173913043478261e-06, + "loss": 0.7099, + "step": 225 + }, + { + "epoch": 0.01, + "grad_norm": 2.769332383371077, + "learning_rate": 2.1835748792270533e-06, + "loss": 0.721, + "step": 226 + }, + { + "epoch": 0.01, + "grad_norm": 4.762446952657221, + "learning_rate": 2.193236714975846e-06, + "loss": 0.7685, + "step": 227 + }, + { + "epoch": 0.01, + "grad_norm": 3.233673981057795, + "learning_rate": 2.2028985507246377e-06, + "loss": 0.7565, + "step": 228 + }, + { + "epoch": 0.01, + "grad_norm": 3.320934154730071, + "learning_rate": 2.2125603864734303e-06, + "loss": 0.72, + "step": 229 + }, + { + "epoch": 0.01, + "grad_norm": 3.0906456629491075, + "learning_rate": 2.222222222222222e-06, + "loss": 0.7645, + "step": 230 + }, + { + "epoch": 0.01, + "grad_norm": 2.7749551945095967, + "learning_rate": 2.2318840579710147e-06, + "loss": 0.7521, + "step": 231 + }, + { + "epoch": 0.01, + "grad_norm": 3.783060869661108, + "learning_rate": 2.241545893719807e-06, + "loss": 0.7252, + "step": 232 + }, + { + "epoch": 0.01, + "grad_norm": 3.0898272094754677, + "learning_rate": 2.251207729468599e-06, + "loss": 0.7612, + "step": 233 + }, + { + "epoch": 0.01, + "grad_norm": 3.6229168845932365, + "learning_rate": 2.2608695652173913e-06, + "loss": 0.7562, + "step": 234 + }, + { + "epoch": 0.01, + "grad_norm": 4.484800833389815, + "learning_rate": 2.270531400966184e-06, + "loss": 0.7121, + "step": 235 + }, + { + "epoch": 0.01, + "grad_norm": 3.232265291535921, + "learning_rate": 2.280193236714976e-06, + "loss": 0.7214, + "step": 236 + }, + { + "epoch": 0.01, + "grad_norm": 2.632928381788788, + "learning_rate": 2.2898550724637683e-06, + "loss": 0.801, + "step": 237 + }, + { + "epoch": 0.01, + "grad_norm": 2.917329596834301, + "learning_rate": 2.2995169082125605e-06, + "loss": 0.7725, + "step": 238 + }, + { + "epoch": 0.01, + "grad_norm": 3.1022283917360696, + "learning_rate": 2.3091787439613527e-06, + "loss": 0.7263, + "step": 239 + }, + { + "epoch": 0.01, + "grad_norm": 2.488480304351483, + "learning_rate": 2.3188405797101453e-06, + "loss": 0.7125, + "step": 240 + }, + { + "epoch": 0.01, + "grad_norm": 2.595838107287109, + "learning_rate": 2.3285024154589375e-06, + "loss": 0.7184, + "step": 241 + }, + { + "epoch": 0.01, + "grad_norm": 2.6494858191198967, + "learning_rate": 2.3381642512077297e-06, + "loss": 0.7254, + "step": 242 + }, + { + "epoch": 0.01, + "grad_norm": 2.6785402140067363, + "learning_rate": 2.347826086956522e-06, + "loss": 0.6886, + "step": 243 + }, + { + "epoch": 0.01, + "grad_norm": 2.6081855720350395, + "learning_rate": 2.357487922705314e-06, + "loss": 0.764, + "step": 244 + }, + { + "epoch": 0.01, + "grad_norm": 3.7953985403412784, + "learning_rate": 2.3671497584541063e-06, + "loss": 0.718, + "step": 245 + }, + { + "epoch": 0.01, + "grad_norm": 2.939771569857067, + "learning_rate": 2.376811594202899e-06, + "loss": 0.7093, + "step": 246 + }, + { + "epoch": 0.01, + "grad_norm": 2.946213229731639, + "learning_rate": 2.386473429951691e-06, + "loss": 0.6807, + "step": 247 + }, + { + "epoch": 0.01, + "grad_norm": 4.808992868377738, + "learning_rate": 2.3961352657004833e-06, + "loss": 0.7341, + "step": 248 + }, + { + "epoch": 0.01, + "grad_norm": 8.457605885651844, + "learning_rate": 2.4057971014492755e-06, + "loss": 0.7628, + "step": 249 + }, + { + "epoch": 0.01, + "grad_norm": 2.818665901620248, + "learning_rate": 2.4154589371980677e-06, + "loss": 0.7022, + "step": 250 + }, + { + "epoch": 0.01, + "grad_norm": 3.6051043459934102, + "learning_rate": 2.4251207729468603e-06, + "loss": 0.6999, + "step": 251 + }, + { + "epoch": 0.01, + "grad_norm": 2.7712658284084175, + "learning_rate": 2.4347826086956525e-06, + "loss": 0.7232, + "step": 252 + }, + { + "epoch": 0.01, + "grad_norm": 3.7374022590657616, + "learning_rate": 2.4444444444444447e-06, + "loss": 0.6763, + "step": 253 + }, + { + "epoch": 0.01, + "grad_norm": 2.5272417908372944, + "learning_rate": 2.454106280193237e-06, + "loss": 0.7085, + "step": 254 + }, + { + "epoch": 0.01, + "grad_norm": 2.4308869550509353, + "learning_rate": 2.4637681159420295e-06, + "loss": 0.7542, + "step": 255 + }, + { + "epoch": 0.01, + "grad_norm": 9.75810390640741, + "learning_rate": 2.4734299516908212e-06, + "loss": 0.6928, + "step": 256 + }, + { + "epoch": 0.01, + "grad_norm": 3.6373752405570126, + "learning_rate": 2.483091787439614e-06, + "loss": 0.754, + "step": 257 + }, + { + "epoch": 0.01, + "grad_norm": 2.4104614524823096, + "learning_rate": 2.492753623188406e-06, + "loss": 0.6724, + "step": 258 + }, + { + "epoch": 0.01, + "grad_norm": 2.7425538180700157, + "learning_rate": 2.5024154589371987e-06, + "loss": 0.6856, + "step": 259 + }, + { + "epoch": 0.01, + "grad_norm": 2.641967511168434, + "learning_rate": 2.5120772946859904e-06, + "loss": 0.6662, + "step": 260 + }, + { + "epoch": 0.01, + "grad_norm": 2.936024965584445, + "learning_rate": 2.5217391304347826e-06, + "loss": 0.7263, + "step": 261 + }, + { + "epoch": 0.01, + "grad_norm": 2.677427585928462, + "learning_rate": 2.5314009661835753e-06, + "loss": 0.6921, + "step": 262 + }, + { + "epoch": 0.01, + "grad_norm": 4.383568838259756, + "learning_rate": 2.5410628019323675e-06, + "loss": 0.7371, + "step": 263 + }, + { + "epoch": 0.01, + "grad_norm": 3.1462407397627823, + "learning_rate": 2.5507246376811596e-06, + "loss": 0.6786, + "step": 264 + }, + { + "epoch": 0.01, + "grad_norm": 4.300776491299097, + "learning_rate": 2.5603864734299523e-06, + "loss": 0.7488, + "step": 265 + }, + { + "epoch": 0.01, + "grad_norm": 4.184397107091566, + "learning_rate": 2.5700483091787445e-06, + "loss": 0.6749, + "step": 266 + }, + { + "epoch": 0.01, + "grad_norm": 2.8210096032537613, + "learning_rate": 2.5797101449275362e-06, + "loss": 0.6705, + "step": 267 + }, + { + "epoch": 0.01, + "grad_norm": 2.363803298771863, + "learning_rate": 2.5893719806763284e-06, + "loss": 0.7209, + "step": 268 + }, + { + "epoch": 0.01, + "grad_norm": 3.4610441558890126, + "learning_rate": 2.599033816425121e-06, + "loss": 0.6368, + "step": 269 + }, + { + "epoch": 0.01, + "grad_norm": 2.23553786779772, + "learning_rate": 2.6086956521739132e-06, + "loss": 0.6567, + "step": 270 + }, + { + "epoch": 0.01, + "grad_norm": 2.4756539433745215, + "learning_rate": 2.6183574879227054e-06, + "loss": 0.6283, + "step": 271 + }, + { + "epoch": 0.01, + "grad_norm": 2.9994636644858734, + "learning_rate": 2.628019323671498e-06, + "loss": 0.6722, + "step": 272 + }, + { + "epoch": 0.01, + "grad_norm": 2.6738052238550463, + "learning_rate": 2.63768115942029e-06, + "loss": 0.6538, + "step": 273 + }, + { + "epoch": 0.01, + "grad_norm": 2.601325489741313, + "learning_rate": 2.647342995169082e-06, + "loss": 0.7533, + "step": 274 + }, + { + "epoch": 0.01, + "grad_norm": 2.467466343370546, + "learning_rate": 2.6570048309178746e-06, + "loss": 0.7, + "step": 275 + }, + { + "epoch": 0.01, + "grad_norm": 4.435103353171959, + "learning_rate": 2.666666666666667e-06, + "loss": 0.6711, + "step": 276 + }, + { + "epoch": 0.01, + "grad_norm": 2.432364294969762, + "learning_rate": 2.676328502415459e-06, + "loss": 0.7003, + "step": 277 + }, + { + "epoch": 0.01, + "grad_norm": 2.445294463806612, + "learning_rate": 2.6859903381642516e-06, + "loss": 0.706, + "step": 278 + }, + { + "epoch": 0.01, + "grad_norm": 2.7507792998879665, + "learning_rate": 2.695652173913044e-06, + "loss": 0.7081, + "step": 279 + }, + { + "epoch": 0.01, + "grad_norm": 3.4122381717002668, + "learning_rate": 2.7053140096618356e-06, + "loss": 0.6698, + "step": 280 + }, + { + "epoch": 0.01, + "grad_norm": 3.4114878253950853, + "learning_rate": 2.7149758454106282e-06, + "loss": 0.7368, + "step": 281 + }, + { + "epoch": 0.01, + "grad_norm": 2.6663468192500184, + "learning_rate": 2.7246376811594204e-06, + "loss": 0.6477, + "step": 282 + }, + { + "epoch": 0.01, + "grad_norm": 2.3500745924520854, + "learning_rate": 2.7342995169082126e-06, + "loss": 0.6727, + "step": 283 + }, + { + "epoch": 0.01, + "grad_norm": 2.3944231262521343, + "learning_rate": 2.7439613526570052e-06, + "loss": 0.6883, + "step": 284 + }, + { + "epoch": 0.01, + "grad_norm": 2.8107493620325044, + "learning_rate": 2.7536231884057974e-06, + "loss": 0.6728, + "step": 285 + }, + { + "epoch": 0.01, + "grad_norm": 2.479481279206363, + "learning_rate": 2.7632850241545896e-06, + "loss": 0.7246, + "step": 286 + }, + { + "epoch": 0.01, + "grad_norm": 2.834653171468579, + "learning_rate": 2.7729468599033822e-06, + "loss": 0.7163, + "step": 287 + }, + { + "epoch": 0.01, + "grad_norm": 2.2248366605716905, + "learning_rate": 2.782608695652174e-06, + "loss": 0.6479, + "step": 288 + }, + { + "epoch": 0.01, + "grad_norm": 2.607107886356711, + "learning_rate": 2.792270531400966e-06, + "loss": 0.7207, + "step": 289 + }, + { + "epoch": 0.01, + "grad_norm": 2.4781449922912366, + "learning_rate": 2.801932367149759e-06, + "loss": 0.6806, + "step": 290 + }, + { + "epoch": 0.01, + "grad_norm": 1.182583122452985, + "learning_rate": 2.811594202898551e-06, + "loss": 0.4912, + "step": 291 + }, + { + "epoch": 0.01, + "grad_norm": 14.929415403429873, + "learning_rate": 2.821256038647343e-06, + "loss": 0.6698, + "step": 292 + }, + { + "epoch": 0.01, + "grad_norm": 2.66398713178944, + "learning_rate": 2.830917874396136e-06, + "loss": 0.6965, + "step": 293 + }, + { + "epoch": 0.01, + "grad_norm": 1.2221695708528317, + "learning_rate": 2.840579710144928e-06, + "loss": 0.5099, + "step": 294 + }, + { + "epoch": 0.01, + "grad_norm": 3.199484309755233, + "learning_rate": 2.85024154589372e-06, + "loss": 0.7242, + "step": 295 + }, + { + "epoch": 0.01, + "grad_norm": 2.5234192121722283, + "learning_rate": 2.8599033816425124e-06, + "loss": 0.6335, + "step": 296 + }, + { + "epoch": 0.01, + "grad_norm": 2.2890185192085144, + "learning_rate": 2.8695652173913046e-06, + "loss": 0.7086, + "step": 297 + }, + { + "epoch": 0.01, + "grad_norm": 5.264134045908752, + "learning_rate": 2.879227053140097e-06, + "loss": 0.7458, + "step": 298 + }, + { + "epoch": 0.01, + "grad_norm": 2.9325993555266487, + "learning_rate": 2.888888888888889e-06, + "loss": 0.6714, + "step": 299 + }, + { + "epoch": 0.01, + "grad_norm": 2.4342844115879116, + "learning_rate": 2.8985507246376816e-06, + "loss": 0.682, + "step": 300 + }, + { + "epoch": 0.01, + "grad_norm": 2.919181368750601, + "learning_rate": 2.9082125603864734e-06, + "loss": 0.6697, + "step": 301 + }, + { + "epoch": 0.01, + "grad_norm": 5.151406307473927, + "learning_rate": 2.9178743961352656e-06, + "loss": 0.6495, + "step": 302 + }, + { + "epoch": 0.01, + "grad_norm": 2.6875943351597247, + "learning_rate": 2.927536231884058e-06, + "loss": 0.6819, + "step": 303 + }, + { + "epoch": 0.01, + "grad_norm": 1.8625549837918802, + "learning_rate": 2.9371980676328504e-06, + "loss": 0.5826, + "step": 304 + }, + { + "epoch": 0.01, + "grad_norm": 2.91011148595618, + "learning_rate": 2.9468599033816426e-06, + "loss": 0.6504, + "step": 305 + }, + { + "epoch": 0.01, + "grad_norm": 2.859144919242814, + "learning_rate": 2.956521739130435e-06, + "loss": 0.6418, + "step": 306 + }, + { + "epoch": 0.01, + "grad_norm": 2.308698959425568, + "learning_rate": 2.9661835748792274e-06, + "loss": 0.6749, + "step": 307 + }, + { + "epoch": 0.01, + "grad_norm": 3.2696377333267743, + "learning_rate": 2.975845410628019e-06, + "loss": 0.6555, + "step": 308 + }, + { + "epoch": 0.01, + "grad_norm": 2.3535222799261013, + "learning_rate": 2.9855072463768118e-06, + "loss": 0.6778, + "step": 309 + }, + { + "epoch": 0.01, + "grad_norm": 2.7971585158292633, + "learning_rate": 2.995169082125604e-06, + "loss": 0.7092, + "step": 310 + }, + { + "epoch": 0.01, + "grad_norm": 1.2161415623852836, + "learning_rate": 3.004830917874396e-06, + "loss": 0.4803, + "step": 311 + }, + { + "epoch": 0.01, + "grad_norm": 2.7774306768499004, + "learning_rate": 3.014492753623189e-06, + "loss": 0.6666, + "step": 312 + }, + { + "epoch": 0.01, + "grad_norm": 2.428850637620306, + "learning_rate": 3.024154589371981e-06, + "loss": 0.6672, + "step": 313 + }, + { + "epoch": 0.01, + "grad_norm": 2.850537494602298, + "learning_rate": 3.033816425120773e-06, + "loss": 0.6669, + "step": 314 + }, + { + "epoch": 0.01, + "grad_norm": 5.05342807333586, + "learning_rate": 3.043478260869566e-06, + "loss": 0.6454, + "step": 315 + }, + { + "epoch": 0.01, + "grad_norm": 2.414699214549847, + "learning_rate": 3.0531400966183576e-06, + "loss": 0.6699, + "step": 316 + }, + { + "epoch": 0.01, + "grad_norm": 2.5784940067861166, + "learning_rate": 3.0628019323671498e-06, + "loss": 0.6177, + "step": 317 + }, + { + "epoch": 0.01, + "grad_norm": 2.4161595872324635, + "learning_rate": 3.0724637681159424e-06, + "loss": 0.6565, + "step": 318 + }, + { + "epoch": 0.01, + "grad_norm": 2.6018544370757914, + "learning_rate": 3.0821256038647346e-06, + "loss": 0.6782, + "step": 319 + }, + { + "epoch": 0.01, + "grad_norm": 2.383757287844922, + "learning_rate": 3.0917874396135268e-06, + "loss": 0.6741, + "step": 320 + }, + { + "epoch": 0.01, + "grad_norm": 2.2053824487328333, + "learning_rate": 3.1014492753623194e-06, + "loss": 0.6325, + "step": 321 + }, + { + "epoch": 0.01, + "grad_norm": 2.5745943569224816, + "learning_rate": 3.1111111111111116e-06, + "loss": 0.652, + "step": 322 + }, + { + "epoch": 0.01, + "grad_norm": 4.580877635289021, + "learning_rate": 3.1207729468599034e-06, + "loss": 0.6386, + "step": 323 + }, + { + "epoch": 0.01, + "grad_norm": 2.383047552968873, + "learning_rate": 3.130434782608696e-06, + "loss": 0.6789, + "step": 324 + }, + { + "epoch": 0.01, + "grad_norm": 4.439271963139634, + "learning_rate": 3.140096618357488e-06, + "loss": 0.6923, + "step": 325 + }, + { + "epoch": 0.01, + "grad_norm": 2.4599196521111293, + "learning_rate": 3.1497584541062804e-06, + "loss": 0.6795, + "step": 326 + }, + { + "epoch": 0.01, + "grad_norm": 2.6674847491170155, + "learning_rate": 3.159420289855073e-06, + "loss": 0.6052, + "step": 327 + }, + { + "epoch": 0.01, + "grad_norm": 2.2686540248698503, + "learning_rate": 3.169082125603865e-06, + "loss": 0.6427, + "step": 328 + }, + { + "epoch": 0.01, + "grad_norm": 2.671654396920156, + "learning_rate": 3.1787439613526574e-06, + "loss": 0.659, + "step": 329 + }, + { + "epoch": 0.01, + "grad_norm": 2.162614561118156, + "learning_rate": 3.188405797101449e-06, + "loss": 0.6346, + "step": 330 + }, + { + "epoch": 0.01, + "grad_norm": 2.240620631736347, + "learning_rate": 3.1980676328502418e-06, + "loss": 0.6331, + "step": 331 + }, + { + "epoch": 0.01, + "grad_norm": 2.704382093092875, + "learning_rate": 3.207729468599034e-06, + "loss": 0.6205, + "step": 332 + }, + { + "epoch": 0.01, + "grad_norm": 2.624249601985806, + "learning_rate": 3.217391304347826e-06, + "loss": 0.7299, + "step": 333 + }, + { + "epoch": 0.01, + "grad_norm": 2.106197766524283, + "learning_rate": 3.2270531400966188e-06, + "loss": 0.6504, + "step": 334 + }, + { + "epoch": 0.01, + "grad_norm": 2.522494940506132, + "learning_rate": 3.236714975845411e-06, + "loss": 0.6647, + "step": 335 + }, + { + "epoch": 0.01, + "grad_norm": 3.267837113210688, + "learning_rate": 3.2463768115942027e-06, + "loss": 0.688, + "step": 336 + }, + { + "epoch": 0.01, + "grad_norm": 3.8989463923804193, + "learning_rate": 3.2560386473429958e-06, + "loss": 0.6406, + "step": 337 + }, + { + "epoch": 0.01, + "grad_norm": 2.1749861948694167, + "learning_rate": 3.2657004830917875e-06, + "loss": 0.6139, + "step": 338 + }, + { + "epoch": 0.01, + "grad_norm": 2.2150044181019526, + "learning_rate": 3.2753623188405797e-06, + "loss": 0.6443, + "step": 339 + }, + { + "epoch": 0.01, + "grad_norm": 2.5581116265889245, + "learning_rate": 3.2850241545893724e-06, + "loss": 0.638, + "step": 340 + }, + { + "epoch": 0.01, + "grad_norm": 2.262087856852021, + "learning_rate": 3.2946859903381645e-06, + "loss": 0.6955, + "step": 341 + }, + { + "epoch": 0.01, + "grad_norm": 2.371602070376017, + "learning_rate": 3.3043478260869567e-06, + "loss": 0.6912, + "step": 342 + }, + { + "epoch": 0.01, + "grad_norm": 2.7340464864175575, + "learning_rate": 3.3140096618357494e-06, + "loss": 0.7066, + "step": 343 + }, + { + "epoch": 0.01, + "grad_norm": 2.2878481069494403, + "learning_rate": 3.323671497584541e-06, + "loss": 0.6095, + "step": 344 + }, + { + "epoch": 0.01, + "grad_norm": 2.644275022661138, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.6717, + "step": 345 + }, + { + "epoch": 0.01, + "grad_norm": 2.293914939227265, + "learning_rate": 3.342995169082126e-06, + "loss": 0.6653, + "step": 346 + }, + { + "epoch": 0.01, + "grad_norm": 2.229576133349703, + "learning_rate": 3.352657004830918e-06, + "loss": 0.6621, + "step": 347 + }, + { + "epoch": 0.01, + "grad_norm": 2.667674343263558, + "learning_rate": 3.3623188405797103e-06, + "loss": 0.6988, + "step": 348 + }, + { + "epoch": 0.01, + "grad_norm": 2.305462678626164, + "learning_rate": 3.371980676328503e-06, + "loss": 0.6635, + "step": 349 + }, + { + "epoch": 0.01, + "grad_norm": 2.484785053804451, + "learning_rate": 3.381642512077295e-06, + "loss": 0.632, + "step": 350 + }, + { + "epoch": 0.01, + "grad_norm": 2.2619595144997158, + "learning_rate": 3.391304347826087e-06, + "loss": 0.6699, + "step": 351 + }, + { + "epoch": 0.01, + "grad_norm": 2.382152886891119, + "learning_rate": 3.4009661835748795e-06, + "loss": 0.6634, + "step": 352 + }, + { + "epoch": 0.01, + "grad_norm": 2.6731655910798824, + "learning_rate": 3.4106280193236717e-06, + "loss": 0.6494, + "step": 353 + }, + { + "epoch": 0.01, + "grad_norm": 6.91280442044355, + "learning_rate": 3.420289855072464e-06, + "loss": 0.6789, + "step": 354 + }, + { + "epoch": 0.01, + "grad_norm": 2.3908409147236718, + "learning_rate": 3.4299516908212565e-06, + "loss": 0.6341, + "step": 355 + }, + { + "epoch": 0.01, + "grad_norm": 2.493426815929326, + "learning_rate": 3.4396135265700487e-06, + "loss": 0.6679, + "step": 356 + }, + { + "epoch": 0.01, + "grad_norm": 2.382515322840512, + "learning_rate": 3.449275362318841e-06, + "loss": 0.6601, + "step": 357 + }, + { + "epoch": 0.01, + "grad_norm": 2.343003593345328, + "learning_rate": 3.4589371980676335e-06, + "loss": 0.6695, + "step": 358 + }, + { + "epoch": 0.01, + "grad_norm": 2.280507918972925, + "learning_rate": 3.4685990338164253e-06, + "loss": 0.5986, + "step": 359 + }, + { + "epoch": 0.01, + "grad_norm": 2.5933842382017582, + "learning_rate": 3.4782608695652175e-06, + "loss": 0.6596, + "step": 360 + }, + { + "epoch": 0.01, + "grad_norm": 2.1968937136127042, + "learning_rate": 3.4879227053140097e-06, + "loss": 0.6428, + "step": 361 + }, + { + "epoch": 0.01, + "grad_norm": 2.248217500173368, + "learning_rate": 3.4975845410628023e-06, + "loss": 0.6639, + "step": 362 + }, + { + "epoch": 0.01, + "grad_norm": 2.4858326773434882, + "learning_rate": 3.5072463768115945e-06, + "loss": 0.6769, + "step": 363 + }, + { + "epoch": 0.01, + "grad_norm": 2.149680557933237, + "learning_rate": 3.5169082125603867e-06, + "loss": 0.6007, + "step": 364 + }, + { + "epoch": 0.01, + "grad_norm": 2.240300507937559, + "learning_rate": 3.5265700483091793e-06, + "loss": 0.5808, + "step": 365 + }, + { + "epoch": 0.01, + "grad_norm": 2.801384792200254, + "learning_rate": 3.536231884057971e-06, + "loss": 0.6527, + "step": 366 + }, + { + "epoch": 0.01, + "grad_norm": 3.334448375652601, + "learning_rate": 3.5458937198067633e-06, + "loss": 0.6272, + "step": 367 + }, + { + "epoch": 0.01, + "grad_norm": 2.3870764530389907, + "learning_rate": 3.555555555555556e-06, + "loss": 0.6538, + "step": 368 + }, + { + "epoch": 0.01, + "grad_norm": 2.2660393276321322, + "learning_rate": 3.565217391304348e-06, + "loss": 0.6292, + "step": 369 + }, + { + "epoch": 0.01, + "grad_norm": 2.325276439707573, + "learning_rate": 3.5748792270531403e-06, + "loss": 0.6662, + "step": 370 + }, + { + "epoch": 0.01, + "grad_norm": 2.101875450373029, + "learning_rate": 3.584541062801933e-06, + "loss": 0.6386, + "step": 371 + }, + { + "epoch": 0.01, + "grad_norm": 2.606913393820352, + "learning_rate": 3.594202898550725e-06, + "loss": 0.7104, + "step": 372 + }, + { + "epoch": 0.01, + "grad_norm": 3.1550642601911574, + "learning_rate": 3.603864734299517e-06, + "loss": 0.7196, + "step": 373 + }, + { + "epoch": 0.01, + "grad_norm": 2.7304864228570556, + "learning_rate": 3.6135265700483095e-06, + "loss": 0.6051, + "step": 374 + }, + { + "epoch": 0.01, + "grad_norm": 2.3047418535352326, + "learning_rate": 3.6231884057971017e-06, + "loss": 0.6919, + "step": 375 + }, + { + "epoch": 0.01, + "grad_norm": 2.717701388462901, + "learning_rate": 3.632850241545894e-06, + "loss": 0.6018, + "step": 376 + }, + { + "epoch": 0.01, + "grad_norm": 2.8149371642161243, + "learning_rate": 3.6425120772946865e-06, + "loss": 0.6719, + "step": 377 + }, + { + "epoch": 0.01, + "grad_norm": 2.176677140876607, + "learning_rate": 3.6521739130434787e-06, + "loss": 0.6455, + "step": 378 + }, + { + "epoch": 0.01, + "grad_norm": 2.1603319424506098, + "learning_rate": 3.6618357487922705e-06, + "loss": 0.628, + "step": 379 + }, + { + "epoch": 0.01, + "grad_norm": 2.0975774057155054, + "learning_rate": 3.6714975845410635e-06, + "loss": 0.6532, + "step": 380 + }, + { + "epoch": 0.01, + "grad_norm": 4.2104976222075425, + "learning_rate": 3.6811594202898553e-06, + "loss": 0.6378, + "step": 381 + }, + { + "epoch": 0.01, + "grad_norm": 2.2813328393600383, + "learning_rate": 3.6908212560386475e-06, + "loss": 0.6878, + "step": 382 + }, + { + "epoch": 0.01, + "grad_norm": 2.277354919344011, + "learning_rate": 3.70048309178744e-06, + "loss": 0.6803, + "step": 383 + }, + { + "epoch": 0.01, + "grad_norm": 2.2076078189060855, + "learning_rate": 3.7101449275362323e-06, + "loss": 0.6589, + "step": 384 + }, + { + "epoch": 0.01, + "grad_norm": 2.117366085875807, + "learning_rate": 3.7198067632850245e-06, + "loss": 0.6009, + "step": 385 + }, + { + "epoch": 0.01, + "grad_norm": 2.371405360368378, + "learning_rate": 3.729468599033817e-06, + "loss": 0.6435, + "step": 386 + }, + { + "epoch": 0.01, + "grad_norm": 2.3860807906958446, + "learning_rate": 3.739130434782609e-06, + "loss": 0.6313, + "step": 387 + }, + { + "epoch": 0.01, + "grad_norm": 2.5108696837548243, + "learning_rate": 3.748792270531401e-06, + "loss": 0.6188, + "step": 388 + }, + { + "epoch": 0.01, + "grad_norm": 2.3177210199752865, + "learning_rate": 3.7584541062801937e-06, + "loss": 0.6285, + "step": 389 + }, + { + "epoch": 0.01, + "grad_norm": 2.547732754447652, + "learning_rate": 3.768115942028986e-06, + "loss": 0.6342, + "step": 390 + }, + { + "epoch": 0.01, + "grad_norm": 2.2378223673375586, + "learning_rate": 3.777777777777778e-06, + "loss": 0.6474, + "step": 391 + }, + { + "epoch": 0.01, + "grad_norm": 2.8556741663492207, + "learning_rate": 3.7874396135265707e-06, + "loss": 0.6504, + "step": 392 + }, + { + "epoch": 0.01, + "grad_norm": 2.381697752047348, + "learning_rate": 3.797101449275363e-06, + "loss": 0.6659, + "step": 393 + }, + { + "epoch": 0.01, + "grad_norm": 2.246962150160241, + "learning_rate": 3.8067632850241547e-06, + "loss": 0.679, + "step": 394 + }, + { + "epoch": 0.01, + "grad_norm": 2.449228584543208, + "learning_rate": 3.816425120772947e-06, + "loss": 0.6269, + "step": 395 + }, + { + "epoch": 0.01, + "grad_norm": 2.342695685406817, + "learning_rate": 3.8260869565217395e-06, + "loss": 0.5921, + "step": 396 + }, + { + "epoch": 0.01, + "grad_norm": 2.281161526038237, + "learning_rate": 3.835748792270532e-06, + "loss": 0.645, + "step": 397 + }, + { + "epoch": 0.01, + "grad_norm": 2.0541234738425422, + "learning_rate": 3.845410628019324e-06, + "loss": 0.6055, + "step": 398 + }, + { + "epoch": 0.01, + "grad_norm": 2.1653179319607165, + "learning_rate": 3.855072463768116e-06, + "loss": 0.6116, + "step": 399 + }, + { + "epoch": 0.01, + "grad_norm": 2.3022269795489607, + "learning_rate": 3.864734299516908e-06, + "loss": 0.6423, + "step": 400 + }, + { + "epoch": 0.01, + "grad_norm": 2.356528681724853, + "learning_rate": 3.8743961352657004e-06, + "loss": 0.721, + "step": 401 + }, + { + "epoch": 0.01, + "grad_norm": 2.42219405582636, + "learning_rate": 3.8840579710144935e-06, + "loss": 0.6328, + "step": 402 + }, + { + "epoch": 0.01, + "grad_norm": 2.446457788435147, + "learning_rate": 3.893719806763286e-06, + "loss": 0.5755, + "step": 403 + }, + { + "epoch": 0.01, + "grad_norm": 2.2890549117927432, + "learning_rate": 3.903381642512077e-06, + "loss": 0.5944, + "step": 404 + }, + { + "epoch": 0.01, + "grad_norm": 2.26421335822207, + "learning_rate": 3.91304347826087e-06, + "loss": 0.6269, + "step": 405 + }, + { + "epoch": 0.01, + "grad_norm": 4.332676235696545, + "learning_rate": 3.922705314009662e-06, + "loss": 0.6403, + "step": 406 + }, + { + "epoch": 0.01, + "grad_norm": 2.4262892182082965, + "learning_rate": 3.9323671497584545e-06, + "loss": 0.6437, + "step": 407 + }, + { + "epoch": 0.01, + "grad_norm": 2.3354305934506456, + "learning_rate": 3.942028985507247e-06, + "loss": 0.6518, + "step": 408 + }, + { + "epoch": 0.01, + "grad_norm": 2.2236561555482512, + "learning_rate": 3.951690821256039e-06, + "loss": 0.6353, + "step": 409 + }, + { + "epoch": 0.01, + "grad_norm": 3.9522786155867893, + "learning_rate": 3.961352657004831e-06, + "loss": 0.6614, + "step": 410 + }, + { + "epoch": 0.01, + "grad_norm": 2.26059816344945, + "learning_rate": 3.971014492753624e-06, + "loss": 0.617, + "step": 411 + }, + { + "epoch": 0.01, + "grad_norm": 2.5639929779674535, + "learning_rate": 3.9806763285024154e-06, + "loss": 0.6752, + "step": 412 + }, + { + "epoch": 0.01, + "grad_norm": 2.1610548025442493, + "learning_rate": 3.990338164251208e-06, + "loss": 0.6318, + "step": 413 + }, + { + "epoch": 0.01, + "grad_norm": 2.5458472138272867, + "learning_rate": 4.000000000000001e-06, + "loss": 0.6644, + "step": 414 + }, + { + "epoch": 0.01, + "grad_norm": 2.237896366017329, + "learning_rate": 4.009661835748793e-06, + "loss": 0.6059, + "step": 415 + }, + { + "epoch": 0.01, + "grad_norm": 2.714721115333344, + "learning_rate": 4.019323671497585e-06, + "loss": 0.5914, + "step": 416 + }, + { + "epoch": 0.01, + "grad_norm": 2.2547957668980385, + "learning_rate": 4.028985507246377e-06, + "loss": 0.6196, + "step": 417 + }, + { + "epoch": 0.01, + "grad_norm": 2.2469395781317143, + "learning_rate": 4.0386473429951694e-06, + "loss": 0.5739, + "step": 418 + }, + { + "epoch": 0.01, + "grad_norm": 2.428953945840165, + "learning_rate": 4.048309178743962e-06, + "loss": 0.6651, + "step": 419 + }, + { + "epoch": 0.01, + "grad_norm": 2.3233262307080476, + "learning_rate": 4.057971014492754e-06, + "loss": 0.5899, + "step": 420 + }, + { + "epoch": 0.01, + "grad_norm": 3.5651329983250757, + "learning_rate": 4.067632850241546e-06, + "loss": 0.6361, + "step": 421 + }, + { + "epoch": 0.01, + "grad_norm": 2.5200094168739553, + "learning_rate": 4.077294685990338e-06, + "loss": 0.6225, + "step": 422 + }, + { + "epoch": 0.01, + "grad_norm": 2.60065799408926, + "learning_rate": 4.086956521739131e-06, + "loss": 0.6175, + "step": 423 + }, + { + "epoch": 0.01, + "grad_norm": 2.926413164846649, + "learning_rate": 4.0966183574879235e-06, + "loss": 0.5972, + "step": 424 + }, + { + "epoch": 0.01, + "grad_norm": 2.187507327856475, + "learning_rate": 4.106280193236716e-06, + "loss": 0.6213, + "step": 425 + }, + { + "epoch": 0.01, + "grad_norm": 2.275505268614981, + "learning_rate": 4.115942028985507e-06, + "loss": 0.6503, + "step": 426 + }, + { + "epoch": 0.01, + "grad_norm": 2.2881956280019096, + "learning_rate": 4.1256038647343e-06, + "loss": 0.6361, + "step": 427 + }, + { + "epoch": 0.01, + "grad_norm": 2.2283785088309696, + "learning_rate": 4.135265700483092e-06, + "loss": 0.6316, + "step": 428 + }, + { + "epoch": 0.01, + "grad_norm": 2.437444082344505, + "learning_rate": 4.1449275362318844e-06, + "loss": 0.5894, + "step": 429 + }, + { + "epoch": 0.01, + "grad_norm": 2.2734340300003737, + "learning_rate": 4.154589371980677e-06, + "loss": 0.6384, + "step": 430 + }, + { + "epoch": 0.01, + "grad_norm": 2.0925804241812735, + "learning_rate": 4.164251207729469e-06, + "loss": 0.6409, + "step": 431 + }, + { + "epoch": 0.01, + "grad_norm": 2.14619186323893, + "learning_rate": 4.173913043478261e-06, + "loss": 0.6241, + "step": 432 + }, + { + "epoch": 0.01, + "grad_norm": 1.2507286124287966, + "learning_rate": 4.183574879227054e-06, + "loss": 0.5469, + "step": 433 + }, + { + "epoch": 0.01, + "grad_norm": 3.198426380787808, + "learning_rate": 4.193236714975845e-06, + "loss": 0.6556, + "step": 434 + }, + { + "epoch": 0.01, + "grad_norm": 2.4487138863860762, + "learning_rate": 4.202898550724638e-06, + "loss": 0.6117, + "step": 435 + }, + { + "epoch": 0.01, + "grad_norm": 2.096116090415379, + "learning_rate": 4.212560386473431e-06, + "loss": 0.6259, + "step": 436 + }, + { + "epoch": 0.01, + "grad_norm": 2.096857529512224, + "learning_rate": 4.222222222222223e-06, + "loss": 0.6147, + "step": 437 + }, + { + "epoch": 0.01, + "grad_norm": 1.3390259853930102, + "learning_rate": 4.231884057971015e-06, + "loss": 0.5712, + "step": 438 + }, + { + "epoch": 0.01, + "grad_norm": 3.1104711801304252, + "learning_rate": 4.241545893719807e-06, + "loss": 0.6341, + "step": 439 + }, + { + "epoch": 0.01, + "grad_norm": 2.4521160316081865, + "learning_rate": 4.251207729468599e-06, + "loss": 0.636, + "step": 440 + }, + { + "epoch": 0.01, + "grad_norm": 2.2261038901483956, + "learning_rate": 4.260869565217392e-06, + "loss": 0.6094, + "step": 441 + }, + { + "epoch": 0.01, + "grad_norm": 2.7528942656906334, + "learning_rate": 4.270531400966184e-06, + "loss": 0.632, + "step": 442 + }, + { + "epoch": 0.01, + "grad_norm": 2.7304236477215302, + "learning_rate": 4.280193236714976e-06, + "loss": 0.6783, + "step": 443 + }, + { + "epoch": 0.01, + "grad_norm": 2.537598218300933, + "learning_rate": 4.289855072463768e-06, + "loss": 0.6382, + "step": 444 + }, + { + "epoch": 0.01, + "grad_norm": 2.3247734535926408, + "learning_rate": 4.299516908212561e-06, + "loss": 0.6239, + "step": 445 + }, + { + "epoch": 0.01, + "grad_norm": 2.580125817174794, + "learning_rate": 4.3091787439613534e-06, + "loss": 0.6407, + "step": 446 + }, + { + "epoch": 0.01, + "grad_norm": 2.1151440310034064, + "learning_rate": 4.318840579710145e-06, + "loss": 0.6249, + "step": 447 + }, + { + "epoch": 0.01, + "grad_norm": 2.3019049040382273, + "learning_rate": 4.328502415458938e-06, + "loss": 0.5796, + "step": 448 + }, + { + "epoch": 0.01, + "grad_norm": 2.245723101611626, + "learning_rate": 4.33816425120773e-06, + "loss": 0.5842, + "step": 449 + }, + { + "epoch": 0.01, + "grad_norm": 2.2479174779815496, + "learning_rate": 4.347826086956522e-06, + "loss": 0.6059, + "step": 450 + }, + { + "epoch": 0.01, + "grad_norm": 2.128654214320045, + "learning_rate": 4.357487922705314e-06, + "loss": 0.5988, + "step": 451 + }, + { + "epoch": 0.01, + "grad_norm": 9.747883192593799, + "learning_rate": 4.367149758454107e-06, + "loss": 0.6592, + "step": 452 + }, + { + "epoch": 0.01, + "grad_norm": 2.769397140361096, + "learning_rate": 4.376811594202899e-06, + "loss": 0.6142, + "step": 453 + }, + { + "epoch": 0.01, + "grad_norm": 10.568068472141157, + "learning_rate": 4.386473429951692e-06, + "loss": 0.6226, + "step": 454 + }, + { + "epoch": 0.01, + "grad_norm": 2.8134033202913993, + "learning_rate": 4.396135265700483e-06, + "loss": 0.6406, + "step": 455 + }, + { + "epoch": 0.01, + "grad_norm": 2.3283638395371145, + "learning_rate": 4.405797101449275e-06, + "loss": 0.6031, + "step": 456 + }, + { + "epoch": 0.01, + "grad_norm": 2.2032920860870204, + "learning_rate": 4.4154589371980676e-06, + "loss": 0.5879, + "step": 457 + }, + { + "epoch": 0.01, + "grad_norm": 2.288739689863762, + "learning_rate": 4.425120772946861e-06, + "loss": 0.6286, + "step": 458 + }, + { + "epoch": 0.01, + "grad_norm": 2.408621671056195, + "learning_rate": 4.434782608695653e-06, + "loss": 0.5782, + "step": 459 + }, + { + "epoch": 0.01, + "grad_norm": 2.463493393755387, + "learning_rate": 4.444444444444444e-06, + "loss": 0.6268, + "step": 460 + }, + { + "epoch": 0.01, + "grad_norm": 2.666751000693471, + "learning_rate": 4.454106280193237e-06, + "loss": 0.6168, + "step": 461 + }, + { + "epoch": 0.01, + "grad_norm": 2.4637620238528957, + "learning_rate": 4.463768115942029e-06, + "loss": 0.6488, + "step": 462 + }, + { + "epoch": 0.01, + "grad_norm": 2.4397431830020486, + "learning_rate": 4.473429951690822e-06, + "loss": 0.6506, + "step": 463 + }, + { + "epoch": 0.01, + "grad_norm": 2.5313302643656903, + "learning_rate": 4.483091787439614e-06, + "loss": 0.597, + "step": 464 + }, + { + "epoch": 0.01, + "grad_norm": 2.1674420606957248, + "learning_rate": 4.492753623188406e-06, + "loss": 0.6056, + "step": 465 + }, + { + "epoch": 0.01, + "grad_norm": 2.187868113616434, + "learning_rate": 4.502415458937198e-06, + "loss": 0.5641, + "step": 466 + }, + { + "epoch": 0.01, + "grad_norm": 4.322516332022521, + "learning_rate": 4.512077294685991e-06, + "loss": 0.6256, + "step": 467 + }, + { + "epoch": 0.01, + "grad_norm": 2.20047339156932, + "learning_rate": 4.5217391304347826e-06, + "loss": 0.6284, + "step": 468 + }, + { + "epoch": 0.01, + "grad_norm": 3.7348745113707342, + "learning_rate": 4.531400966183575e-06, + "loss": 0.6356, + "step": 469 + }, + { + "epoch": 0.01, + "grad_norm": 2.26826587444583, + "learning_rate": 4.541062801932368e-06, + "loss": 0.5798, + "step": 470 + }, + { + "epoch": 0.01, + "grad_norm": 2.0217723834719057, + "learning_rate": 4.55072463768116e-06, + "loss": 0.5762, + "step": 471 + }, + { + "epoch": 0.01, + "grad_norm": 2.495946707215467, + "learning_rate": 4.560386473429952e-06, + "loss": 0.61, + "step": 472 + }, + { + "epoch": 0.01, + "grad_norm": 2.2150925358549594, + "learning_rate": 4.570048309178744e-06, + "loss": 0.5919, + "step": 473 + }, + { + "epoch": 0.01, + "grad_norm": 2.3127599774114502, + "learning_rate": 4.5797101449275366e-06, + "loss": 0.6232, + "step": 474 + }, + { + "epoch": 0.01, + "grad_norm": 2.3440431684928043, + "learning_rate": 4.589371980676329e-06, + "loss": 0.6001, + "step": 475 + }, + { + "epoch": 0.01, + "grad_norm": 2.2479194267067926, + "learning_rate": 4.599033816425121e-06, + "loss": 0.6641, + "step": 476 + }, + { + "epoch": 0.01, + "grad_norm": 2.4342467531555334, + "learning_rate": 4.608695652173913e-06, + "loss": 0.5892, + "step": 477 + }, + { + "epoch": 0.01, + "grad_norm": 2.815709769428373, + "learning_rate": 4.618357487922705e-06, + "loss": 0.6034, + "step": 478 + }, + { + "epoch": 0.01, + "grad_norm": 2.185476299102205, + "learning_rate": 4.628019323671498e-06, + "loss": 0.6052, + "step": 479 + }, + { + "epoch": 0.01, + "grad_norm": 2.3748318249944633, + "learning_rate": 4.637681159420291e-06, + "loss": 0.5686, + "step": 480 + }, + { + "epoch": 0.01, + "grad_norm": 2.6825200085450276, + "learning_rate": 4.647342995169083e-06, + "loss": 0.6103, + "step": 481 + }, + { + "epoch": 0.01, + "grad_norm": 1.4151956111717368, + "learning_rate": 4.657004830917875e-06, + "loss": 0.592, + "step": 482 + }, + { + "epoch": 0.01, + "grad_norm": 2.201929963154678, + "learning_rate": 4.666666666666667e-06, + "loss": 0.5942, + "step": 483 + }, + { + "epoch": 0.01, + "grad_norm": 2.1728934519234504, + "learning_rate": 4.676328502415459e-06, + "loss": 0.5708, + "step": 484 + }, + { + "epoch": 0.01, + "grad_norm": 2.4397186508637616, + "learning_rate": 4.6859903381642516e-06, + "loss": 0.6475, + "step": 485 + }, + { + "epoch": 0.01, + "grad_norm": 2.325480487834395, + "learning_rate": 4.695652173913044e-06, + "loss": 0.5809, + "step": 486 + }, + { + "epoch": 0.01, + "grad_norm": 2.1988092095597063, + "learning_rate": 4.705314009661836e-06, + "loss": 0.5769, + "step": 487 + }, + { + "epoch": 0.01, + "grad_norm": 3.9001604016794453, + "learning_rate": 4.714975845410628e-06, + "loss": 0.5735, + "step": 488 + }, + { + "epoch": 0.01, + "grad_norm": 3.2015614264838246, + "learning_rate": 4.724637681159421e-06, + "loss": 0.5981, + "step": 489 + }, + { + "epoch": 0.01, + "grad_norm": 2.421738936196102, + "learning_rate": 4.7342995169082125e-06, + "loss": 0.5977, + "step": 490 + }, + { + "epoch": 0.01, + "grad_norm": 3.041975132665541, + "learning_rate": 4.743961352657005e-06, + "loss": 0.6141, + "step": 491 + }, + { + "epoch": 0.01, + "grad_norm": 2.543022635102777, + "learning_rate": 4.753623188405798e-06, + "loss": 0.6239, + "step": 492 + }, + { + "epoch": 0.01, + "grad_norm": 2.6417923752555748, + "learning_rate": 4.76328502415459e-06, + "loss": 0.6028, + "step": 493 + }, + { + "epoch": 0.01, + "grad_norm": 1.9942229921994143, + "learning_rate": 4.772946859903382e-06, + "loss": 0.5603, + "step": 494 + }, + { + "epoch": 0.01, + "grad_norm": 2.364809703080118, + "learning_rate": 4.782608695652174e-06, + "loss": 0.5624, + "step": 495 + }, + { + "epoch": 0.01, + "grad_norm": 2.326902047265226, + "learning_rate": 4.7922705314009665e-06, + "loss": 0.6127, + "step": 496 + }, + { + "epoch": 0.01, + "grad_norm": 2.3075475842638378, + "learning_rate": 4.801932367149759e-06, + "loss": 0.6047, + "step": 497 + }, + { + "epoch": 0.01, + "grad_norm": 1.2655112308194252, + "learning_rate": 4.811594202898551e-06, + "loss": 0.5394, + "step": 498 + }, + { + "epoch": 0.01, + "grad_norm": 2.513448887445629, + "learning_rate": 4.821256038647343e-06, + "loss": 0.613, + "step": 499 + }, + { + "epoch": 0.01, + "grad_norm": 2.231275591854179, + "learning_rate": 4.830917874396135e-06, + "loss": 0.5895, + "step": 500 + }, + { + "epoch": 0.01, + "grad_norm": 2.3563098260652193, + "learning_rate": 4.840579710144928e-06, + "loss": 0.6402, + "step": 501 + }, + { + "epoch": 0.01, + "grad_norm": 2.109692699203977, + "learning_rate": 4.8502415458937205e-06, + "loss": 0.6108, + "step": 502 + }, + { + "epoch": 0.01, + "grad_norm": 2.572811719052501, + "learning_rate": 4.859903381642512e-06, + "loss": 0.6378, + "step": 503 + }, + { + "epoch": 0.01, + "grad_norm": 2.4920765782840375, + "learning_rate": 4.869565217391305e-06, + "loss": 0.6259, + "step": 504 + }, + { + "epoch": 0.01, + "grad_norm": 2.1237233156400093, + "learning_rate": 4.879227053140097e-06, + "loss": 0.5942, + "step": 505 + }, + { + "epoch": 0.01, + "grad_norm": 2.17471245068219, + "learning_rate": 4.888888888888889e-06, + "loss": 0.5573, + "step": 506 + }, + { + "epoch": 0.01, + "grad_norm": 2.294758249011204, + "learning_rate": 4.8985507246376815e-06, + "loss": 0.6003, + "step": 507 + }, + { + "epoch": 0.01, + "grad_norm": 2.0781418014276647, + "learning_rate": 4.908212560386474e-06, + "loss": 0.6387, + "step": 508 + }, + { + "epoch": 0.01, + "grad_norm": 2.3582518416773914, + "learning_rate": 4.917874396135266e-06, + "loss": 0.6043, + "step": 509 + }, + { + "epoch": 0.01, + "grad_norm": 2.3218386528846846, + "learning_rate": 4.927536231884059e-06, + "loss": 0.5732, + "step": 510 + }, + { + "epoch": 0.01, + "grad_norm": 2.506299161565645, + "learning_rate": 4.93719806763285e-06, + "loss": 0.6081, + "step": 511 + }, + { + "epoch": 0.01, + "grad_norm": 2.2792195059887446, + "learning_rate": 4.9468599033816425e-06, + "loss": 0.6617, + "step": 512 + }, + { + "epoch": 0.01, + "grad_norm": 4.0830816444947375, + "learning_rate": 4.9565217391304355e-06, + "loss": 0.5646, + "step": 513 + }, + { + "epoch": 0.01, + "grad_norm": 2.34318705933166, + "learning_rate": 4.966183574879228e-06, + "loss": 0.6469, + "step": 514 + }, + { + "epoch": 0.01, + "grad_norm": 2.2468039571642424, + "learning_rate": 4.97584541062802e-06, + "loss": 0.5856, + "step": 515 + }, + { + "epoch": 0.01, + "grad_norm": 2.402571796769481, + "learning_rate": 4.985507246376812e-06, + "loss": 0.5869, + "step": 516 + }, + { + "epoch": 0.01, + "grad_norm": 2.9251002871944163, + "learning_rate": 4.995169082125604e-06, + "loss": 0.6225, + "step": 517 + }, + { + "epoch": 0.02, + "grad_norm": 2.3091007131516825, + "learning_rate": 5.004830917874397e-06, + "loss": 0.655, + "step": 518 + }, + { + "epoch": 0.02, + "grad_norm": 2.471243341970812, + "learning_rate": 5.014492753623189e-06, + "loss": 0.5993, + "step": 519 + }, + { + "epoch": 0.02, + "grad_norm": 2.1114479384686042, + "learning_rate": 5.024154589371981e-06, + "loss": 0.5873, + "step": 520 + }, + { + "epoch": 0.02, + "grad_norm": 2.097353324464618, + "learning_rate": 5.033816425120773e-06, + "loss": 0.6062, + "step": 521 + }, + { + "epoch": 0.02, + "grad_norm": 2.8628279311963296, + "learning_rate": 5.043478260869565e-06, + "loss": 0.5895, + "step": 522 + }, + { + "epoch": 0.02, + "grad_norm": 2.9928606048204762, + "learning_rate": 5.0531400966183575e-06, + "loss": 0.5871, + "step": 523 + }, + { + "epoch": 0.02, + "grad_norm": 2.163955235870307, + "learning_rate": 5.0628019323671505e-06, + "loss": 0.5672, + "step": 524 + }, + { + "epoch": 0.02, + "grad_norm": 2.162390578852151, + "learning_rate": 5.072463768115943e-06, + "loss": 0.6057, + "step": 525 + }, + { + "epoch": 0.02, + "grad_norm": 2.3268734419823427, + "learning_rate": 5.082125603864735e-06, + "loss": 0.6137, + "step": 526 + }, + { + "epoch": 0.02, + "grad_norm": 2.392722374763756, + "learning_rate": 5.091787439613527e-06, + "loss": 0.6063, + "step": 527 + }, + { + "epoch": 0.02, + "grad_norm": 2.2261369814470573, + "learning_rate": 5.101449275362319e-06, + "loss": 0.608, + "step": 528 + }, + { + "epoch": 0.02, + "grad_norm": 2.1643213152165512, + "learning_rate": 5.1111111111111115e-06, + "loss": 0.5894, + "step": 529 + }, + { + "epoch": 0.02, + "grad_norm": 2.640813682462032, + "learning_rate": 5.1207729468599045e-06, + "loss": 0.6025, + "step": 530 + }, + { + "epoch": 0.02, + "grad_norm": 1.9824270948072757, + "learning_rate": 5.130434782608697e-06, + "loss": 0.6185, + "step": 531 + }, + { + "epoch": 0.02, + "grad_norm": 3.5531502187221906, + "learning_rate": 5.140096618357489e-06, + "loss": 0.6308, + "step": 532 + }, + { + "epoch": 0.02, + "grad_norm": 3.520922134675759, + "learning_rate": 5.14975845410628e-06, + "loss": 0.5662, + "step": 533 + }, + { + "epoch": 0.02, + "grad_norm": 2.1838400086669965, + "learning_rate": 5.1594202898550725e-06, + "loss": 0.5357, + "step": 534 + }, + { + "epoch": 0.02, + "grad_norm": 2.1581079155246035, + "learning_rate": 5.169082125603865e-06, + "loss": 0.6153, + "step": 535 + }, + { + "epoch": 0.02, + "grad_norm": 3.4864342404530677, + "learning_rate": 5.178743961352657e-06, + "loss": 0.582, + "step": 536 + }, + { + "epoch": 0.02, + "grad_norm": 2.401589900911072, + "learning_rate": 5.18840579710145e-06, + "loss": 0.5455, + "step": 537 + }, + { + "epoch": 0.02, + "grad_norm": 2.1285824007583556, + "learning_rate": 5.198067632850242e-06, + "loss": 0.5829, + "step": 538 + }, + { + "epoch": 0.02, + "grad_norm": 2.1845799520556337, + "learning_rate": 5.207729468599034e-06, + "loss": 0.5766, + "step": 539 + }, + { + "epoch": 0.02, + "grad_norm": 2.4261759781106487, + "learning_rate": 5.2173913043478265e-06, + "loss": 0.6225, + "step": 540 + }, + { + "epoch": 0.02, + "grad_norm": 2.117684159805507, + "learning_rate": 5.227053140096619e-06, + "loss": 0.5775, + "step": 541 + }, + { + "epoch": 0.02, + "grad_norm": 2.2601541989905143, + "learning_rate": 5.236714975845411e-06, + "loss": 0.5696, + "step": 542 + }, + { + "epoch": 0.02, + "grad_norm": 2.1653252788054993, + "learning_rate": 5.246376811594204e-06, + "loss": 0.5866, + "step": 543 + }, + { + "epoch": 0.02, + "grad_norm": 1.9667830284690209, + "learning_rate": 5.256038647342996e-06, + "loss": 0.5412, + "step": 544 + }, + { + "epoch": 0.02, + "grad_norm": 2.321718081952433, + "learning_rate": 5.265700483091788e-06, + "loss": 0.5567, + "step": 545 + }, + { + "epoch": 0.02, + "grad_norm": 2.100864161816771, + "learning_rate": 5.27536231884058e-06, + "loss": 0.571, + "step": 546 + }, + { + "epoch": 0.02, + "grad_norm": 2.9098569994354424, + "learning_rate": 5.285024154589372e-06, + "loss": 0.6255, + "step": 547 + }, + { + "epoch": 0.02, + "grad_norm": 2.2100942996322677, + "learning_rate": 5.294685990338164e-06, + "loss": 0.5899, + "step": 548 + }, + { + "epoch": 0.02, + "grad_norm": 2.2089558537189773, + "learning_rate": 5.304347826086957e-06, + "loss": 0.5999, + "step": 549 + }, + { + "epoch": 0.02, + "grad_norm": 2.4885447009205026, + "learning_rate": 5.314009661835749e-06, + "loss": 0.5819, + "step": 550 + }, + { + "epoch": 0.02, + "grad_norm": 2.1703243384942326, + "learning_rate": 5.3236714975845415e-06, + "loss": 0.5963, + "step": 551 + }, + { + "epoch": 0.02, + "grad_norm": 2.22737762375415, + "learning_rate": 5.333333333333334e-06, + "loss": 0.5811, + "step": 552 + }, + { + "epoch": 0.02, + "grad_norm": 2.2645368838946935, + "learning_rate": 5.342995169082126e-06, + "loss": 0.5537, + "step": 553 + }, + { + "epoch": 0.02, + "grad_norm": 2.3677301146585945, + "learning_rate": 5.352657004830918e-06, + "loss": 0.5775, + "step": 554 + }, + { + "epoch": 0.02, + "grad_norm": 2.1985697561046154, + "learning_rate": 5.362318840579711e-06, + "loss": 0.6345, + "step": 555 + }, + { + "epoch": 0.02, + "grad_norm": 2.054364320130401, + "learning_rate": 5.371980676328503e-06, + "loss": 0.5663, + "step": 556 + }, + { + "epoch": 0.02, + "grad_norm": 2.382388894106917, + "learning_rate": 5.3816425120772955e-06, + "loss": 0.5611, + "step": 557 + }, + { + "epoch": 0.02, + "grad_norm": 2.3066779643926942, + "learning_rate": 5.391304347826088e-06, + "loss": 0.5706, + "step": 558 + }, + { + "epoch": 0.02, + "grad_norm": 2.4432957947632197, + "learning_rate": 5.40096618357488e-06, + "loss": 0.5831, + "step": 559 + }, + { + "epoch": 0.02, + "grad_norm": 2.3839943079413817, + "learning_rate": 5.410628019323671e-06, + "loss": 0.6185, + "step": 560 + }, + { + "epoch": 0.02, + "grad_norm": 8.312284823828033, + "learning_rate": 5.420289855072465e-06, + "loss": 0.6382, + "step": 561 + }, + { + "epoch": 0.02, + "grad_norm": 2.478513271154717, + "learning_rate": 5.4299516908212564e-06, + "loss": 0.6046, + "step": 562 + }, + { + "epoch": 0.02, + "grad_norm": 2.374682773291184, + "learning_rate": 5.439613526570049e-06, + "loss": 0.5598, + "step": 563 + }, + { + "epoch": 0.02, + "grad_norm": 2.6015359453625053, + "learning_rate": 5.449275362318841e-06, + "loss": 0.5931, + "step": 564 + }, + { + "epoch": 0.02, + "grad_norm": 2.3282721417337284, + "learning_rate": 5.458937198067633e-06, + "loss": 0.6248, + "step": 565 + }, + { + "epoch": 0.02, + "grad_norm": 2.2006560503329995, + "learning_rate": 5.468599033816425e-06, + "loss": 0.6037, + "step": 566 + }, + { + "epoch": 0.02, + "grad_norm": 2.164821659676509, + "learning_rate": 5.478260869565217e-06, + "loss": 0.5859, + "step": 567 + }, + { + "epoch": 0.02, + "grad_norm": 1.8950113179849057, + "learning_rate": 5.4879227053140105e-06, + "loss": 0.6261, + "step": 568 + }, + { + "epoch": 0.02, + "grad_norm": 2.2360408221141177, + "learning_rate": 5.497584541062803e-06, + "loss": 0.5986, + "step": 569 + }, + { + "epoch": 0.02, + "grad_norm": 2.3917499499482853, + "learning_rate": 5.507246376811595e-06, + "loss": 0.5939, + "step": 570 + }, + { + "epoch": 0.02, + "grad_norm": 2.491317737621292, + "learning_rate": 5.516908212560387e-06, + "loss": 0.5583, + "step": 571 + }, + { + "epoch": 0.02, + "grad_norm": 2.2362167904501353, + "learning_rate": 5.526570048309179e-06, + "loss": 0.5795, + "step": 572 + }, + { + "epoch": 0.02, + "grad_norm": 2.0982561302757055, + "learning_rate": 5.536231884057971e-06, + "loss": 0.5577, + "step": 573 + }, + { + "epoch": 0.02, + "grad_norm": 2.492584515364987, + "learning_rate": 5.5458937198067645e-06, + "loss": 0.6171, + "step": 574 + }, + { + "epoch": 0.02, + "grad_norm": 2.116725880667006, + "learning_rate": 5.555555555555557e-06, + "loss": 0.6288, + "step": 575 + }, + { + "epoch": 0.02, + "grad_norm": 2.263943875153671, + "learning_rate": 5.565217391304348e-06, + "loss": 0.5628, + "step": 576 + }, + { + "epoch": 0.02, + "grad_norm": 2.3281611923549606, + "learning_rate": 5.57487922705314e-06, + "loss": 0.5426, + "step": 577 + }, + { + "epoch": 0.02, + "grad_norm": 2.0007036708363186, + "learning_rate": 5.584541062801932e-06, + "loss": 0.5667, + "step": 578 + }, + { + "epoch": 0.02, + "grad_norm": 2.166047198636356, + "learning_rate": 5.594202898550725e-06, + "loss": 0.5774, + "step": 579 + }, + { + "epoch": 0.02, + "grad_norm": 2.13307489048753, + "learning_rate": 5.603864734299518e-06, + "loss": 0.5776, + "step": 580 + }, + { + "epoch": 0.02, + "grad_norm": 2.171541297598018, + "learning_rate": 5.61352657004831e-06, + "loss": 0.6059, + "step": 581 + }, + { + "epoch": 0.02, + "grad_norm": 2.295743586248148, + "learning_rate": 5.623188405797102e-06, + "loss": 0.558, + "step": 582 + }, + { + "epoch": 0.02, + "grad_norm": 2.1842658131589996, + "learning_rate": 5.632850241545894e-06, + "loss": 0.61, + "step": 583 + }, + { + "epoch": 0.02, + "grad_norm": 2.0711816234519485, + "learning_rate": 5.642512077294686e-06, + "loss": 0.5961, + "step": 584 + }, + { + "epoch": 0.02, + "grad_norm": 2.721763632963868, + "learning_rate": 5.652173913043479e-06, + "loss": 0.5385, + "step": 585 + }, + { + "epoch": 0.02, + "grad_norm": 2.1585892138823035, + "learning_rate": 5.661835748792272e-06, + "loss": 0.5842, + "step": 586 + }, + { + "epoch": 0.02, + "grad_norm": 2.269761583033257, + "learning_rate": 5.671497584541064e-06, + "loss": 0.6071, + "step": 587 + }, + { + "epoch": 0.02, + "grad_norm": 2.041713636844157, + "learning_rate": 5.681159420289856e-06, + "loss": 0.5512, + "step": 588 + }, + { + "epoch": 0.02, + "grad_norm": 2.293232083338646, + "learning_rate": 5.690821256038647e-06, + "loss": 0.5525, + "step": 589 + }, + { + "epoch": 0.02, + "grad_norm": 2.0732720143971948, + "learning_rate": 5.70048309178744e-06, + "loss": 0.5385, + "step": 590 + }, + { + "epoch": 0.02, + "grad_norm": 2.147467228743109, + "learning_rate": 5.710144927536232e-06, + "loss": 0.5548, + "step": 591 + }, + { + "epoch": 0.02, + "grad_norm": 2.121121094865633, + "learning_rate": 5.719806763285025e-06, + "loss": 0.5276, + "step": 592 + }, + { + "epoch": 0.02, + "grad_norm": 2.2811681114749156, + "learning_rate": 5.729468599033817e-06, + "loss": 0.5631, + "step": 593 + }, + { + "epoch": 0.02, + "grad_norm": 2.1623048466103123, + "learning_rate": 5.739130434782609e-06, + "loss": 0.5878, + "step": 594 + }, + { + "epoch": 0.02, + "grad_norm": 2.2804556806116825, + "learning_rate": 5.748792270531401e-06, + "loss": 0.6199, + "step": 595 + }, + { + "epoch": 0.02, + "grad_norm": 2.3468410034721097, + "learning_rate": 5.758454106280194e-06, + "loss": 0.5747, + "step": 596 + }, + { + "epoch": 0.02, + "grad_norm": 3.450210188957847, + "learning_rate": 5.768115942028986e-06, + "loss": 0.5372, + "step": 597 + }, + { + "epoch": 0.02, + "grad_norm": 2.4267454171701464, + "learning_rate": 5.777777777777778e-06, + "loss": 0.5865, + "step": 598 + }, + { + "epoch": 0.02, + "grad_norm": 2.3154693327070537, + "learning_rate": 5.787439613526571e-06, + "loss": 0.5696, + "step": 599 + }, + { + "epoch": 0.02, + "grad_norm": 2.133268835337351, + "learning_rate": 5.797101449275363e-06, + "loss": 0.5624, + "step": 600 + }, + { + "epoch": 0.02, + "grad_norm": 2.1766170025182667, + "learning_rate": 5.806763285024155e-06, + "loss": 0.6293, + "step": 601 + }, + { + "epoch": 0.02, + "grad_norm": 2.291776717080552, + "learning_rate": 5.816425120772947e-06, + "loss": 0.5981, + "step": 602 + }, + { + "epoch": 0.02, + "grad_norm": 2.086430112613317, + "learning_rate": 5.826086956521739e-06, + "loss": 0.5426, + "step": 603 + }, + { + "epoch": 0.02, + "grad_norm": 2.325566202013443, + "learning_rate": 5.835748792270531e-06, + "loss": 0.5839, + "step": 604 + }, + { + "epoch": 0.02, + "grad_norm": 2.107305939906638, + "learning_rate": 5.845410628019324e-06, + "loss": 0.6001, + "step": 605 + }, + { + "epoch": 0.02, + "grad_norm": 34.224183862836774, + "learning_rate": 5.855072463768116e-06, + "loss": 0.68, + "step": 606 + }, + { + "epoch": 0.02, + "grad_norm": 2.541954846437248, + "learning_rate": 5.864734299516909e-06, + "loss": 0.5729, + "step": 607 + }, + { + "epoch": 0.02, + "grad_norm": 2.310450103602232, + "learning_rate": 5.874396135265701e-06, + "loss": 0.5584, + "step": 608 + }, + { + "epoch": 0.02, + "grad_norm": 2.2446434671562177, + "learning_rate": 5.884057971014493e-06, + "loss": 0.6083, + "step": 609 + }, + { + "epoch": 0.02, + "grad_norm": 2.0662711979376076, + "learning_rate": 5.893719806763285e-06, + "loss": 0.5477, + "step": 610 + }, + { + "epoch": 0.02, + "grad_norm": 2.6802271627321095, + "learning_rate": 5.903381642512078e-06, + "loss": 0.5751, + "step": 611 + }, + { + "epoch": 0.02, + "grad_norm": 2.386221252031209, + "learning_rate": 5.91304347826087e-06, + "loss": 0.5492, + "step": 612 + }, + { + "epoch": 0.02, + "grad_norm": 2.1764288671858396, + "learning_rate": 5.922705314009663e-06, + "loss": 0.6073, + "step": 613 + }, + { + "epoch": 0.02, + "grad_norm": 2.440924381750186, + "learning_rate": 5.932367149758455e-06, + "loss": 0.5428, + "step": 614 + }, + { + "epoch": 0.02, + "grad_norm": 2.1851225979344755, + "learning_rate": 5.942028985507247e-06, + "loss": 0.6104, + "step": 615 + }, + { + "epoch": 0.02, + "grad_norm": 2.5420797240570723, + "learning_rate": 5.951690821256038e-06, + "loss": 0.6046, + "step": 616 + }, + { + "epoch": 0.02, + "grad_norm": 2.1439786391383997, + "learning_rate": 5.961352657004832e-06, + "loss": 0.5854, + "step": 617 + }, + { + "epoch": 0.02, + "grad_norm": 2.469873639599263, + "learning_rate": 5.9710144927536236e-06, + "loss": 0.554, + "step": 618 + }, + { + "epoch": 0.02, + "grad_norm": 2.5298320032561383, + "learning_rate": 5.980676328502416e-06, + "loss": 0.5647, + "step": 619 + }, + { + "epoch": 0.02, + "grad_norm": 3.067089330118625, + "learning_rate": 5.990338164251208e-06, + "loss": 0.5947, + "step": 620 + }, + { + "epoch": 0.02, + "grad_norm": 1.7641927247067426, + "learning_rate": 6e-06, + "loss": 0.5534, + "step": 621 + }, + { + "epoch": 0.02, + "grad_norm": 2.800520214334731, + "learning_rate": 6.009661835748792e-06, + "loss": 0.5563, + "step": 622 + }, + { + "epoch": 0.02, + "grad_norm": 2.190827597674709, + "learning_rate": 6.019323671497585e-06, + "loss": 0.5764, + "step": 623 + }, + { + "epoch": 0.02, + "grad_norm": 2.392016070852951, + "learning_rate": 6.028985507246378e-06, + "loss": 0.5626, + "step": 624 + }, + { + "epoch": 0.02, + "grad_norm": 2.2741536064323844, + "learning_rate": 6.03864734299517e-06, + "loss": 0.5626, + "step": 625 + }, + { + "epoch": 0.02, + "grad_norm": 1.096582752289835, + "learning_rate": 6.048309178743962e-06, + "loss": 0.5558, + "step": 626 + }, + { + "epoch": 0.02, + "grad_norm": 2.278405173060657, + "learning_rate": 6.057971014492754e-06, + "loss": 0.557, + "step": 627 + }, + { + "epoch": 0.02, + "grad_norm": 2.547202719578397, + "learning_rate": 6.067632850241546e-06, + "loss": 0.5842, + "step": 628 + }, + { + "epoch": 0.02, + "grad_norm": 2.2177143871493796, + "learning_rate": 6.077294685990338e-06, + "loss": 0.5331, + "step": 629 + }, + { + "epoch": 0.02, + "grad_norm": 3.4466393198508816, + "learning_rate": 6.086956521739132e-06, + "loss": 0.6299, + "step": 630 + }, + { + "epoch": 0.02, + "grad_norm": 2.8291617108334948, + "learning_rate": 6.096618357487924e-06, + "loss": 0.5318, + "step": 631 + }, + { + "epoch": 0.02, + "grad_norm": 2.161077849237266, + "learning_rate": 6.106280193236715e-06, + "loss": 0.5512, + "step": 632 + }, + { + "epoch": 0.02, + "grad_norm": 1.341651755408617, + "learning_rate": 6.115942028985507e-06, + "loss": 0.5341, + "step": 633 + }, + { + "epoch": 0.02, + "grad_norm": 2.280128871861402, + "learning_rate": 6.1256038647342995e-06, + "loss": 0.5944, + "step": 634 + }, + { + "epoch": 0.02, + "grad_norm": 2.5781692568225427, + "learning_rate": 6.135265700483092e-06, + "loss": 0.6135, + "step": 635 + }, + { + "epoch": 0.02, + "grad_norm": 2.6046783482700815, + "learning_rate": 6.144927536231885e-06, + "loss": 0.568, + "step": 636 + }, + { + "epoch": 0.02, + "grad_norm": 2.6186529049597542, + "learning_rate": 6.154589371980677e-06, + "loss": 0.5802, + "step": 637 + }, + { + "epoch": 0.02, + "grad_norm": 2.6777503296234393, + "learning_rate": 6.164251207729469e-06, + "loss": 0.5845, + "step": 638 + }, + { + "epoch": 0.02, + "grad_norm": 2.1891082011020044, + "learning_rate": 6.173913043478261e-06, + "loss": 0.6044, + "step": 639 + }, + { + "epoch": 0.02, + "grad_norm": 2.3700519634192525, + "learning_rate": 6.1835748792270535e-06, + "loss": 0.5792, + "step": 640 + }, + { + "epoch": 0.02, + "grad_norm": 4.309399872031817, + "learning_rate": 6.193236714975846e-06, + "loss": 0.614, + "step": 641 + }, + { + "epoch": 0.02, + "grad_norm": 2.620092650918923, + "learning_rate": 6.202898550724639e-06, + "loss": 0.5512, + "step": 642 + }, + { + "epoch": 0.02, + "grad_norm": 2.135079290442314, + "learning_rate": 6.212560386473431e-06, + "loss": 0.5834, + "step": 643 + }, + { + "epoch": 0.02, + "grad_norm": 2.3203320951182995, + "learning_rate": 6.222222222222223e-06, + "loss": 0.5842, + "step": 644 + }, + { + "epoch": 0.02, + "grad_norm": 2.385521594983988, + "learning_rate": 6.2318840579710145e-06, + "loss": 0.5624, + "step": 645 + }, + { + "epoch": 0.02, + "grad_norm": 2.2957259080890267, + "learning_rate": 6.241545893719807e-06, + "loss": 0.5415, + "step": 646 + }, + { + "epoch": 0.02, + "grad_norm": 2.4083157049857244, + "learning_rate": 6.251207729468599e-06, + "loss": 0.551, + "step": 647 + }, + { + "epoch": 0.02, + "grad_norm": 2.2971914419447845, + "learning_rate": 6.260869565217392e-06, + "loss": 0.5258, + "step": 648 + }, + { + "epoch": 0.02, + "grad_norm": 2.575689207066459, + "learning_rate": 6.270531400966184e-06, + "loss": 0.584, + "step": 649 + }, + { + "epoch": 0.02, + "grad_norm": 1.4732341445106774, + "learning_rate": 6.280193236714976e-06, + "loss": 0.5499, + "step": 650 + }, + { + "epoch": 0.02, + "grad_norm": 3.0118057288041737, + "learning_rate": 6.2898550724637685e-06, + "loss": 0.5817, + "step": 651 + }, + { + "epoch": 0.02, + "grad_norm": 2.7779350135730625, + "learning_rate": 6.299516908212561e-06, + "loss": 0.6104, + "step": 652 + }, + { + "epoch": 0.02, + "grad_norm": 2.2292226386227307, + "learning_rate": 6.309178743961353e-06, + "loss": 0.5562, + "step": 653 + }, + { + "epoch": 0.02, + "grad_norm": 2.3050519620712695, + "learning_rate": 6.318840579710146e-06, + "loss": 0.5881, + "step": 654 + }, + { + "epoch": 0.02, + "grad_norm": 2.407353733321777, + "learning_rate": 6.328502415458938e-06, + "loss": 0.5287, + "step": 655 + }, + { + "epoch": 0.02, + "grad_norm": 2.465853108383857, + "learning_rate": 6.33816425120773e-06, + "loss": 0.5645, + "step": 656 + }, + { + "epoch": 0.02, + "grad_norm": 2.3182083839107714, + "learning_rate": 6.3478260869565225e-06, + "loss": 0.5784, + "step": 657 + }, + { + "epoch": 0.02, + "grad_norm": 2.126580917066559, + "learning_rate": 6.357487922705315e-06, + "loss": 0.5328, + "step": 658 + }, + { + "epoch": 0.02, + "grad_norm": 2.213623237652207, + "learning_rate": 6.367149758454106e-06, + "loss": 0.5523, + "step": 659 + }, + { + "epoch": 0.02, + "grad_norm": 2.2495151767932926, + "learning_rate": 6.376811594202898e-06, + "loss": 0.5685, + "step": 660 + }, + { + "epoch": 0.02, + "grad_norm": 3.89569871830028, + "learning_rate": 6.386473429951691e-06, + "loss": 0.5534, + "step": 661 + }, + { + "epoch": 0.02, + "grad_norm": 2.8525458628505573, + "learning_rate": 6.3961352657004835e-06, + "loss": 0.5596, + "step": 662 + }, + { + "epoch": 0.02, + "grad_norm": 2.4936190535699723, + "learning_rate": 6.405797101449276e-06, + "loss": 0.5909, + "step": 663 + }, + { + "epoch": 0.02, + "grad_norm": 2.506113622087724, + "learning_rate": 6.415458937198068e-06, + "loss": 0.6238, + "step": 664 + }, + { + "epoch": 0.02, + "grad_norm": 5.15583121698742, + "learning_rate": 6.42512077294686e-06, + "loss": 0.5543, + "step": 665 + }, + { + "epoch": 0.02, + "grad_norm": 11.186975232111731, + "learning_rate": 6.434782608695652e-06, + "loss": 0.5518, + "step": 666 + }, + { + "epoch": 0.02, + "grad_norm": 2.3870720060608472, + "learning_rate": 6.444444444444445e-06, + "loss": 0.5884, + "step": 667 + }, + { + "epoch": 0.02, + "grad_norm": 2.437617748329482, + "learning_rate": 6.4541062801932375e-06, + "loss": 0.5501, + "step": 668 + }, + { + "epoch": 0.02, + "grad_norm": 2.435185899025229, + "learning_rate": 6.46376811594203e-06, + "loss": 0.5291, + "step": 669 + }, + { + "epoch": 0.02, + "grad_norm": 2.923551904028663, + "learning_rate": 6.473429951690822e-06, + "loss": 0.5364, + "step": 670 + }, + { + "epoch": 0.02, + "grad_norm": 2.222750212535448, + "learning_rate": 6.483091787439614e-06, + "loss": 0.53, + "step": 671 + }, + { + "epoch": 0.02, + "grad_norm": 2.6219471857527075, + "learning_rate": 6.4927536231884055e-06, + "loss": 0.5535, + "step": 672 + }, + { + "epoch": 0.02, + "grad_norm": 2.2501224926484626, + "learning_rate": 6.502415458937199e-06, + "loss": 0.5459, + "step": 673 + }, + { + "epoch": 0.02, + "grad_norm": 2.417163773685382, + "learning_rate": 6.5120772946859915e-06, + "loss": 0.6199, + "step": 674 + }, + { + "epoch": 0.02, + "grad_norm": 2.340104982669897, + "learning_rate": 6.521739130434783e-06, + "loss": 0.5241, + "step": 675 + }, + { + "epoch": 0.02, + "grad_norm": 2.9114085974543444, + "learning_rate": 6.531400966183575e-06, + "loss": 0.5355, + "step": 676 + }, + { + "epoch": 0.02, + "grad_norm": 2.2893881682919153, + "learning_rate": 6.541062801932367e-06, + "loss": 0.5233, + "step": 677 + }, + { + "epoch": 0.02, + "grad_norm": 2.757425367926869, + "learning_rate": 6.5507246376811595e-06, + "loss": 0.5636, + "step": 678 + }, + { + "epoch": 0.02, + "grad_norm": 2.196030829946355, + "learning_rate": 6.5603864734299525e-06, + "loss": 0.5913, + "step": 679 + }, + { + "epoch": 0.02, + "grad_norm": 2.3368738944116254, + "learning_rate": 6.570048309178745e-06, + "loss": 0.6116, + "step": 680 + }, + { + "epoch": 0.02, + "grad_norm": 2.9007706098863912, + "learning_rate": 6.579710144927537e-06, + "loss": 0.5324, + "step": 681 + }, + { + "epoch": 0.02, + "grad_norm": 2.2040438792371653, + "learning_rate": 6.589371980676329e-06, + "loss": 0.5935, + "step": 682 + }, + { + "epoch": 0.02, + "grad_norm": 2.062083411293085, + "learning_rate": 6.599033816425121e-06, + "loss": 0.5459, + "step": 683 + }, + { + "epoch": 0.02, + "grad_norm": 1.3421926808701938, + "learning_rate": 6.6086956521739135e-06, + "loss": 0.5247, + "step": 684 + }, + { + "epoch": 0.02, + "grad_norm": 1.2632479173141584, + "learning_rate": 6.6183574879227065e-06, + "loss": 0.5239, + "step": 685 + }, + { + "epoch": 0.02, + "grad_norm": 3.0298157531349617, + "learning_rate": 6.628019323671499e-06, + "loss": 0.5301, + "step": 686 + }, + { + "epoch": 0.02, + "grad_norm": 2.509320962948071, + "learning_rate": 6.637681159420291e-06, + "loss": 0.561, + "step": 687 + }, + { + "epoch": 0.02, + "grad_norm": 2.18097818160301, + "learning_rate": 6.647342995169082e-06, + "loss": 0.549, + "step": 688 + }, + { + "epoch": 0.02, + "grad_norm": 2.249451933966953, + "learning_rate": 6.6570048309178745e-06, + "loss": 0.5416, + "step": 689 + }, + { + "epoch": 0.02, + "grad_norm": 2.604616877694076, + "learning_rate": 6.666666666666667e-06, + "loss": 0.564, + "step": 690 + }, + { + "epoch": 0.02, + "grad_norm": 2.2705351612738927, + "learning_rate": 6.676328502415459e-06, + "loss": 0.5313, + "step": 691 + }, + { + "epoch": 0.02, + "grad_norm": 2.225282332795569, + "learning_rate": 6.685990338164252e-06, + "loss": 0.547, + "step": 692 + }, + { + "epoch": 0.02, + "grad_norm": 2.3235348886848626, + "learning_rate": 6.695652173913044e-06, + "loss": 0.587, + "step": 693 + }, + { + "epoch": 0.02, + "grad_norm": 2.638026658812123, + "learning_rate": 6.705314009661836e-06, + "loss": 0.5614, + "step": 694 + }, + { + "epoch": 0.02, + "grad_norm": 2.1722624638818853, + "learning_rate": 6.7149758454106285e-06, + "loss": 0.5343, + "step": 695 + }, + { + "epoch": 0.02, + "grad_norm": 2.496173848014995, + "learning_rate": 6.724637681159421e-06, + "loss": 0.5411, + "step": 696 + }, + { + "epoch": 0.02, + "grad_norm": 2.4772696128075924, + "learning_rate": 6.734299516908213e-06, + "loss": 0.5617, + "step": 697 + }, + { + "epoch": 0.02, + "grad_norm": 2.343086482532649, + "learning_rate": 6.743961352657006e-06, + "loss": 0.5271, + "step": 698 + }, + { + "epoch": 0.02, + "grad_norm": 2.1970791948688384, + "learning_rate": 6.753623188405798e-06, + "loss": 0.5298, + "step": 699 + }, + { + "epoch": 0.02, + "grad_norm": 2.1807236528643137, + "learning_rate": 6.76328502415459e-06, + "loss": 0.566, + "step": 700 + }, + { + "epoch": 0.02, + "grad_norm": 2.1818235820984593, + "learning_rate": 6.7729468599033825e-06, + "loss": 0.5056, + "step": 701 + }, + { + "epoch": 0.02, + "grad_norm": 2.5812605516963187, + "learning_rate": 6.782608695652174e-06, + "loss": 0.5811, + "step": 702 + }, + { + "epoch": 0.02, + "grad_norm": 2.3250801672215107, + "learning_rate": 6.792270531400966e-06, + "loss": 0.5189, + "step": 703 + }, + { + "epoch": 0.02, + "grad_norm": 2.254139365786551, + "learning_rate": 6.801932367149759e-06, + "loss": 0.5408, + "step": 704 + }, + { + "epoch": 0.02, + "grad_norm": 2.499778612748973, + "learning_rate": 6.811594202898551e-06, + "loss": 0.5545, + "step": 705 + }, + { + "epoch": 0.02, + "grad_norm": 2.581761019311022, + "learning_rate": 6.8212560386473435e-06, + "loss": 0.5851, + "step": 706 + }, + { + "epoch": 0.02, + "grad_norm": 2.1714624767295114, + "learning_rate": 6.830917874396136e-06, + "loss": 0.5648, + "step": 707 + }, + { + "epoch": 0.02, + "grad_norm": 2.4990886085608075, + "learning_rate": 6.840579710144928e-06, + "loss": 0.5837, + "step": 708 + }, + { + "epoch": 0.02, + "grad_norm": 2.3179859419597264, + "learning_rate": 6.85024154589372e-06, + "loss": 0.6566, + "step": 709 + }, + { + "epoch": 0.02, + "grad_norm": 2.147676336857773, + "learning_rate": 6.859903381642513e-06, + "loss": 0.5726, + "step": 710 + }, + { + "epoch": 0.02, + "grad_norm": 2.2954680893528976, + "learning_rate": 6.869565217391305e-06, + "loss": 0.5944, + "step": 711 + }, + { + "epoch": 0.02, + "grad_norm": 2.450273043547352, + "learning_rate": 6.8792270531400975e-06, + "loss": 0.6034, + "step": 712 + }, + { + "epoch": 0.02, + "grad_norm": 1.9826830986247697, + "learning_rate": 6.88888888888889e-06, + "loss": 0.5361, + "step": 713 + }, + { + "epoch": 0.02, + "grad_norm": 2.4137626519907585, + "learning_rate": 6.898550724637682e-06, + "loss": 0.5514, + "step": 714 + }, + { + "epoch": 0.02, + "grad_norm": 2.3731093850768725, + "learning_rate": 6.908212560386473e-06, + "loss": 0.5198, + "step": 715 + }, + { + "epoch": 0.02, + "grad_norm": 2.5824955344371023, + "learning_rate": 6.917874396135267e-06, + "loss": 0.5643, + "step": 716 + }, + { + "epoch": 0.02, + "grad_norm": 2.2117500127849663, + "learning_rate": 6.927536231884059e-06, + "loss": 0.5358, + "step": 717 + }, + { + "epoch": 0.02, + "grad_norm": 4.109319814707089, + "learning_rate": 6.937198067632851e-06, + "loss": 0.614, + "step": 718 + }, + { + "epoch": 0.02, + "grad_norm": 2.5405566525049803, + "learning_rate": 6.946859903381643e-06, + "loss": 0.5914, + "step": 719 + }, + { + "epoch": 0.02, + "grad_norm": 2.1918960218417123, + "learning_rate": 6.956521739130435e-06, + "loss": 0.5503, + "step": 720 + }, + { + "epoch": 0.02, + "grad_norm": 2.2282342346558814, + "learning_rate": 6.966183574879227e-06, + "loss": 0.565, + "step": 721 + }, + { + "epoch": 0.02, + "grad_norm": 2.502450050613349, + "learning_rate": 6.975845410628019e-06, + "loss": 0.6459, + "step": 722 + }, + { + "epoch": 0.02, + "grad_norm": 2.2048929949089002, + "learning_rate": 6.9855072463768125e-06, + "loss": 0.5521, + "step": 723 + }, + { + "epoch": 0.02, + "grad_norm": 2.082422151820018, + "learning_rate": 6.995169082125605e-06, + "loss": 0.5366, + "step": 724 + }, + { + "epoch": 0.02, + "grad_norm": 2.1074938369022824, + "learning_rate": 7.004830917874397e-06, + "loss": 0.5843, + "step": 725 + }, + { + "epoch": 0.02, + "grad_norm": 2.1448149028349874, + "learning_rate": 7.014492753623189e-06, + "loss": 0.5814, + "step": 726 + }, + { + "epoch": 0.02, + "grad_norm": 2.080197169609223, + "learning_rate": 7.024154589371981e-06, + "loss": 0.5568, + "step": 727 + }, + { + "epoch": 0.02, + "grad_norm": 2.3408318446046206, + "learning_rate": 7.033816425120773e-06, + "loss": 0.5561, + "step": 728 + }, + { + "epoch": 0.02, + "grad_norm": 4.489655806981211, + "learning_rate": 7.0434782608695665e-06, + "loss": 0.5796, + "step": 729 + }, + { + "epoch": 0.02, + "grad_norm": 2.162746081077374, + "learning_rate": 7.053140096618359e-06, + "loss": 0.5829, + "step": 730 + }, + { + "epoch": 0.02, + "grad_norm": 2.347180148580784, + "learning_rate": 7.06280193236715e-06, + "loss": 0.5583, + "step": 731 + }, + { + "epoch": 0.02, + "grad_norm": 2.4403729690196223, + "learning_rate": 7.072463768115942e-06, + "loss": 0.5466, + "step": 732 + }, + { + "epoch": 0.02, + "grad_norm": 2.1399547754936874, + "learning_rate": 7.082125603864734e-06, + "loss": 0.5866, + "step": 733 + }, + { + "epoch": 0.02, + "grad_norm": 3.023300191732401, + "learning_rate": 7.091787439613527e-06, + "loss": 0.5798, + "step": 734 + }, + { + "epoch": 0.02, + "grad_norm": 2.260290295976808, + "learning_rate": 7.10144927536232e-06, + "loss": 0.5995, + "step": 735 + }, + { + "epoch": 0.02, + "grad_norm": 2.4664260617223897, + "learning_rate": 7.111111111111112e-06, + "loss": 0.5077, + "step": 736 + }, + { + "epoch": 0.02, + "grad_norm": 2.04363939629908, + "learning_rate": 7.120772946859904e-06, + "loss": 0.5389, + "step": 737 + }, + { + "epoch": 0.02, + "grad_norm": 2.2723961198503284, + "learning_rate": 7.130434782608696e-06, + "loss": 0.4933, + "step": 738 + }, + { + "epoch": 0.02, + "grad_norm": 2.381577853765457, + "learning_rate": 7.140096618357488e-06, + "loss": 0.5332, + "step": 739 + }, + { + "epoch": 0.02, + "grad_norm": 3.4096800069953637, + "learning_rate": 7.149758454106281e-06, + "loss": 0.6091, + "step": 740 + }, + { + "epoch": 0.02, + "grad_norm": 2.5796211775077995, + "learning_rate": 7.159420289855074e-06, + "loss": 0.5774, + "step": 741 + }, + { + "epoch": 0.02, + "grad_norm": 2.14083668937554, + "learning_rate": 7.169082125603866e-06, + "loss": 0.5825, + "step": 742 + }, + { + "epoch": 0.02, + "grad_norm": 2.501270378697401, + "learning_rate": 7.178743961352658e-06, + "loss": 0.5462, + "step": 743 + }, + { + "epoch": 0.02, + "grad_norm": 2.121309027282305, + "learning_rate": 7.18840579710145e-06, + "loss": 0.5261, + "step": 744 + }, + { + "epoch": 0.02, + "grad_norm": 2.2560306134471584, + "learning_rate": 7.1980676328502416e-06, + "loss": 0.5623, + "step": 745 + }, + { + "epoch": 0.02, + "grad_norm": 2.0294309475191987, + "learning_rate": 7.207729468599034e-06, + "loss": 0.5381, + "step": 746 + }, + { + "epoch": 0.02, + "grad_norm": 2.339297611815179, + "learning_rate": 7.217391304347827e-06, + "loss": 0.5599, + "step": 747 + }, + { + "epoch": 0.02, + "grad_norm": 2.0554471144684965, + "learning_rate": 7.227053140096619e-06, + "loss": 0.5636, + "step": 748 + }, + { + "epoch": 0.02, + "grad_norm": 2.0541644037363906, + "learning_rate": 7.236714975845411e-06, + "loss": 0.6231, + "step": 749 + }, + { + "epoch": 0.02, + "grad_norm": 2.7505117128418886, + "learning_rate": 7.246376811594203e-06, + "loss": 0.5611, + "step": 750 + }, + { + "epoch": 0.02, + "grad_norm": 1.376069862209617, + "learning_rate": 7.256038647342996e-06, + "loss": 0.5998, + "step": 751 + }, + { + "epoch": 0.02, + "grad_norm": 2.4439597481156476, + "learning_rate": 7.265700483091788e-06, + "loss": 0.5989, + "step": 752 + }, + { + "epoch": 0.02, + "grad_norm": 1.2698574742474955, + "learning_rate": 7.27536231884058e-06, + "loss": 0.5688, + "step": 753 + }, + { + "epoch": 0.02, + "grad_norm": 2.0993893985752115, + "learning_rate": 7.285024154589373e-06, + "loss": 0.5396, + "step": 754 + }, + { + "epoch": 0.02, + "grad_norm": 2.09374910044753, + "learning_rate": 7.294685990338165e-06, + "loss": 0.5644, + "step": 755 + }, + { + "epoch": 0.02, + "grad_norm": 1.1411755088507431, + "learning_rate": 7.304347826086957e-06, + "loss": 0.5731, + "step": 756 + }, + { + "epoch": 0.02, + "grad_norm": 2.2012274711648407, + "learning_rate": 7.31400966183575e-06, + "loss": 0.5518, + "step": 757 + }, + { + "epoch": 0.02, + "grad_norm": 2.375200568358663, + "learning_rate": 7.323671497584541e-06, + "loss": 0.5914, + "step": 758 + }, + { + "epoch": 0.02, + "grad_norm": 2.327021072910642, + "learning_rate": 7.333333333333333e-06, + "loss": 0.5706, + "step": 759 + }, + { + "epoch": 0.02, + "grad_norm": 2.0705862646517854, + "learning_rate": 7.342995169082127e-06, + "loss": 0.5375, + "step": 760 + }, + { + "epoch": 0.02, + "grad_norm": 2.2092822084621924, + "learning_rate": 7.352657004830918e-06, + "loss": 0.6067, + "step": 761 + }, + { + "epoch": 0.02, + "grad_norm": 2.042584062761117, + "learning_rate": 7.3623188405797106e-06, + "loss": 0.541, + "step": 762 + }, + { + "epoch": 0.02, + "grad_norm": 2.4282668926006186, + "learning_rate": 7.371980676328503e-06, + "loss": 0.5434, + "step": 763 + }, + { + "epoch": 0.02, + "grad_norm": 2.1788783804944885, + "learning_rate": 7.381642512077295e-06, + "loss": 0.5995, + "step": 764 + }, + { + "epoch": 0.02, + "grad_norm": 2.094896910457354, + "learning_rate": 7.391304347826087e-06, + "loss": 0.6145, + "step": 765 + }, + { + "epoch": 0.02, + "grad_norm": 2.3357525093960216, + "learning_rate": 7.40096618357488e-06, + "loss": 0.5361, + "step": 766 + }, + { + "epoch": 0.02, + "grad_norm": 2.175957732308539, + "learning_rate": 7.410628019323672e-06, + "loss": 0.5457, + "step": 767 + }, + { + "epoch": 0.02, + "grad_norm": 2.1813958086654788, + "learning_rate": 7.420289855072465e-06, + "loss": 0.5828, + "step": 768 + }, + { + "epoch": 0.02, + "grad_norm": 2.3719476144776572, + "learning_rate": 7.429951690821257e-06, + "loss": 0.5558, + "step": 769 + }, + { + "epoch": 0.02, + "grad_norm": 2.378777102279438, + "learning_rate": 7.439613526570049e-06, + "loss": 0.524, + "step": 770 + }, + { + "epoch": 0.02, + "grad_norm": 2.423291939284444, + "learning_rate": 7.44927536231884e-06, + "loss": 0.5411, + "step": 771 + }, + { + "epoch": 0.02, + "grad_norm": 2.1729362085203037, + "learning_rate": 7.458937198067634e-06, + "loss": 0.5753, + "step": 772 + }, + { + "epoch": 0.02, + "grad_norm": 2.270170911240875, + "learning_rate": 7.468599033816426e-06, + "loss": 0.5644, + "step": 773 + }, + { + "epoch": 0.02, + "grad_norm": 2.4511001886516364, + "learning_rate": 7.478260869565218e-06, + "loss": 0.5424, + "step": 774 + }, + { + "epoch": 0.02, + "grad_norm": 2.265753871870641, + "learning_rate": 7.48792270531401e-06, + "loss": 0.5384, + "step": 775 + }, + { + "epoch": 0.02, + "grad_norm": 2.2060278003709297, + "learning_rate": 7.497584541062802e-06, + "loss": 0.5526, + "step": 776 + }, + { + "epoch": 0.02, + "grad_norm": 2.1761464560760007, + "learning_rate": 7.507246376811594e-06, + "loss": 0.5607, + "step": 777 + }, + { + "epoch": 0.02, + "grad_norm": 2.0195636048311614, + "learning_rate": 7.516908212560387e-06, + "loss": 0.5181, + "step": 778 + }, + { + "epoch": 0.02, + "grad_norm": 2.019245469221886, + "learning_rate": 7.5265700483091796e-06, + "loss": 0.5333, + "step": 779 + }, + { + "epoch": 0.02, + "grad_norm": 2.4521956986590934, + "learning_rate": 7.536231884057972e-06, + "loss": 0.5752, + "step": 780 + }, + { + "epoch": 0.02, + "grad_norm": 2.4751196166913263, + "learning_rate": 7.545893719806764e-06, + "loss": 0.5553, + "step": 781 + }, + { + "epoch": 0.02, + "grad_norm": 2.238710321517815, + "learning_rate": 7.555555555555556e-06, + "loss": 0.5747, + "step": 782 + }, + { + "epoch": 0.02, + "grad_norm": 2.36756902780346, + "learning_rate": 7.565217391304348e-06, + "loss": 0.5568, + "step": 783 + }, + { + "epoch": 0.02, + "grad_norm": 3.0667413908623793, + "learning_rate": 7.574879227053141e-06, + "loss": 0.5518, + "step": 784 + }, + { + "epoch": 0.02, + "grad_norm": 9.601984291674473, + "learning_rate": 7.584541062801934e-06, + "loss": 0.5287, + "step": 785 + }, + { + "epoch": 0.02, + "grad_norm": 2.309016086630243, + "learning_rate": 7.594202898550726e-06, + "loss": 0.561, + "step": 786 + }, + { + "epoch": 0.02, + "grad_norm": 2.378007026508888, + "learning_rate": 7.603864734299517e-06, + "loss": 0.53, + "step": 787 + }, + { + "epoch": 0.02, + "grad_norm": 2.22506387512355, + "learning_rate": 7.613526570048309e-06, + "loss": 0.582, + "step": 788 + }, + { + "epoch": 0.02, + "grad_norm": 2.301742211210203, + "learning_rate": 7.6231884057971015e-06, + "loss": 0.6027, + "step": 789 + }, + { + "epoch": 0.02, + "grad_norm": 2.084184466928866, + "learning_rate": 7.632850241545895e-06, + "loss": 0.5797, + "step": 790 + }, + { + "epoch": 0.02, + "grad_norm": 2.009906876745183, + "learning_rate": 7.642512077294687e-06, + "loss": 0.5896, + "step": 791 + }, + { + "epoch": 0.02, + "grad_norm": 2.9509064247485566, + "learning_rate": 7.652173913043479e-06, + "loss": 0.5818, + "step": 792 + }, + { + "epoch": 0.02, + "grad_norm": 2.3475465776940245, + "learning_rate": 7.661835748792271e-06, + "loss": 0.5095, + "step": 793 + }, + { + "epoch": 0.02, + "grad_norm": 2.2374767427561824, + "learning_rate": 7.671497584541063e-06, + "loss": 0.5771, + "step": 794 + }, + { + "epoch": 0.02, + "grad_norm": 1.7686947767641608, + "learning_rate": 7.681159420289856e-06, + "loss": 0.5692, + "step": 795 + }, + { + "epoch": 0.02, + "grad_norm": 2.505152187852343, + "learning_rate": 7.690821256038648e-06, + "loss": 0.5845, + "step": 796 + }, + { + "epoch": 0.02, + "grad_norm": 2.4267470814307948, + "learning_rate": 7.70048309178744e-06, + "loss": 0.5644, + "step": 797 + }, + { + "epoch": 0.02, + "grad_norm": 2.3782160179598026, + "learning_rate": 7.710144927536232e-06, + "loss": 0.5603, + "step": 798 + }, + { + "epoch": 0.02, + "grad_norm": 2.346469327310697, + "learning_rate": 7.719806763285024e-06, + "loss": 0.5721, + "step": 799 + }, + { + "epoch": 0.02, + "grad_norm": 2.1822199429277673, + "learning_rate": 7.729468599033817e-06, + "loss": 0.5686, + "step": 800 + }, + { + "epoch": 0.02, + "grad_norm": 2.3813953711671485, + "learning_rate": 7.739130434782609e-06, + "loss": 0.5767, + "step": 801 + }, + { + "epoch": 0.02, + "grad_norm": 2.1050164962373983, + "learning_rate": 7.748792270531401e-06, + "loss": 0.5792, + "step": 802 + }, + { + "epoch": 0.02, + "grad_norm": 2.046883747968651, + "learning_rate": 7.758454106280195e-06, + "loss": 0.5271, + "step": 803 + }, + { + "epoch": 0.02, + "grad_norm": 2.142551173741456, + "learning_rate": 7.768115942028987e-06, + "loss": 0.5241, + "step": 804 + }, + { + "epoch": 0.02, + "grad_norm": 1.9195864514945147, + "learning_rate": 7.77777777777778e-06, + "loss": 0.5106, + "step": 805 + }, + { + "epoch": 0.02, + "grad_norm": 2.4081833772147996, + "learning_rate": 7.787439613526571e-06, + "loss": 0.5365, + "step": 806 + }, + { + "epoch": 0.02, + "grad_norm": 2.4911409984928357, + "learning_rate": 7.797101449275364e-06, + "loss": 0.5827, + "step": 807 + }, + { + "epoch": 0.02, + "grad_norm": 2.479670870558392, + "learning_rate": 7.806763285024154e-06, + "loss": 0.5419, + "step": 808 + }, + { + "epoch": 0.02, + "grad_norm": 2.200077481429668, + "learning_rate": 7.816425120772948e-06, + "loss": 0.5404, + "step": 809 + }, + { + "epoch": 0.02, + "grad_norm": 2.3706799160648617, + "learning_rate": 7.82608695652174e-06, + "loss": 0.5525, + "step": 810 + }, + { + "epoch": 0.02, + "grad_norm": 2.513827427972404, + "learning_rate": 7.835748792270532e-06, + "loss": 0.5319, + "step": 811 + }, + { + "epoch": 0.02, + "grad_norm": 2.533808241809668, + "learning_rate": 7.845410628019325e-06, + "loss": 0.5671, + "step": 812 + }, + { + "epoch": 0.02, + "grad_norm": 1.2670460239178156, + "learning_rate": 7.855072463768117e-06, + "loss": 0.545, + "step": 813 + }, + { + "epoch": 0.02, + "grad_norm": 15.785804459841206, + "learning_rate": 7.864734299516909e-06, + "loss": 0.5448, + "step": 814 + }, + { + "epoch": 0.02, + "grad_norm": 2.8377522488198514, + "learning_rate": 7.874396135265701e-06, + "loss": 0.5676, + "step": 815 + }, + { + "epoch": 0.02, + "grad_norm": 2.5866341661904326, + "learning_rate": 7.884057971014493e-06, + "loss": 0.5786, + "step": 816 + }, + { + "epoch": 0.02, + "grad_norm": 1.1663429444877749, + "learning_rate": 7.893719806763285e-06, + "loss": 0.5652, + "step": 817 + }, + { + "epoch": 0.02, + "grad_norm": 1.2188261455078482, + "learning_rate": 7.903381642512078e-06, + "loss": 0.5613, + "step": 818 + }, + { + "epoch": 0.02, + "grad_norm": 2.558591908522754, + "learning_rate": 7.91304347826087e-06, + "loss": 0.576, + "step": 819 + }, + { + "epoch": 0.02, + "grad_norm": 2.457285654107997, + "learning_rate": 7.922705314009662e-06, + "loss": 0.5709, + "step": 820 + }, + { + "epoch": 0.02, + "grad_norm": 2.127618192290175, + "learning_rate": 7.932367149758454e-06, + "loss": 0.5762, + "step": 821 + }, + { + "epoch": 0.02, + "grad_norm": 1.2174358935187346, + "learning_rate": 7.942028985507248e-06, + "loss": 0.5334, + "step": 822 + }, + { + "epoch": 0.02, + "grad_norm": 3.157587092528467, + "learning_rate": 7.95169082125604e-06, + "loss": 0.5663, + "step": 823 + }, + { + "epoch": 0.02, + "grad_norm": 2.333866395811985, + "learning_rate": 7.961352657004831e-06, + "loss": 0.5449, + "step": 824 + }, + { + "epoch": 0.02, + "grad_norm": 2.2242480355429297, + "learning_rate": 7.971014492753623e-06, + "loss": 0.6143, + "step": 825 + }, + { + "epoch": 0.02, + "grad_norm": 2.7835161401125412, + "learning_rate": 7.980676328502415e-06, + "loss": 0.5983, + "step": 826 + }, + { + "epoch": 0.02, + "grad_norm": 2.414438633512525, + "learning_rate": 7.990338164251207e-06, + "loss": 0.5629, + "step": 827 + }, + { + "epoch": 0.02, + "grad_norm": 2.520230851492475, + "learning_rate": 8.000000000000001e-06, + "loss": 0.5379, + "step": 828 + }, + { + "epoch": 0.02, + "grad_norm": 2.178275583905771, + "learning_rate": 8.009661835748794e-06, + "loss": 0.5255, + "step": 829 + }, + { + "epoch": 0.02, + "grad_norm": 2.4360684450489614, + "learning_rate": 8.019323671497586e-06, + "loss": 0.532, + "step": 830 + }, + { + "epoch": 0.02, + "grad_norm": 2.34525246609181, + "learning_rate": 8.028985507246378e-06, + "loss": 0.5641, + "step": 831 + }, + { + "epoch": 0.02, + "grad_norm": 2.280248857185107, + "learning_rate": 8.03864734299517e-06, + "loss": 0.5196, + "step": 832 + }, + { + "epoch": 0.02, + "grad_norm": 2.188824277322615, + "learning_rate": 8.048309178743962e-06, + "loss": 0.5201, + "step": 833 + }, + { + "epoch": 0.02, + "grad_norm": 2.164003281934054, + "learning_rate": 8.057971014492754e-06, + "loss": 0.5042, + "step": 834 + }, + { + "epoch": 0.02, + "grad_norm": 2.3814269167550246, + "learning_rate": 8.067632850241547e-06, + "loss": 0.6172, + "step": 835 + }, + { + "epoch": 0.02, + "grad_norm": 2.900253176183805, + "learning_rate": 8.077294685990339e-06, + "loss": 0.5571, + "step": 836 + }, + { + "epoch": 0.02, + "grad_norm": 2.162532781738342, + "learning_rate": 8.086956521739131e-06, + "loss": 0.5035, + "step": 837 + }, + { + "epoch": 0.02, + "grad_norm": 2.0861443852847867, + "learning_rate": 8.096618357487923e-06, + "loss": 0.5895, + "step": 838 + }, + { + "epoch": 0.02, + "grad_norm": 2.2830073139210403, + "learning_rate": 8.106280193236715e-06, + "loss": 0.5434, + "step": 839 + }, + { + "epoch": 0.02, + "grad_norm": 2.1245103919032413, + "learning_rate": 8.115942028985508e-06, + "loss": 0.555, + "step": 840 + }, + { + "epoch": 0.02, + "grad_norm": 1.3699297535855348, + "learning_rate": 8.1256038647343e-06, + "loss": 0.5425, + "step": 841 + }, + { + "epoch": 0.02, + "grad_norm": 5.287639149504299, + "learning_rate": 8.135265700483092e-06, + "loss": 0.537, + "step": 842 + }, + { + "epoch": 0.02, + "grad_norm": 2.6269681739028865, + "learning_rate": 8.144927536231884e-06, + "loss": 0.5944, + "step": 843 + }, + { + "epoch": 0.02, + "grad_norm": 2.242002835258647, + "learning_rate": 8.154589371980676e-06, + "loss": 0.5044, + "step": 844 + }, + { + "epoch": 0.02, + "grad_norm": 2.036783204485323, + "learning_rate": 8.164251207729469e-06, + "loss": 0.5162, + "step": 845 + }, + { + "epoch": 0.02, + "grad_norm": 2.192838129395925, + "learning_rate": 8.173913043478263e-06, + "loss": 0.5421, + "step": 846 + }, + { + "epoch": 0.02, + "grad_norm": 2.474582358705798, + "learning_rate": 8.183574879227055e-06, + "loss": 0.5504, + "step": 847 + }, + { + "epoch": 0.02, + "grad_norm": 2.2817513128900644, + "learning_rate": 8.193236714975847e-06, + "loss": 0.5286, + "step": 848 + }, + { + "epoch": 0.02, + "grad_norm": 2.3320922425141117, + "learning_rate": 8.202898550724639e-06, + "loss": 0.527, + "step": 849 + }, + { + "epoch": 0.02, + "grad_norm": 2.0825854391218988, + "learning_rate": 8.212560386473431e-06, + "loss": 0.5561, + "step": 850 + }, + { + "epoch": 0.02, + "grad_norm": 2.221710547726399, + "learning_rate": 8.222222222222222e-06, + "loss": 0.5413, + "step": 851 + }, + { + "epoch": 0.02, + "grad_norm": 3.808258408787786, + "learning_rate": 8.231884057971014e-06, + "loss": 0.5649, + "step": 852 + }, + { + "epoch": 0.02, + "grad_norm": 2.4720999406882123, + "learning_rate": 8.241545893719808e-06, + "loss": 0.5505, + "step": 853 + }, + { + "epoch": 0.02, + "grad_norm": 3.0583700225216437, + "learning_rate": 8.2512077294686e-06, + "loss": 0.5283, + "step": 854 + }, + { + "epoch": 0.02, + "grad_norm": 2.0545321271545145, + "learning_rate": 8.260869565217392e-06, + "loss": 0.546, + "step": 855 + }, + { + "epoch": 0.02, + "grad_norm": 1.9391249023615478, + "learning_rate": 8.270531400966184e-06, + "loss": 0.5404, + "step": 856 + }, + { + "epoch": 0.02, + "grad_norm": 2.123899508911802, + "learning_rate": 8.280193236714977e-06, + "loss": 0.5473, + "step": 857 + }, + { + "epoch": 0.02, + "grad_norm": 2.3189065931096113, + "learning_rate": 8.289855072463769e-06, + "loss": 0.5778, + "step": 858 + }, + { + "epoch": 0.02, + "grad_norm": 2.107003657487457, + "learning_rate": 8.299516908212561e-06, + "loss": 0.5485, + "step": 859 + }, + { + "epoch": 0.02, + "grad_norm": 2.1902299135167365, + "learning_rate": 8.309178743961353e-06, + "loss": 0.5905, + "step": 860 + }, + { + "epoch": 0.02, + "grad_norm": 2.4818695234464623, + "learning_rate": 8.318840579710145e-06, + "loss": 0.5372, + "step": 861 + }, + { + "epoch": 0.03, + "grad_norm": 2.348849991000941, + "learning_rate": 8.328502415458938e-06, + "loss": 0.6066, + "step": 862 + }, + { + "epoch": 0.03, + "grad_norm": 2.2042071905185208, + "learning_rate": 8.33816425120773e-06, + "loss": 0.53, + "step": 863 + }, + { + "epoch": 0.03, + "grad_norm": 2.2573153871636067, + "learning_rate": 8.347826086956522e-06, + "loss": 0.5147, + "step": 864 + }, + { + "epoch": 0.03, + "grad_norm": 2.202892330595367, + "learning_rate": 8.357487922705316e-06, + "loss": 0.6055, + "step": 865 + }, + { + "epoch": 0.03, + "grad_norm": 2.0462771213929734, + "learning_rate": 8.367149758454108e-06, + "loss": 0.5511, + "step": 866 + }, + { + "epoch": 0.03, + "grad_norm": 1.9143657452856582, + "learning_rate": 8.376811594202899e-06, + "loss": 0.5074, + "step": 867 + }, + { + "epoch": 0.03, + "grad_norm": 3.0543600870612777, + "learning_rate": 8.38647342995169e-06, + "loss": 0.59, + "step": 868 + }, + { + "epoch": 0.03, + "grad_norm": 2.0578215284669104, + "learning_rate": 8.396135265700483e-06, + "loss": 0.5306, + "step": 869 + }, + { + "epoch": 0.03, + "grad_norm": 1.9953586923582742, + "learning_rate": 8.405797101449275e-06, + "loss": 0.5281, + "step": 870 + }, + { + "epoch": 0.03, + "grad_norm": 1.9746607465089316, + "learning_rate": 8.415458937198069e-06, + "loss": 0.5246, + "step": 871 + }, + { + "epoch": 0.03, + "grad_norm": 1.97758719489782, + "learning_rate": 8.425120772946861e-06, + "loss": 0.5333, + "step": 872 + }, + { + "epoch": 0.03, + "grad_norm": 2.0758905311642213, + "learning_rate": 8.434782608695653e-06, + "loss": 0.5933, + "step": 873 + }, + { + "epoch": 0.03, + "grad_norm": 1.9978082825990882, + "learning_rate": 8.444444444444446e-06, + "loss": 0.559, + "step": 874 + }, + { + "epoch": 0.03, + "grad_norm": 2.0619571425004226, + "learning_rate": 8.454106280193238e-06, + "loss": 0.5714, + "step": 875 + }, + { + "epoch": 0.03, + "grad_norm": 2.1653352143031492, + "learning_rate": 8.46376811594203e-06, + "loss": 0.5257, + "step": 876 + }, + { + "epoch": 0.03, + "grad_norm": 1.5543260333810156, + "learning_rate": 8.473429951690822e-06, + "loss": 0.5583, + "step": 877 + }, + { + "epoch": 0.03, + "grad_norm": 4.484747575089779, + "learning_rate": 8.483091787439614e-06, + "loss": 0.5496, + "step": 878 + }, + { + "epoch": 0.03, + "grad_norm": 2.320168598689743, + "learning_rate": 8.492753623188407e-06, + "loss": 0.5464, + "step": 879 + }, + { + "epoch": 0.03, + "grad_norm": 2.2163987034523767, + "learning_rate": 8.502415458937199e-06, + "loss": 0.5664, + "step": 880 + }, + { + "epoch": 0.03, + "grad_norm": 1.8871822876691982, + "learning_rate": 8.512077294685991e-06, + "loss": 0.5504, + "step": 881 + }, + { + "epoch": 0.03, + "grad_norm": 2.0528712807462943, + "learning_rate": 8.521739130434783e-06, + "loss": 0.5675, + "step": 882 + }, + { + "epoch": 0.03, + "grad_norm": 2.3954966874717654, + "learning_rate": 8.531400966183575e-06, + "loss": 0.5131, + "step": 883 + }, + { + "epoch": 0.03, + "grad_norm": 2.1292301308696913, + "learning_rate": 8.541062801932368e-06, + "loss": 0.5651, + "step": 884 + }, + { + "epoch": 0.03, + "grad_norm": 2.04805492621656, + "learning_rate": 8.55072463768116e-06, + "loss": 0.5238, + "step": 885 + }, + { + "epoch": 0.03, + "grad_norm": 2.1812490130953037, + "learning_rate": 8.560386473429952e-06, + "loss": 0.5674, + "step": 886 + }, + { + "epoch": 0.03, + "grad_norm": 2.314282699562947, + "learning_rate": 8.570048309178744e-06, + "loss": 0.5377, + "step": 887 + }, + { + "epoch": 0.03, + "grad_norm": 2.1150888187270565, + "learning_rate": 8.579710144927536e-06, + "loss": 0.5272, + "step": 888 + }, + { + "epoch": 0.03, + "grad_norm": 2.1028319227388637, + "learning_rate": 8.589371980676329e-06, + "loss": 0.5316, + "step": 889 + }, + { + "epoch": 0.03, + "grad_norm": 2.4313706266029245, + "learning_rate": 8.599033816425122e-06, + "loss": 0.5695, + "step": 890 + }, + { + "epoch": 0.03, + "grad_norm": 3.156853778587398, + "learning_rate": 8.608695652173915e-06, + "loss": 0.5347, + "step": 891 + }, + { + "epoch": 0.03, + "grad_norm": 2.4879288661390557, + "learning_rate": 8.618357487922707e-06, + "loss": 0.5487, + "step": 892 + }, + { + "epoch": 0.03, + "grad_norm": 2.204286337216785, + "learning_rate": 8.628019323671497e-06, + "loss": 0.5566, + "step": 893 + }, + { + "epoch": 0.03, + "grad_norm": 2.162411165701868, + "learning_rate": 8.63768115942029e-06, + "loss": 0.5764, + "step": 894 + }, + { + "epoch": 0.03, + "grad_norm": 2.1105750653620983, + "learning_rate": 8.647342995169082e-06, + "loss": 0.5429, + "step": 895 + }, + { + "epoch": 0.03, + "grad_norm": 2.749128967176161, + "learning_rate": 8.657004830917876e-06, + "loss": 0.5517, + "step": 896 + }, + { + "epoch": 0.03, + "grad_norm": 2.437056815920807, + "learning_rate": 8.666666666666668e-06, + "loss": 0.6075, + "step": 897 + }, + { + "epoch": 0.03, + "grad_norm": 2.527861418392224, + "learning_rate": 8.67632850241546e-06, + "loss": 0.5723, + "step": 898 + }, + { + "epoch": 0.03, + "grad_norm": 2.0607461651189256, + "learning_rate": 8.685990338164252e-06, + "loss": 0.5231, + "step": 899 + }, + { + "epoch": 0.03, + "grad_norm": 2.394574818011686, + "learning_rate": 8.695652173913044e-06, + "loss": 0.5122, + "step": 900 + }, + { + "epoch": 0.03, + "grad_norm": 2.587321954477451, + "learning_rate": 8.705314009661837e-06, + "loss": 0.5145, + "step": 901 + }, + { + "epoch": 0.03, + "grad_norm": 1.9599707044921038, + "learning_rate": 8.714975845410629e-06, + "loss": 0.494, + "step": 902 + }, + { + "epoch": 0.03, + "grad_norm": 2.535712092239288, + "learning_rate": 8.724637681159421e-06, + "loss": 0.5695, + "step": 903 + }, + { + "epoch": 0.03, + "grad_norm": 2.1682059793720394, + "learning_rate": 8.734299516908213e-06, + "loss": 0.544, + "step": 904 + }, + { + "epoch": 0.03, + "grad_norm": 2.297121009916737, + "learning_rate": 8.743961352657005e-06, + "loss": 0.5599, + "step": 905 + }, + { + "epoch": 0.03, + "grad_norm": 2.134395412218305, + "learning_rate": 8.753623188405798e-06, + "loss": 0.5236, + "step": 906 + }, + { + "epoch": 0.03, + "grad_norm": 2.288810192232918, + "learning_rate": 8.76328502415459e-06, + "loss": 0.5643, + "step": 907 + }, + { + "epoch": 0.03, + "grad_norm": 2.1811834077909014, + "learning_rate": 8.772946859903384e-06, + "loss": 0.5567, + "step": 908 + }, + { + "epoch": 0.03, + "grad_norm": 2.0167957503614544, + "learning_rate": 8.782608695652174e-06, + "loss": 0.5393, + "step": 909 + }, + { + "epoch": 0.03, + "grad_norm": 2.21441550071937, + "learning_rate": 8.792270531400966e-06, + "loss": 0.533, + "step": 910 + }, + { + "epoch": 0.03, + "grad_norm": 2.1344835347136484, + "learning_rate": 8.801932367149759e-06, + "loss": 0.5649, + "step": 911 + }, + { + "epoch": 0.03, + "grad_norm": 1.7250433433848753, + "learning_rate": 8.81159420289855e-06, + "loss": 0.6353, + "step": 912 + }, + { + "epoch": 0.03, + "grad_norm": 2.257650302608693, + "learning_rate": 8.821256038647343e-06, + "loss": 0.567, + "step": 913 + }, + { + "epoch": 0.03, + "grad_norm": 1.2155916348301055, + "learning_rate": 8.830917874396135e-06, + "loss": 0.5907, + "step": 914 + }, + { + "epoch": 0.03, + "grad_norm": 2.527552718019071, + "learning_rate": 8.840579710144929e-06, + "loss": 0.5437, + "step": 915 + }, + { + "epoch": 0.03, + "grad_norm": 2.8358404447719443, + "learning_rate": 8.850241545893721e-06, + "loss": 0.5707, + "step": 916 + }, + { + "epoch": 0.03, + "grad_norm": 1.3187653805995865, + "learning_rate": 8.859903381642513e-06, + "loss": 0.549, + "step": 917 + }, + { + "epoch": 0.03, + "grad_norm": 2.201476233730597, + "learning_rate": 8.869565217391306e-06, + "loss": 0.55, + "step": 918 + }, + { + "epoch": 0.03, + "grad_norm": 2.3320210712746414, + "learning_rate": 8.879227053140098e-06, + "loss": 0.5316, + "step": 919 + }, + { + "epoch": 0.03, + "grad_norm": 2.071890154129608, + "learning_rate": 8.888888888888888e-06, + "loss": 0.5239, + "step": 920 + }, + { + "epoch": 0.03, + "grad_norm": 2.6548872857829506, + "learning_rate": 8.898550724637682e-06, + "loss": 0.5515, + "step": 921 + }, + { + "epoch": 0.03, + "grad_norm": 2.1941983634997864, + "learning_rate": 8.908212560386474e-06, + "loss": 0.5655, + "step": 922 + }, + { + "epoch": 0.03, + "grad_norm": 2.3352817591983617, + "learning_rate": 8.917874396135267e-06, + "loss": 0.5274, + "step": 923 + }, + { + "epoch": 0.03, + "grad_norm": 2.6293905067036243, + "learning_rate": 8.927536231884059e-06, + "loss": 0.5615, + "step": 924 + }, + { + "epoch": 0.03, + "grad_norm": 2.5778498943234975, + "learning_rate": 8.937198067632851e-06, + "loss": 0.5673, + "step": 925 + }, + { + "epoch": 0.03, + "grad_norm": 1.8514560817932075, + "learning_rate": 8.946859903381643e-06, + "loss": 0.5851, + "step": 926 + }, + { + "epoch": 0.03, + "grad_norm": 2.3241311609969406, + "learning_rate": 8.956521739130435e-06, + "loss": 0.5983, + "step": 927 + }, + { + "epoch": 0.03, + "grad_norm": 2.8107987158274454, + "learning_rate": 8.966183574879228e-06, + "loss": 0.5315, + "step": 928 + }, + { + "epoch": 0.03, + "grad_norm": 2.213545198644213, + "learning_rate": 8.97584541062802e-06, + "loss": 0.5444, + "step": 929 + }, + { + "epoch": 0.03, + "grad_norm": 2.4810464585679477, + "learning_rate": 8.985507246376812e-06, + "loss": 0.5693, + "step": 930 + }, + { + "epoch": 0.03, + "grad_norm": 1.9429111468225337, + "learning_rate": 8.995169082125604e-06, + "loss": 0.535, + "step": 931 + }, + { + "epoch": 0.03, + "grad_norm": 2.147990122266563, + "learning_rate": 9.004830917874396e-06, + "loss": 0.5402, + "step": 932 + }, + { + "epoch": 0.03, + "grad_norm": 2.4364861157398527, + "learning_rate": 9.01449275362319e-06, + "loss": 0.5204, + "step": 933 + }, + { + "epoch": 0.03, + "grad_norm": 2.034888737799752, + "learning_rate": 9.024154589371982e-06, + "loss": 0.5582, + "step": 934 + }, + { + "epoch": 0.03, + "grad_norm": 2.2749258174752605, + "learning_rate": 9.033816425120775e-06, + "loss": 0.5746, + "step": 935 + }, + { + "epoch": 0.03, + "grad_norm": 2.2226057201287754, + "learning_rate": 9.043478260869565e-06, + "loss": 0.5591, + "step": 936 + }, + { + "epoch": 0.03, + "grad_norm": 2.0271219787489785, + "learning_rate": 9.053140096618357e-06, + "loss": 0.5598, + "step": 937 + }, + { + "epoch": 0.03, + "grad_norm": 2.1797858181452896, + "learning_rate": 9.06280193236715e-06, + "loss": 0.511, + "step": 938 + }, + { + "epoch": 0.03, + "grad_norm": 2.1212649675085755, + "learning_rate": 9.072463768115943e-06, + "loss": 0.5389, + "step": 939 + }, + { + "epoch": 0.03, + "grad_norm": 3.174712641845075, + "learning_rate": 9.082125603864736e-06, + "loss": 0.552, + "step": 940 + }, + { + "epoch": 0.03, + "grad_norm": 2.180694597162045, + "learning_rate": 9.091787439613528e-06, + "loss": 0.5278, + "step": 941 + }, + { + "epoch": 0.03, + "grad_norm": 4.2586960924999415, + "learning_rate": 9.10144927536232e-06, + "loss": 0.5687, + "step": 942 + }, + { + "epoch": 0.03, + "grad_norm": 2.0784673965635507, + "learning_rate": 9.111111111111112e-06, + "loss": 0.5196, + "step": 943 + }, + { + "epoch": 0.03, + "grad_norm": 2.1076627043699765, + "learning_rate": 9.120772946859904e-06, + "loss": 0.5821, + "step": 944 + }, + { + "epoch": 0.03, + "grad_norm": 2.27553444447076, + "learning_rate": 9.130434782608697e-06, + "loss": 0.5379, + "step": 945 + }, + { + "epoch": 0.03, + "grad_norm": 2.708004015509368, + "learning_rate": 9.140096618357489e-06, + "loss": 0.6157, + "step": 946 + }, + { + "epoch": 0.03, + "grad_norm": 2.2638182487144443, + "learning_rate": 9.149758454106281e-06, + "loss": 0.5692, + "step": 947 + }, + { + "epoch": 0.03, + "grad_norm": 2.6249665017667616, + "learning_rate": 9.159420289855073e-06, + "loss": 0.512, + "step": 948 + }, + { + "epoch": 0.03, + "grad_norm": 2.2810472536394117, + "learning_rate": 9.169082125603865e-06, + "loss": 0.5515, + "step": 949 + }, + { + "epoch": 0.03, + "grad_norm": 2.0792815913137876, + "learning_rate": 9.178743961352658e-06, + "loss": 0.5336, + "step": 950 + }, + { + "epoch": 0.03, + "grad_norm": 1.9308167090886488, + "learning_rate": 9.18840579710145e-06, + "loss": 0.4969, + "step": 951 + }, + { + "epoch": 0.03, + "grad_norm": 2.031302555138775, + "learning_rate": 9.198067632850242e-06, + "loss": 0.5433, + "step": 952 + }, + { + "epoch": 0.03, + "grad_norm": 2.0516801017253727, + "learning_rate": 9.207729468599034e-06, + "loss": 0.5223, + "step": 953 + }, + { + "epoch": 0.03, + "grad_norm": 2.2432174288557407, + "learning_rate": 9.217391304347826e-06, + "loss": 0.5057, + "step": 954 + }, + { + "epoch": 0.03, + "grad_norm": 2.2399001136774572, + "learning_rate": 9.227053140096618e-06, + "loss": 0.5381, + "step": 955 + }, + { + "epoch": 0.03, + "grad_norm": 2.034853895911741, + "learning_rate": 9.23671497584541e-06, + "loss": 0.5127, + "step": 956 + }, + { + "epoch": 0.03, + "grad_norm": 2.1120083443994337, + "learning_rate": 9.246376811594203e-06, + "loss": 0.5519, + "step": 957 + }, + { + "epoch": 0.03, + "grad_norm": 3.728710282532445, + "learning_rate": 9.256038647342997e-06, + "loss": 0.5158, + "step": 958 + }, + { + "epoch": 0.03, + "grad_norm": 2.0646415129850615, + "learning_rate": 9.265700483091789e-06, + "loss": 0.5355, + "step": 959 + }, + { + "epoch": 0.03, + "grad_norm": 2.2081596769122385, + "learning_rate": 9.275362318840581e-06, + "loss": 0.504, + "step": 960 + }, + { + "epoch": 0.03, + "grad_norm": 2.42275835993863, + "learning_rate": 9.285024154589373e-06, + "loss": 0.5221, + "step": 961 + }, + { + "epoch": 0.03, + "grad_norm": 2.125838927334284, + "learning_rate": 9.294685990338166e-06, + "loss": 0.5212, + "step": 962 + }, + { + "epoch": 0.03, + "grad_norm": 1.9834390653702056, + "learning_rate": 9.304347826086956e-06, + "loss": 0.4918, + "step": 963 + }, + { + "epoch": 0.03, + "grad_norm": 2.036214848058864, + "learning_rate": 9.31400966183575e-06, + "loss": 0.5385, + "step": 964 + }, + { + "epoch": 0.03, + "grad_norm": 2.088400597597943, + "learning_rate": 9.323671497584542e-06, + "loss": 0.5142, + "step": 965 + }, + { + "epoch": 0.03, + "grad_norm": 2.2187080120773808, + "learning_rate": 9.333333333333334e-06, + "loss": 0.571, + "step": 966 + }, + { + "epoch": 0.03, + "grad_norm": 2.1303630305630112, + "learning_rate": 9.342995169082127e-06, + "loss": 0.4939, + "step": 967 + }, + { + "epoch": 0.03, + "grad_norm": 2.128602772189002, + "learning_rate": 9.352657004830919e-06, + "loss": 0.5136, + "step": 968 + }, + { + "epoch": 0.03, + "grad_norm": 1.9640762695083183, + "learning_rate": 9.362318840579711e-06, + "loss": 0.5428, + "step": 969 + }, + { + "epoch": 0.03, + "grad_norm": 2.693731458941723, + "learning_rate": 9.371980676328503e-06, + "loss": 0.4822, + "step": 970 + }, + { + "epoch": 0.03, + "grad_norm": 2.5143253284599854, + "learning_rate": 9.381642512077295e-06, + "loss": 0.4891, + "step": 971 + }, + { + "epoch": 0.03, + "grad_norm": 2.208931858119794, + "learning_rate": 9.391304347826087e-06, + "loss": 0.5624, + "step": 972 + }, + { + "epoch": 0.03, + "grad_norm": 2.0962171735444777, + "learning_rate": 9.40096618357488e-06, + "loss": 0.5199, + "step": 973 + }, + { + "epoch": 0.03, + "grad_norm": 2.153623891692663, + "learning_rate": 9.410628019323672e-06, + "loss": 0.5234, + "step": 974 + }, + { + "epoch": 0.03, + "grad_norm": 2.2645844890188314, + "learning_rate": 9.420289855072464e-06, + "loss": 0.5693, + "step": 975 + }, + { + "epoch": 0.03, + "grad_norm": 2.080308022180058, + "learning_rate": 9.429951690821256e-06, + "loss": 0.5183, + "step": 976 + }, + { + "epoch": 0.03, + "grad_norm": 2.364465149268429, + "learning_rate": 9.43961352657005e-06, + "loss": 0.5382, + "step": 977 + }, + { + "epoch": 0.03, + "grad_norm": 2.391128424009843, + "learning_rate": 9.449275362318842e-06, + "loss": 0.5252, + "step": 978 + }, + { + "epoch": 0.03, + "grad_norm": 2.583277977152929, + "learning_rate": 9.458937198067633e-06, + "loss": 0.5165, + "step": 979 + }, + { + "epoch": 0.03, + "grad_norm": 2.087598047656671, + "learning_rate": 9.468599033816425e-06, + "loss": 0.5404, + "step": 980 + }, + { + "epoch": 0.03, + "grad_norm": 2.652455867407948, + "learning_rate": 9.478260869565217e-06, + "loss": 0.5123, + "step": 981 + }, + { + "epoch": 0.03, + "grad_norm": 2.5479026183578073, + "learning_rate": 9.48792270531401e-06, + "loss": 0.5628, + "step": 982 + }, + { + "epoch": 0.03, + "grad_norm": 3.4458731876502378, + "learning_rate": 9.497584541062803e-06, + "loss": 0.5535, + "step": 983 + }, + { + "epoch": 0.03, + "grad_norm": 2.274396300602917, + "learning_rate": 9.507246376811596e-06, + "loss": 0.541, + "step": 984 + }, + { + "epoch": 0.03, + "grad_norm": 2.2131557399795185, + "learning_rate": 9.516908212560388e-06, + "loss": 0.5231, + "step": 985 + }, + { + "epoch": 0.03, + "grad_norm": 2.464312339601553, + "learning_rate": 9.52657004830918e-06, + "loss": 0.5967, + "step": 986 + }, + { + "epoch": 0.03, + "grad_norm": 2.174554004043785, + "learning_rate": 9.536231884057972e-06, + "loss": 0.5427, + "step": 987 + }, + { + "epoch": 0.03, + "grad_norm": 2.212053654699053, + "learning_rate": 9.545893719806764e-06, + "loss": 0.5263, + "step": 988 + }, + { + "epoch": 0.03, + "grad_norm": 2.166594277833534, + "learning_rate": 9.555555555555556e-06, + "loss": 0.5521, + "step": 989 + }, + { + "epoch": 0.03, + "grad_norm": 2.2180822126522606, + "learning_rate": 9.565217391304349e-06, + "loss": 0.5498, + "step": 990 + }, + { + "epoch": 0.03, + "grad_norm": 2.731395967914563, + "learning_rate": 9.574879227053141e-06, + "loss": 0.551, + "step": 991 + }, + { + "epoch": 0.03, + "grad_norm": 2.7979128182634807, + "learning_rate": 9.584541062801933e-06, + "loss": 0.5422, + "step": 992 + }, + { + "epoch": 0.03, + "grad_norm": 2.0312271797113834, + "learning_rate": 9.594202898550725e-06, + "loss": 0.4934, + "step": 993 + }, + { + "epoch": 0.03, + "grad_norm": 2.4605625327121836, + "learning_rate": 9.603864734299517e-06, + "loss": 0.5485, + "step": 994 + }, + { + "epoch": 0.03, + "grad_norm": 2.8417524117802975, + "learning_rate": 9.61352657004831e-06, + "loss": 0.5282, + "step": 995 + }, + { + "epoch": 0.03, + "grad_norm": 1.2628113148738684, + "learning_rate": 9.623188405797102e-06, + "loss": 0.5107, + "step": 996 + }, + { + "epoch": 0.03, + "grad_norm": 1.3030208102139957, + "learning_rate": 9.632850241545894e-06, + "loss": 0.4985, + "step": 997 + }, + { + "epoch": 0.03, + "grad_norm": 2.284076224339835, + "learning_rate": 9.642512077294686e-06, + "loss": 0.5272, + "step": 998 + }, + { + "epoch": 0.03, + "grad_norm": 2.2350805012127934, + "learning_rate": 9.652173913043478e-06, + "loss": 0.5801, + "step": 999 + }, + { + "epoch": 0.03, + "grad_norm": 2.2511018245569097, + "learning_rate": 9.66183574879227e-06, + "loss": 0.5056, + "step": 1000 + }, + { + "epoch": 0.03, + "grad_norm": 2.374967925965416, + "learning_rate": 9.671497584541065e-06, + "loss": 0.5072, + "step": 1001 + }, + { + "epoch": 0.03, + "grad_norm": 2.3099365697387264, + "learning_rate": 9.681159420289857e-06, + "loss": 0.5243, + "step": 1002 + }, + { + "epoch": 0.03, + "grad_norm": 2.5235477942678233, + "learning_rate": 9.690821256038649e-06, + "loss": 0.6158, + "step": 1003 + }, + { + "epoch": 0.03, + "grad_norm": 2.1181088006745727, + "learning_rate": 9.700483091787441e-06, + "loss": 0.5162, + "step": 1004 + }, + { + "epoch": 0.03, + "grad_norm": 2.7002100936647273, + "learning_rate": 9.710144927536233e-06, + "loss": 0.5328, + "step": 1005 + }, + { + "epoch": 0.03, + "grad_norm": 2.1820292049214456, + "learning_rate": 9.719806763285024e-06, + "loss": 0.5057, + "step": 1006 + }, + { + "epoch": 0.03, + "grad_norm": 2.418296480834973, + "learning_rate": 9.729468599033816e-06, + "loss": 0.5194, + "step": 1007 + }, + { + "epoch": 0.03, + "grad_norm": 1.9636823673440575, + "learning_rate": 9.73913043478261e-06, + "loss": 0.5376, + "step": 1008 + }, + { + "epoch": 0.03, + "grad_norm": 2.397898388907613, + "learning_rate": 9.748792270531402e-06, + "loss": 0.5742, + "step": 1009 + }, + { + "epoch": 0.03, + "grad_norm": 2.371229683330484, + "learning_rate": 9.758454106280194e-06, + "loss": 0.5392, + "step": 1010 + }, + { + "epoch": 0.03, + "grad_norm": 1.8444828156726316, + "learning_rate": 9.768115942028986e-06, + "loss": 0.5088, + "step": 1011 + }, + { + "epoch": 0.03, + "grad_norm": 2.072577330007912, + "learning_rate": 9.777777777777779e-06, + "loss": 0.5512, + "step": 1012 + }, + { + "epoch": 0.03, + "grad_norm": 3.206146371011565, + "learning_rate": 9.787439613526571e-06, + "loss": 0.669, + "step": 1013 + }, + { + "epoch": 0.03, + "grad_norm": 3.6291479330945493, + "learning_rate": 9.797101449275363e-06, + "loss": 0.5371, + "step": 1014 + }, + { + "epoch": 0.03, + "grad_norm": 2.1044789387547667, + "learning_rate": 9.806763285024155e-06, + "loss": 0.5351, + "step": 1015 + }, + { + "epoch": 0.03, + "grad_norm": 2.432203937777234, + "learning_rate": 9.816425120772947e-06, + "loss": 0.5301, + "step": 1016 + }, + { + "epoch": 0.03, + "grad_norm": 2.1149784718792737, + "learning_rate": 9.82608695652174e-06, + "loss": 0.5539, + "step": 1017 + }, + { + "epoch": 0.03, + "grad_norm": 1.8058306603691419, + "learning_rate": 9.835748792270532e-06, + "loss": 0.5483, + "step": 1018 + }, + { + "epoch": 0.03, + "grad_norm": 2.053737699236065, + "learning_rate": 9.845410628019324e-06, + "loss": 0.5237, + "step": 1019 + }, + { + "epoch": 0.03, + "grad_norm": 2.9280759650190684, + "learning_rate": 9.855072463768118e-06, + "loss": 0.5295, + "step": 1020 + }, + { + "epoch": 0.03, + "grad_norm": 3.3525943052571683, + "learning_rate": 9.86473429951691e-06, + "loss": 0.5436, + "step": 1021 + }, + { + "epoch": 0.03, + "grad_norm": 2.000461108858767, + "learning_rate": 9.8743961352657e-06, + "loss": 0.5564, + "step": 1022 + }, + { + "epoch": 0.03, + "grad_norm": 1.8405553539953476, + "learning_rate": 9.884057971014493e-06, + "loss": 0.5563, + "step": 1023 + }, + { + "epoch": 0.03, + "grad_norm": 2.1181589337489815, + "learning_rate": 9.893719806763285e-06, + "loss": 0.6218, + "step": 1024 + }, + { + "epoch": 0.03, + "grad_norm": 2.088010835931957, + "learning_rate": 9.903381642512077e-06, + "loss": 0.5067, + "step": 1025 + }, + { + "epoch": 0.03, + "grad_norm": 2.4034934292281624, + "learning_rate": 9.913043478260871e-06, + "loss": 0.5144, + "step": 1026 + }, + { + "epoch": 0.03, + "grad_norm": 2.102414706457093, + "learning_rate": 9.922705314009663e-06, + "loss": 0.5394, + "step": 1027 + }, + { + "epoch": 0.03, + "grad_norm": 2.2567627818087685, + "learning_rate": 9.932367149758455e-06, + "loss": 0.5457, + "step": 1028 + }, + { + "epoch": 0.03, + "grad_norm": 2.1964155406446024, + "learning_rate": 9.942028985507248e-06, + "loss": 0.5153, + "step": 1029 + }, + { + "epoch": 0.03, + "grad_norm": 1.9530767574697978, + "learning_rate": 9.95169082125604e-06, + "loss": 0.5311, + "step": 1030 + }, + { + "epoch": 0.03, + "grad_norm": 2.0564129935390816, + "learning_rate": 9.961352657004832e-06, + "loss": 0.5583, + "step": 1031 + }, + { + "epoch": 0.03, + "grad_norm": 2.464753305370668, + "learning_rate": 9.971014492753624e-06, + "loss": 0.59, + "step": 1032 + }, + { + "epoch": 0.03, + "grad_norm": 2.206222842435782, + "learning_rate": 9.980676328502416e-06, + "loss": 0.5482, + "step": 1033 + }, + { + "epoch": 0.03, + "grad_norm": 2.164006709526497, + "learning_rate": 9.990338164251209e-06, + "loss": 0.5623, + "step": 1034 + }, + { + "epoch": 0.03, + "grad_norm": 2.2737868919552713, + "learning_rate": 1e-05, + "loss": 0.4991, + "step": 1035 + }, + { + "epoch": 0.03, + "grad_norm": 1.9784904239417131, + "learning_rate": 9.999999977937474e-06, + "loss": 0.5047, + "step": 1036 + }, + { + "epoch": 0.03, + "grad_norm": 1.9722719015742753, + "learning_rate": 9.99999991174989e-06, + "loss": 0.5187, + "step": 1037 + }, + { + "epoch": 0.03, + "grad_norm": 1.9176724597820876, + "learning_rate": 9.999999801437252e-06, + "loss": 0.5682, + "step": 1038 + }, + { + "epoch": 0.03, + "grad_norm": 1.9326351292016015, + "learning_rate": 9.99999964699956e-06, + "loss": 0.5229, + "step": 1039 + }, + { + "epoch": 0.03, + "grad_norm": 1.9776649487541307, + "learning_rate": 9.999999448436816e-06, + "loss": 0.4893, + "step": 1040 + }, + { + "epoch": 0.03, + "grad_norm": 2.1052683807422894, + "learning_rate": 9.999999205749022e-06, + "loss": 0.5003, + "step": 1041 + }, + { + "epoch": 0.03, + "grad_norm": 1.8687150001665, + "learning_rate": 9.99999891893618e-06, + "loss": 0.5614, + "step": 1042 + }, + { + "epoch": 0.03, + "grad_norm": 2.0209408520668073, + "learning_rate": 9.999998587998292e-06, + "loss": 0.539, + "step": 1043 + }, + { + "epoch": 0.03, + "grad_norm": 2.092626290821088, + "learning_rate": 9.99999821293536e-06, + "loss": 0.5445, + "step": 1044 + }, + { + "epoch": 0.03, + "grad_norm": 2.402905709572209, + "learning_rate": 9.999997793747388e-06, + "loss": 0.5595, + "step": 1045 + }, + { + "epoch": 0.03, + "grad_norm": 1.9029614623834343, + "learning_rate": 9.99999733043438e-06, + "loss": 0.4853, + "step": 1046 + }, + { + "epoch": 0.03, + "grad_norm": 2.114702816370528, + "learning_rate": 9.99999682299634e-06, + "loss": 0.5249, + "step": 1047 + }, + { + "epoch": 0.03, + "grad_norm": 1.9648197459625958, + "learning_rate": 9.999996271433275e-06, + "loss": 0.598, + "step": 1048 + }, + { + "epoch": 0.03, + "grad_norm": 2.111114715495133, + "learning_rate": 9.999995675745184e-06, + "loss": 0.5558, + "step": 1049 + }, + { + "epoch": 0.03, + "grad_norm": 2.1131646505901807, + "learning_rate": 9.999995035932078e-06, + "loss": 0.5143, + "step": 1050 + }, + { + "epoch": 0.03, + "grad_norm": 1.9321708013006595, + "learning_rate": 9.99999435199396e-06, + "loss": 0.5204, + "step": 1051 + }, + { + "epoch": 0.03, + "grad_norm": 2.0468263787795413, + "learning_rate": 9.999993623930837e-06, + "loss": 0.5102, + "step": 1052 + }, + { + "epoch": 0.03, + "grad_norm": 2.017744317315158, + "learning_rate": 9.999992851742712e-06, + "loss": 0.5544, + "step": 1053 + }, + { + "epoch": 0.03, + "grad_norm": 2.335665767523259, + "learning_rate": 9.999992035429598e-06, + "loss": 0.5341, + "step": 1054 + }, + { + "epoch": 0.03, + "grad_norm": 2.244592099635973, + "learning_rate": 9.999991174991497e-06, + "loss": 0.525, + "step": 1055 + }, + { + "epoch": 0.03, + "grad_norm": 1.939189481176987, + "learning_rate": 9.99999027042842e-06, + "loss": 0.5404, + "step": 1056 + }, + { + "epoch": 0.03, + "grad_norm": 1.9679529904955466, + "learning_rate": 9.999989321740372e-06, + "loss": 0.5072, + "step": 1057 + }, + { + "epoch": 0.03, + "grad_norm": 1.2960737806945244, + "learning_rate": 9.999988328927362e-06, + "loss": 0.5815, + "step": 1058 + }, + { + "epoch": 0.03, + "grad_norm": 1.937531659827897, + "learning_rate": 9.999987291989401e-06, + "loss": 0.5524, + "step": 1059 + }, + { + "epoch": 0.03, + "grad_norm": 1.9601796990196463, + "learning_rate": 9.999986210926495e-06, + "loss": 0.5219, + "step": 1060 + }, + { + "epoch": 0.03, + "grad_norm": 2.251886495418737, + "learning_rate": 9.999985085738657e-06, + "loss": 0.5376, + "step": 1061 + }, + { + "epoch": 0.03, + "grad_norm": 1.986706597370441, + "learning_rate": 9.999983916425894e-06, + "loss": 0.5389, + "step": 1062 + }, + { + "epoch": 0.03, + "grad_norm": 2.514452464547429, + "learning_rate": 9.999982702988218e-06, + "loss": 0.5047, + "step": 1063 + }, + { + "epoch": 0.03, + "grad_norm": 2.7569927591935857, + "learning_rate": 9.999981445425639e-06, + "loss": 0.5357, + "step": 1064 + }, + { + "epoch": 0.03, + "grad_norm": 1.853677798747448, + "learning_rate": 9.999980143738169e-06, + "loss": 0.5532, + "step": 1065 + }, + { + "epoch": 0.03, + "grad_norm": 1.9750786560500297, + "learning_rate": 9.999978797925818e-06, + "loss": 0.5365, + "step": 1066 + }, + { + "epoch": 0.03, + "grad_norm": 1.9370639185064042, + "learning_rate": 9.999977407988599e-06, + "loss": 0.4989, + "step": 1067 + }, + { + "epoch": 0.03, + "grad_norm": 2.0495089986145496, + "learning_rate": 9.999975973926524e-06, + "loss": 0.5343, + "step": 1068 + }, + { + "epoch": 0.03, + "grad_norm": 1.8966478867340064, + "learning_rate": 9.999974495739603e-06, + "loss": 0.508, + "step": 1069 + }, + { + "epoch": 0.03, + "grad_norm": 2.0043466079208625, + "learning_rate": 9.999972973427855e-06, + "loss": 0.4806, + "step": 1070 + }, + { + "epoch": 0.03, + "grad_norm": 2.92580857755538, + "learning_rate": 9.99997140699129e-06, + "loss": 0.4973, + "step": 1071 + }, + { + "epoch": 0.03, + "grad_norm": 2.1612051609405665, + "learning_rate": 9.999969796429921e-06, + "loss": 0.5068, + "step": 1072 + }, + { + "epoch": 0.03, + "grad_norm": 2.143569676425012, + "learning_rate": 9.99996814174376e-06, + "loss": 0.6188, + "step": 1073 + }, + { + "epoch": 0.03, + "grad_norm": 1.870087352757718, + "learning_rate": 9.99996644293283e-06, + "loss": 0.5049, + "step": 1074 + }, + { + "epoch": 0.03, + "grad_norm": 2.036628157721531, + "learning_rate": 9.999964699997137e-06, + "loss": 0.5283, + "step": 1075 + }, + { + "epoch": 0.03, + "grad_norm": 2.0403124698097694, + "learning_rate": 9.999962912936703e-06, + "loss": 0.5905, + "step": 1076 + }, + { + "epoch": 0.03, + "grad_norm": 1.7841959286295557, + "learning_rate": 9.999961081751538e-06, + "loss": 0.511, + "step": 1077 + }, + { + "epoch": 0.03, + "grad_norm": 1.8647668559641077, + "learning_rate": 9.999959206441662e-06, + "loss": 0.5253, + "step": 1078 + }, + { + "epoch": 0.03, + "grad_norm": 2.109164406843629, + "learning_rate": 9.99995728700709e-06, + "loss": 0.5339, + "step": 1079 + }, + { + "epoch": 0.03, + "grad_norm": 2.320161010583819, + "learning_rate": 9.999955323447841e-06, + "loss": 0.5087, + "step": 1080 + }, + { + "epoch": 0.03, + "grad_norm": 2.4356685646130956, + "learning_rate": 9.999953315763929e-06, + "loss": 0.5548, + "step": 1081 + }, + { + "epoch": 0.03, + "grad_norm": 2.1437069801319018, + "learning_rate": 9.999951263955375e-06, + "loss": 0.4837, + "step": 1082 + }, + { + "epoch": 0.03, + "grad_norm": 1.9797621585775684, + "learning_rate": 9.999949168022195e-06, + "loss": 0.4852, + "step": 1083 + }, + { + "epoch": 0.03, + "grad_norm": 2.082085961272968, + "learning_rate": 9.999947027964409e-06, + "loss": 0.5448, + "step": 1084 + }, + { + "epoch": 0.03, + "grad_norm": 2.0035739043904828, + "learning_rate": 9.999944843782034e-06, + "loss": 0.5148, + "step": 1085 + }, + { + "epoch": 0.03, + "grad_norm": 1.929556503569673, + "learning_rate": 9.99994261547509e-06, + "loss": 0.4922, + "step": 1086 + }, + { + "epoch": 0.03, + "grad_norm": 2.0421212505629485, + "learning_rate": 9.999940343043597e-06, + "loss": 0.5198, + "step": 1087 + }, + { + "epoch": 0.03, + "grad_norm": 2.000911492595907, + "learning_rate": 9.999938026487577e-06, + "loss": 0.4915, + "step": 1088 + }, + { + "epoch": 0.03, + "grad_norm": 2.0123327777442643, + "learning_rate": 9.999935665807046e-06, + "loss": 0.5419, + "step": 1089 + }, + { + "epoch": 0.03, + "grad_norm": 2.481543650395298, + "learning_rate": 9.999933261002028e-06, + "loss": 0.5592, + "step": 1090 + }, + { + "epoch": 0.03, + "grad_norm": 2.2693907363664034, + "learning_rate": 9.999930812072544e-06, + "loss": 0.4915, + "step": 1091 + }, + { + "epoch": 0.03, + "grad_norm": 1.588082451942196, + "learning_rate": 9.999928319018614e-06, + "loss": 0.6277, + "step": 1092 + }, + { + "epoch": 0.03, + "grad_norm": 2.137191560025375, + "learning_rate": 9.999925781840263e-06, + "loss": 0.5505, + "step": 1093 + }, + { + "epoch": 0.03, + "grad_norm": 2.4315547887639997, + "learning_rate": 9.999923200537509e-06, + "loss": 0.5655, + "step": 1094 + }, + { + "epoch": 0.03, + "grad_norm": 2.0568144343209456, + "learning_rate": 9.99992057511038e-06, + "loss": 0.5375, + "step": 1095 + }, + { + "epoch": 0.03, + "grad_norm": 2.0114278146599336, + "learning_rate": 9.999917905558894e-06, + "loss": 0.5217, + "step": 1096 + }, + { + "epoch": 0.03, + "grad_norm": 2.2095025817048937, + "learning_rate": 9.999915191883078e-06, + "loss": 0.5106, + "step": 1097 + }, + { + "epoch": 0.03, + "grad_norm": 2.086824306611745, + "learning_rate": 9.999912434082957e-06, + "loss": 0.4962, + "step": 1098 + }, + { + "epoch": 0.03, + "grad_norm": 2.070416558031631, + "learning_rate": 9.999909632158551e-06, + "loss": 0.5023, + "step": 1099 + }, + { + "epoch": 0.03, + "grad_norm": 2.18233664675088, + "learning_rate": 9.999906786109888e-06, + "loss": 0.5266, + "step": 1100 + }, + { + "epoch": 0.03, + "grad_norm": 2.07554861081762, + "learning_rate": 9.999903895936991e-06, + "loss": 0.5538, + "step": 1101 + }, + { + "epoch": 0.03, + "grad_norm": 1.990671678594112, + "learning_rate": 9.999900961639887e-06, + "loss": 0.525, + "step": 1102 + }, + { + "epoch": 0.03, + "grad_norm": 2.05234339087404, + "learning_rate": 9.999897983218602e-06, + "loss": 0.5073, + "step": 1103 + }, + { + "epoch": 0.03, + "grad_norm": 1.9044075358457537, + "learning_rate": 9.999894960673162e-06, + "loss": 0.5076, + "step": 1104 + }, + { + "epoch": 0.03, + "grad_norm": 2.0034529794314486, + "learning_rate": 9.999891894003593e-06, + "loss": 0.4866, + "step": 1105 + }, + { + "epoch": 0.03, + "grad_norm": 2.0578613715297003, + "learning_rate": 9.999888783209922e-06, + "loss": 0.5696, + "step": 1106 + }, + { + "epoch": 0.03, + "grad_norm": 2.1804903681996968, + "learning_rate": 9.999885628292179e-06, + "loss": 0.5605, + "step": 1107 + }, + { + "epoch": 0.03, + "grad_norm": 2.219382537296648, + "learning_rate": 9.999882429250387e-06, + "loss": 0.5453, + "step": 1108 + }, + { + "epoch": 0.03, + "grad_norm": 2.7521367941716446, + "learning_rate": 9.99987918608458e-06, + "loss": 0.5497, + "step": 1109 + }, + { + "epoch": 0.03, + "grad_norm": 2.040343587676756, + "learning_rate": 9.999875898794782e-06, + "loss": 0.5456, + "step": 1110 + }, + { + "epoch": 0.03, + "grad_norm": 2.3495302435662926, + "learning_rate": 9.999872567381024e-06, + "loss": 0.5174, + "step": 1111 + }, + { + "epoch": 0.03, + "grad_norm": 2.289430209590093, + "learning_rate": 9.999869191843336e-06, + "loss": 0.4973, + "step": 1112 + }, + { + "epoch": 0.03, + "grad_norm": 2.7465020336608874, + "learning_rate": 9.999865772181745e-06, + "loss": 0.5536, + "step": 1113 + }, + { + "epoch": 0.03, + "grad_norm": 2.7713402842029553, + "learning_rate": 9.999862308396285e-06, + "loss": 0.5171, + "step": 1114 + }, + { + "epoch": 0.03, + "grad_norm": 2.0188576893175028, + "learning_rate": 9.999858800486983e-06, + "loss": 0.4807, + "step": 1115 + }, + { + "epoch": 0.03, + "grad_norm": 2.0508531935965815, + "learning_rate": 9.999855248453873e-06, + "loss": 0.5411, + "step": 1116 + }, + { + "epoch": 0.03, + "grad_norm": 2.333147829203726, + "learning_rate": 9.999851652296986e-06, + "loss": 0.501, + "step": 1117 + }, + { + "epoch": 0.03, + "grad_norm": 2.0360370318290055, + "learning_rate": 9.999848012016352e-06, + "loss": 0.5153, + "step": 1118 + }, + { + "epoch": 0.03, + "grad_norm": 2.5490390122764985, + "learning_rate": 9.999844327612e-06, + "loss": 0.608, + "step": 1119 + }, + { + "epoch": 0.03, + "grad_norm": 2.3098824695808036, + "learning_rate": 9.99984059908397e-06, + "loss": 0.5491, + "step": 1120 + }, + { + "epoch": 0.03, + "grad_norm": 2.320710137954726, + "learning_rate": 9.999836826432292e-06, + "loss": 0.5648, + "step": 1121 + }, + { + "epoch": 0.03, + "grad_norm": 2.2736097815764733, + "learning_rate": 9.999833009656996e-06, + "loss": 0.5318, + "step": 1122 + }, + { + "epoch": 0.03, + "grad_norm": 2.2226587901197954, + "learning_rate": 9.99982914875812e-06, + "loss": 0.5299, + "step": 1123 + }, + { + "epoch": 0.03, + "grad_norm": 2.321622822915191, + "learning_rate": 9.999825243735696e-06, + "loss": 0.5414, + "step": 1124 + }, + { + "epoch": 0.03, + "grad_norm": 2.227952061079021, + "learning_rate": 9.999821294589758e-06, + "loss": 0.5196, + "step": 1125 + }, + { + "epoch": 0.03, + "grad_norm": 2.197067513610616, + "learning_rate": 9.999817301320343e-06, + "loss": 0.5258, + "step": 1126 + }, + { + "epoch": 0.03, + "grad_norm": 1.478783715372909, + "learning_rate": 9.999813263927483e-06, + "loss": 0.6126, + "step": 1127 + }, + { + "epoch": 0.03, + "grad_norm": 3.056675122443056, + "learning_rate": 9.999809182411217e-06, + "loss": 0.5868, + "step": 1128 + }, + { + "epoch": 0.03, + "grad_norm": 2.6068779724177844, + "learning_rate": 9.99980505677158e-06, + "loss": 0.5435, + "step": 1129 + }, + { + "epoch": 0.03, + "grad_norm": 2.0391743999007232, + "learning_rate": 9.999800887008607e-06, + "loss": 0.5464, + "step": 1130 + }, + { + "epoch": 0.03, + "grad_norm": 2.0912583597837364, + "learning_rate": 9.999796673122336e-06, + "loss": 0.4966, + "step": 1131 + }, + { + "epoch": 0.03, + "grad_norm": 3.1887506518179904, + "learning_rate": 9.9997924151128e-06, + "loss": 0.5336, + "step": 1132 + }, + { + "epoch": 0.03, + "grad_norm": 1.2632711548975308, + "learning_rate": 9.999788112980046e-06, + "loss": 0.6089, + "step": 1133 + }, + { + "epoch": 0.03, + "grad_norm": 2.2276422918928835, + "learning_rate": 9.999783766724104e-06, + "loss": 0.5297, + "step": 1134 + }, + { + "epoch": 0.03, + "grad_norm": 2.058398054037662, + "learning_rate": 9.999779376345015e-06, + "loss": 0.5103, + "step": 1135 + }, + { + "epoch": 0.03, + "grad_norm": 1.8855991256267315, + "learning_rate": 9.999774941842818e-06, + "loss": 0.5057, + "step": 1136 + }, + { + "epoch": 0.03, + "grad_norm": 1.9548153658370622, + "learning_rate": 9.999770463217552e-06, + "loss": 0.5588, + "step": 1137 + }, + { + "epoch": 0.03, + "grad_norm": 2.206121435128665, + "learning_rate": 9.999765940469255e-06, + "loss": 0.5346, + "step": 1138 + }, + { + "epoch": 0.03, + "grad_norm": 2.1469027417576925, + "learning_rate": 9.999761373597968e-06, + "loss": 0.5013, + "step": 1139 + }, + { + "epoch": 0.03, + "grad_norm": 5.716692374693654, + "learning_rate": 9.999756762603732e-06, + "loss": 0.4903, + "step": 1140 + }, + { + "epoch": 0.03, + "grad_norm": 1.9441707886758917, + "learning_rate": 9.999752107486588e-06, + "loss": 0.4997, + "step": 1141 + }, + { + "epoch": 0.03, + "grad_norm": 2.6276169376262937, + "learning_rate": 9.999747408246576e-06, + "loss": 0.5237, + "step": 1142 + }, + { + "epoch": 0.03, + "grad_norm": 2.9260033396921394, + "learning_rate": 9.999742664883736e-06, + "loss": 0.4761, + "step": 1143 + }, + { + "epoch": 0.03, + "grad_norm": 2.0528801899745077, + "learning_rate": 9.999737877398112e-06, + "loss": 0.5274, + "step": 1144 + }, + { + "epoch": 0.03, + "grad_norm": 1.9889604014738405, + "learning_rate": 9.999733045789746e-06, + "loss": 0.5487, + "step": 1145 + }, + { + "epoch": 0.03, + "grad_norm": 2.0808012095613995, + "learning_rate": 9.999728170058683e-06, + "loss": 0.5075, + "step": 1146 + }, + { + "epoch": 0.03, + "grad_norm": 6.602188446688631, + "learning_rate": 9.99972325020496e-06, + "loss": 0.5575, + "step": 1147 + }, + { + "epoch": 0.03, + "grad_norm": 1.969775892270181, + "learning_rate": 9.999718286228626e-06, + "loss": 0.4967, + "step": 1148 + }, + { + "epoch": 0.03, + "grad_norm": 2.0018627571692065, + "learning_rate": 9.999713278129722e-06, + "loss": 0.4766, + "step": 1149 + }, + { + "epoch": 0.03, + "grad_norm": 3.5876847125383775, + "learning_rate": 9.999708225908292e-06, + "loss": 0.5371, + "step": 1150 + }, + { + "epoch": 0.03, + "grad_norm": 2.1180331726312627, + "learning_rate": 9.99970312956438e-06, + "loss": 0.5628, + "step": 1151 + }, + { + "epoch": 0.03, + "grad_norm": 1.9261285248422497, + "learning_rate": 9.999697989098037e-06, + "loss": 0.5611, + "step": 1152 + }, + { + "epoch": 0.03, + "grad_norm": 1.8458816528060475, + "learning_rate": 9.9996928045093e-06, + "loss": 0.4948, + "step": 1153 + }, + { + "epoch": 0.03, + "grad_norm": 2.179545034595672, + "learning_rate": 9.99968757579822e-06, + "loss": 0.5633, + "step": 1154 + }, + { + "epoch": 0.03, + "grad_norm": 2.6088837799634272, + "learning_rate": 9.999682302964841e-06, + "loss": 0.5686, + "step": 1155 + }, + { + "epoch": 0.03, + "grad_norm": 2.429637654636818, + "learning_rate": 9.99967698600921e-06, + "loss": 0.5038, + "step": 1156 + }, + { + "epoch": 0.03, + "grad_norm": 2.048654404797917, + "learning_rate": 9.999671624931375e-06, + "loss": 0.5085, + "step": 1157 + }, + { + "epoch": 0.03, + "grad_norm": 2.5146039997236254, + "learning_rate": 9.999666219731384e-06, + "loss": 0.5043, + "step": 1158 + }, + { + "epoch": 0.03, + "grad_norm": 2.0175209027384486, + "learning_rate": 9.99966077040928e-06, + "loss": 0.5281, + "step": 1159 + }, + { + "epoch": 0.03, + "grad_norm": 2.1878201143085474, + "learning_rate": 9.999655276965114e-06, + "loss": 0.5195, + "step": 1160 + }, + { + "epoch": 0.03, + "grad_norm": 2.1704695252516406, + "learning_rate": 9.999649739398938e-06, + "loss": 0.5583, + "step": 1161 + }, + { + "epoch": 0.03, + "grad_norm": 2.12351123883333, + "learning_rate": 9.999644157710794e-06, + "loss": 0.5237, + "step": 1162 + }, + { + "epoch": 0.03, + "grad_norm": 2.0327865513264265, + "learning_rate": 9.999638531900738e-06, + "loss": 0.5409, + "step": 1163 + }, + { + "epoch": 0.03, + "grad_norm": 2.6608379810445424, + "learning_rate": 9.999632861968816e-06, + "loss": 0.5699, + "step": 1164 + }, + { + "epoch": 0.03, + "grad_norm": 2.023722293749278, + "learning_rate": 9.999627147915077e-06, + "loss": 0.5541, + "step": 1165 + }, + { + "epoch": 0.03, + "grad_norm": 2.337320583956699, + "learning_rate": 9.999621389739574e-06, + "loss": 0.5052, + "step": 1166 + }, + { + "epoch": 0.03, + "grad_norm": 2.384434785357677, + "learning_rate": 9.999615587442356e-06, + "loss": 0.5254, + "step": 1167 + }, + { + "epoch": 0.03, + "grad_norm": 1.910826620873089, + "learning_rate": 9.999609741023477e-06, + "loss": 0.474, + "step": 1168 + }, + { + "epoch": 0.03, + "grad_norm": 1.445390539974742, + "learning_rate": 9.999603850482983e-06, + "loss": 0.6016, + "step": 1169 + }, + { + "epoch": 0.03, + "grad_norm": 2.4331352430217454, + "learning_rate": 9.999597915820932e-06, + "loss": 0.5443, + "step": 1170 + }, + { + "epoch": 0.03, + "grad_norm": 2.452032671145338, + "learning_rate": 9.999591937037374e-06, + "loss": 0.5317, + "step": 1171 + }, + { + "epoch": 0.03, + "grad_norm": 1.9020808123016333, + "learning_rate": 9.999585914132362e-06, + "loss": 0.5253, + "step": 1172 + }, + { + "epoch": 0.03, + "grad_norm": 1.0860868866649258, + "learning_rate": 9.999579847105947e-06, + "loss": 0.6008, + "step": 1173 + }, + { + "epoch": 0.03, + "grad_norm": 3.994895817084652, + "learning_rate": 9.999573735958186e-06, + "loss": 0.5234, + "step": 1174 + }, + { + "epoch": 0.03, + "grad_norm": 2.2871381213647797, + "learning_rate": 9.99956758068913e-06, + "loss": 0.4884, + "step": 1175 + }, + { + "epoch": 0.03, + "grad_norm": 2.786498553627318, + "learning_rate": 9.999561381298835e-06, + "loss": 0.5282, + "step": 1176 + }, + { + "epoch": 0.03, + "grad_norm": 2.104725669385385, + "learning_rate": 9.999555137787356e-06, + "loss": 0.519, + "step": 1177 + }, + { + "epoch": 0.03, + "grad_norm": 2.1454142898496156, + "learning_rate": 9.999548850154748e-06, + "loss": 0.4866, + "step": 1178 + }, + { + "epoch": 0.03, + "grad_norm": 2.059678149629929, + "learning_rate": 9.999542518401063e-06, + "loss": 0.4931, + "step": 1179 + }, + { + "epoch": 0.03, + "grad_norm": 2.0721652841364384, + "learning_rate": 9.999536142526363e-06, + "loss": 0.5491, + "step": 1180 + }, + { + "epoch": 0.03, + "grad_norm": 2.1044381068589377, + "learning_rate": 9.9995297225307e-06, + "loss": 0.4902, + "step": 1181 + }, + { + "epoch": 0.03, + "grad_norm": 2.2784660197452484, + "learning_rate": 9.999523258414132e-06, + "loss": 0.5295, + "step": 1182 + }, + { + "epoch": 0.03, + "grad_norm": 2.11720432421661, + "learning_rate": 9.999516750176717e-06, + "loss": 0.5197, + "step": 1183 + }, + { + "epoch": 0.03, + "grad_norm": 2.2009655544106015, + "learning_rate": 9.99951019781851e-06, + "loss": 0.5608, + "step": 1184 + }, + { + "epoch": 0.03, + "grad_norm": 3.2541915503676173, + "learning_rate": 9.999503601339568e-06, + "loss": 0.5416, + "step": 1185 + }, + { + "epoch": 0.03, + "grad_norm": 2.028970855939298, + "learning_rate": 9.999496960739953e-06, + "loss": 0.5112, + "step": 1186 + }, + { + "epoch": 0.03, + "grad_norm": 1.3215826843726095, + "learning_rate": 9.999490276019724e-06, + "loss": 0.5126, + "step": 1187 + }, + { + "epoch": 0.03, + "grad_norm": 2.3149296549755762, + "learning_rate": 9.999483547178933e-06, + "loss": 0.5055, + "step": 1188 + }, + { + "epoch": 0.03, + "grad_norm": 2.07950868022197, + "learning_rate": 9.999476774217649e-06, + "loss": 0.4969, + "step": 1189 + }, + { + "epoch": 0.03, + "grad_norm": 1.8536681980654435, + "learning_rate": 9.999469957135924e-06, + "loss": 0.506, + "step": 1190 + }, + { + "epoch": 0.03, + "grad_norm": 2.3479980277094152, + "learning_rate": 9.999463095933824e-06, + "loss": 0.5278, + "step": 1191 + }, + { + "epoch": 0.03, + "grad_norm": 2.116391212405376, + "learning_rate": 9.999456190611403e-06, + "loss": 0.535, + "step": 1192 + }, + { + "epoch": 0.03, + "grad_norm": 1.8464489910644104, + "learning_rate": 9.999449241168729e-06, + "loss": 0.5124, + "step": 1193 + }, + { + "epoch": 0.03, + "grad_norm": 1.2050516923259929, + "learning_rate": 9.999442247605858e-06, + "loss": 0.574, + "step": 1194 + }, + { + "epoch": 0.03, + "grad_norm": 3.0068790028855217, + "learning_rate": 9.999435209922854e-06, + "loss": 0.5087, + "step": 1195 + }, + { + "epoch": 0.03, + "grad_norm": 2.4859637774847485, + "learning_rate": 9.999428128119779e-06, + "loss": 0.5156, + "step": 1196 + }, + { + "epoch": 0.03, + "grad_norm": 2.5020977370107222, + "learning_rate": 9.999421002196696e-06, + "loss": 0.5339, + "step": 1197 + }, + { + "epoch": 0.03, + "grad_norm": 2.0471951853906374, + "learning_rate": 9.999413832153667e-06, + "loss": 0.5096, + "step": 1198 + }, + { + "epoch": 0.03, + "grad_norm": 2.196684303540584, + "learning_rate": 9.999406617990755e-06, + "loss": 0.5384, + "step": 1199 + }, + { + "epoch": 0.03, + "grad_norm": 2.092794358295197, + "learning_rate": 9.999399359708023e-06, + "loss": 0.4522, + "step": 1200 + }, + { + "epoch": 0.03, + "grad_norm": 2.0689982348687104, + "learning_rate": 9.999392057305537e-06, + "loss": 0.5061, + "step": 1201 + }, + { + "epoch": 0.03, + "grad_norm": 1.2923553322305343, + "learning_rate": 9.999384710783362e-06, + "loss": 0.5271, + "step": 1202 + }, + { + "epoch": 0.03, + "grad_norm": 2.577336200541685, + "learning_rate": 9.99937732014156e-06, + "loss": 0.5654, + "step": 1203 + }, + { + "epoch": 0.03, + "grad_norm": 4.7651501713595525, + "learning_rate": 9.999369885380199e-06, + "loss": 0.4926, + "step": 1204 + }, + { + "epoch": 0.03, + "grad_norm": 1.9676907740334817, + "learning_rate": 9.999362406499342e-06, + "loss": 0.5088, + "step": 1205 + }, + { + "epoch": 0.03, + "grad_norm": 1.153697134462872, + "learning_rate": 9.999354883499056e-06, + "loss": 0.5318, + "step": 1206 + }, + { + "epoch": 0.04, + "grad_norm": 2.560484409422626, + "learning_rate": 9.99934731637941e-06, + "loss": 0.5339, + "step": 1207 + }, + { + "epoch": 0.04, + "grad_norm": 2.390800438305061, + "learning_rate": 9.999339705140466e-06, + "loss": 0.5441, + "step": 1208 + }, + { + "epoch": 0.04, + "grad_norm": 3.764465675319786, + "learning_rate": 9.999332049782295e-06, + "loss": 0.5322, + "step": 1209 + }, + { + "epoch": 0.04, + "grad_norm": 6.8273036132740925, + "learning_rate": 9.999324350304961e-06, + "loss": 0.4948, + "step": 1210 + }, + { + "epoch": 0.04, + "grad_norm": 2.5478248720562386, + "learning_rate": 9.999316606708538e-06, + "loss": 0.5755, + "step": 1211 + }, + { + "epoch": 0.04, + "grad_norm": 4.71706298039446, + "learning_rate": 9.999308818993088e-06, + "loss": 0.5562, + "step": 1212 + }, + { + "epoch": 0.04, + "grad_norm": 2.7956908640242784, + "learning_rate": 9.999300987158683e-06, + "loss": 0.535, + "step": 1213 + }, + { + "epoch": 0.04, + "grad_norm": 2.342030910716139, + "learning_rate": 9.99929311120539e-06, + "loss": 0.5187, + "step": 1214 + }, + { + "epoch": 0.04, + "grad_norm": 2.096060996801265, + "learning_rate": 9.999285191133281e-06, + "loss": 0.5234, + "step": 1215 + }, + { + "epoch": 0.04, + "grad_norm": 2.1805948076947383, + "learning_rate": 9.999277226942424e-06, + "loss": 0.509, + "step": 1216 + }, + { + "epoch": 0.04, + "grad_norm": 2.113164433048254, + "learning_rate": 9.99926921863289e-06, + "loss": 0.5352, + "step": 1217 + }, + { + "epoch": 0.04, + "grad_norm": 2.303244255218326, + "learning_rate": 9.99926116620475e-06, + "loss": 0.5205, + "step": 1218 + }, + { + "epoch": 0.04, + "grad_norm": 2.220471251065992, + "learning_rate": 9.999253069658074e-06, + "loss": 0.5925, + "step": 1219 + }, + { + "epoch": 0.04, + "grad_norm": 1.9423779137163277, + "learning_rate": 9.999244928992937e-06, + "loss": 0.557, + "step": 1220 + }, + { + "epoch": 0.04, + "grad_norm": 2.42304217702803, + "learning_rate": 9.999236744209406e-06, + "loss": 0.5214, + "step": 1221 + }, + { + "epoch": 0.04, + "grad_norm": 2.240916015046336, + "learning_rate": 9.999228515307554e-06, + "loss": 0.4968, + "step": 1222 + }, + { + "epoch": 0.04, + "grad_norm": 3.1279126600810887, + "learning_rate": 9.999220242287458e-06, + "loss": 0.5084, + "step": 1223 + }, + { + "epoch": 0.04, + "grad_norm": 2.0267913219222913, + "learning_rate": 9.999211925149186e-06, + "loss": 0.4953, + "step": 1224 + }, + { + "epoch": 0.04, + "grad_norm": 1.8232905479066068, + "learning_rate": 9.999203563892814e-06, + "loss": 0.526, + "step": 1225 + }, + { + "epoch": 0.04, + "grad_norm": 1.9355454974579995, + "learning_rate": 9.999195158518415e-06, + "loss": 0.5333, + "step": 1226 + }, + { + "epoch": 0.04, + "grad_norm": 2.3926810664451184, + "learning_rate": 9.999186709026061e-06, + "loss": 0.5179, + "step": 1227 + }, + { + "epoch": 0.04, + "grad_norm": 1.6887397730490628, + "learning_rate": 9.999178215415832e-06, + "loss": 0.4804, + "step": 1228 + }, + { + "epoch": 0.04, + "grad_norm": 1.9216203016982625, + "learning_rate": 9.999169677687799e-06, + "loss": 0.5331, + "step": 1229 + }, + { + "epoch": 0.04, + "grad_norm": 2.0427260010260744, + "learning_rate": 9.999161095842038e-06, + "loss": 0.5104, + "step": 1230 + }, + { + "epoch": 0.04, + "grad_norm": 2.269656901441269, + "learning_rate": 9.999152469878623e-06, + "loss": 0.5418, + "step": 1231 + }, + { + "epoch": 0.04, + "grad_norm": 1.8496638557007172, + "learning_rate": 9.999143799797635e-06, + "loss": 0.4742, + "step": 1232 + }, + { + "epoch": 0.04, + "grad_norm": 1.8092908717224738, + "learning_rate": 9.999135085599145e-06, + "loss": 0.4841, + "step": 1233 + }, + { + "epoch": 0.04, + "grad_norm": 2.269467931364478, + "learning_rate": 9.999126327283234e-06, + "loss": 0.5327, + "step": 1234 + }, + { + "epoch": 0.04, + "grad_norm": 2.182682610754309, + "learning_rate": 9.999117524849977e-06, + "loss": 0.4787, + "step": 1235 + }, + { + "epoch": 0.04, + "grad_norm": 1.9198298576186041, + "learning_rate": 9.999108678299452e-06, + "loss": 0.4649, + "step": 1236 + }, + { + "epoch": 0.04, + "grad_norm": 1.320975876898533, + "learning_rate": 9.999099787631739e-06, + "loss": 0.5889, + "step": 1237 + }, + { + "epoch": 0.04, + "grad_norm": 2.5733193919574537, + "learning_rate": 9.999090852846913e-06, + "loss": 0.4617, + "step": 1238 + }, + { + "epoch": 0.04, + "grad_norm": 1.8994141585035293, + "learning_rate": 9.999081873945056e-06, + "loss": 0.4857, + "step": 1239 + }, + { + "epoch": 0.04, + "grad_norm": 2.1117261998634618, + "learning_rate": 9.999072850926245e-06, + "loss": 0.5622, + "step": 1240 + }, + { + "epoch": 0.04, + "grad_norm": 8.491670506888479, + "learning_rate": 9.999063783790563e-06, + "loss": 0.5286, + "step": 1241 + }, + { + "epoch": 0.04, + "grad_norm": 5.405333573561132, + "learning_rate": 9.999054672538085e-06, + "loss": 0.501, + "step": 1242 + }, + { + "epoch": 0.04, + "grad_norm": 2.3552150697629037, + "learning_rate": 9.999045517168895e-06, + "loss": 0.554, + "step": 1243 + }, + { + "epoch": 0.04, + "grad_norm": 2.2345240121448793, + "learning_rate": 9.999036317683073e-06, + "loss": 0.5181, + "step": 1244 + }, + { + "epoch": 0.04, + "grad_norm": 2.418254422398706, + "learning_rate": 9.999027074080701e-06, + "loss": 0.5041, + "step": 1245 + }, + { + "epoch": 0.04, + "grad_norm": 2.2544041156317496, + "learning_rate": 9.999017786361858e-06, + "loss": 0.5419, + "step": 1246 + }, + { + "epoch": 0.04, + "grad_norm": 2.1591691129843134, + "learning_rate": 9.999008454526628e-06, + "loss": 0.5012, + "step": 1247 + }, + { + "epoch": 0.04, + "grad_norm": 1.9912864574414564, + "learning_rate": 9.998999078575094e-06, + "loss": 0.5337, + "step": 1248 + }, + { + "epoch": 0.04, + "grad_norm": 1.1495442653429948, + "learning_rate": 9.998989658507335e-06, + "loss": 0.5656, + "step": 1249 + }, + { + "epoch": 0.04, + "grad_norm": 1.179933449522636, + "learning_rate": 9.99898019432344e-06, + "loss": 0.5341, + "step": 1250 + }, + { + "epoch": 0.04, + "grad_norm": 3.041298984688934, + "learning_rate": 9.998970686023487e-06, + "loss": 0.5253, + "step": 1251 + }, + { + "epoch": 0.04, + "grad_norm": 2.1659001404893807, + "learning_rate": 9.998961133607564e-06, + "loss": 0.5394, + "step": 1252 + }, + { + "epoch": 0.04, + "grad_norm": 2.1613430823172903, + "learning_rate": 9.998951537075751e-06, + "loss": 0.5077, + "step": 1253 + }, + { + "epoch": 0.04, + "grad_norm": 2.1859334335279037, + "learning_rate": 9.998941896428138e-06, + "loss": 0.5193, + "step": 1254 + }, + { + "epoch": 0.04, + "grad_norm": 1.963138307228976, + "learning_rate": 9.998932211664805e-06, + "loss": 0.493, + "step": 1255 + }, + { + "epoch": 0.04, + "grad_norm": 1.9964065299800098, + "learning_rate": 9.998922482785843e-06, + "loss": 0.4939, + "step": 1256 + }, + { + "epoch": 0.04, + "grad_norm": 1.978387440473267, + "learning_rate": 9.99891270979133e-06, + "loss": 0.5066, + "step": 1257 + }, + { + "epoch": 0.04, + "grad_norm": 2.22464090489889, + "learning_rate": 9.998902892681361e-06, + "loss": 0.5599, + "step": 1258 + }, + { + "epoch": 0.04, + "grad_norm": 2.0858133282199827, + "learning_rate": 9.998893031456016e-06, + "loss": 0.5198, + "step": 1259 + }, + { + "epoch": 0.04, + "grad_norm": 1.9500800364128612, + "learning_rate": 9.998883126115386e-06, + "loss": 0.5367, + "step": 1260 + }, + { + "epoch": 0.04, + "grad_norm": 2.0524198438216965, + "learning_rate": 9.998873176659558e-06, + "loss": 0.6115, + "step": 1261 + }, + { + "epoch": 0.04, + "grad_norm": 1.9535338309930839, + "learning_rate": 9.998863183088617e-06, + "loss": 0.5791, + "step": 1262 + }, + { + "epoch": 0.04, + "grad_norm": 2.4605534604526937, + "learning_rate": 9.998853145402653e-06, + "loss": 0.491, + "step": 1263 + }, + { + "epoch": 0.04, + "grad_norm": 2.0007286330354006, + "learning_rate": 9.998843063601755e-06, + "loss": 0.5326, + "step": 1264 + }, + { + "epoch": 0.04, + "grad_norm": 2.7747655099529798, + "learning_rate": 9.99883293768601e-06, + "loss": 0.5305, + "step": 1265 + }, + { + "epoch": 0.04, + "grad_norm": 1.9542728901570905, + "learning_rate": 9.998822767655511e-06, + "loss": 0.4934, + "step": 1266 + }, + { + "epoch": 0.04, + "grad_norm": 2.21230604256064, + "learning_rate": 9.998812553510345e-06, + "loss": 0.5144, + "step": 1267 + }, + { + "epoch": 0.04, + "grad_norm": 2.265314140124065, + "learning_rate": 9.998802295250602e-06, + "loss": 0.5042, + "step": 1268 + }, + { + "epoch": 0.04, + "grad_norm": 2.02684187316098, + "learning_rate": 9.998791992876373e-06, + "loss": 0.5511, + "step": 1269 + }, + { + "epoch": 0.04, + "grad_norm": 2.16978582621346, + "learning_rate": 9.99878164638775e-06, + "loss": 0.4922, + "step": 1270 + }, + { + "epoch": 0.04, + "grad_norm": 1.977565939975679, + "learning_rate": 9.998771255784825e-06, + "loss": 0.4491, + "step": 1271 + }, + { + "epoch": 0.04, + "grad_norm": 2.1117441724281605, + "learning_rate": 9.998760821067686e-06, + "loss": 0.4879, + "step": 1272 + }, + { + "epoch": 0.04, + "grad_norm": 2.0759891168074622, + "learning_rate": 9.998750342236427e-06, + "loss": 0.4958, + "step": 1273 + }, + { + "epoch": 0.04, + "grad_norm": 2.429654742191639, + "learning_rate": 9.998739819291144e-06, + "loss": 0.523, + "step": 1274 + }, + { + "epoch": 0.04, + "grad_norm": 3.197664955536915, + "learning_rate": 9.998729252231923e-06, + "loss": 0.5103, + "step": 1275 + }, + { + "epoch": 0.04, + "grad_norm": 2.3048545525568085, + "learning_rate": 9.998718641058862e-06, + "loss": 0.5052, + "step": 1276 + }, + { + "epoch": 0.04, + "grad_norm": 2.0982775531553655, + "learning_rate": 9.998707985772055e-06, + "loss": 0.4594, + "step": 1277 + }, + { + "epoch": 0.04, + "grad_norm": 2.081028313721837, + "learning_rate": 9.998697286371592e-06, + "loss": 0.4847, + "step": 1278 + }, + { + "epoch": 0.04, + "grad_norm": 1.9155321728245347, + "learning_rate": 9.998686542857573e-06, + "loss": 0.4933, + "step": 1279 + }, + { + "epoch": 0.04, + "grad_norm": 2.309781563014404, + "learning_rate": 9.998675755230088e-06, + "loss": 0.467, + "step": 1280 + }, + { + "epoch": 0.04, + "grad_norm": 1.9123418348697925, + "learning_rate": 9.998664923489234e-06, + "loss": 0.5394, + "step": 1281 + }, + { + "epoch": 0.04, + "grad_norm": 1.8603344557221153, + "learning_rate": 9.998654047635107e-06, + "loss": 0.5207, + "step": 1282 + }, + { + "epoch": 0.04, + "grad_norm": 2.2428329037329924, + "learning_rate": 9.998643127667801e-06, + "loss": 0.4966, + "step": 1283 + }, + { + "epoch": 0.04, + "grad_norm": 2.347656328183639, + "learning_rate": 9.998632163587418e-06, + "loss": 0.5182, + "step": 1284 + }, + { + "epoch": 0.04, + "grad_norm": 2.199122362343416, + "learning_rate": 9.998621155394047e-06, + "loss": 0.5179, + "step": 1285 + }, + { + "epoch": 0.04, + "grad_norm": 2.2183165245757053, + "learning_rate": 9.998610103087791e-06, + "loss": 0.5499, + "step": 1286 + }, + { + "epoch": 0.04, + "grad_norm": 2.954477115112511, + "learning_rate": 9.998599006668743e-06, + "loss": 0.4666, + "step": 1287 + }, + { + "epoch": 0.04, + "grad_norm": 3.151007859766221, + "learning_rate": 9.998587866137005e-06, + "loss": 0.5095, + "step": 1288 + }, + { + "epoch": 0.04, + "grad_norm": 2.1767515136353444, + "learning_rate": 9.998576681492674e-06, + "loss": 0.4949, + "step": 1289 + }, + { + "epoch": 0.04, + "grad_norm": 2.215713486325688, + "learning_rate": 9.998565452735846e-06, + "loss": 0.4971, + "step": 1290 + }, + { + "epoch": 0.04, + "grad_norm": 2.118206033722443, + "learning_rate": 9.998554179866625e-06, + "loss": 0.5081, + "step": 1291 + }, + { + "epoch": 0.04, + "grad_norm": 2.423128634552543, + "learning_rate": 9.998542862885107e-06, + "loss": 0.568, + "step": 1292 + }, + { + "epoch": 0.04, + "grad_norm": 2.1382835656907173, + "learning_rate": 9.998531501791394e-06, + "loss": 0.5697, + "step": 1293 + }, + { + "epoch": 0.04, + "grad_norm": 2.244466948802007, + "learning_rate": 9.998520096585582e-06, + "loss": 0.4498, + "step": 1294 + }, + { + "epoch": 0.04, + "grad_norm": 2.4547134053257085, + "learning_rate": 9.998508647267777e-06, + "loss": 0.5062, + "step": 1295 + }, + { + "epoch": 0.04, + "grad_norm": 2.6317132504054803, + "learning_rate": 9.998497153838077e-06, + "loss": 0.4975, + "step": 1296 + }, + { + "epoch": 0.04, + "grad_norm": 2.4952580277582497, + "learning_rate": 9.998485616296585e-06, + "loss": 0.5691, + "step": 1297 + }, + { + "epoch": 0.04, + "grad_norm": 5.432917082874448, + "learning_rate": 9.998474034643402e-06, + "loss": 0.6099, + "step": 1298 + }, + { + "epoch": 0.04, + "grad_norm": 2.190398698623534, + "learning_rate": 9.998462408878628e-06, + "loss": 0.4975, + "step": 1299 + }, + { + "epoch": 0.04, + "grad_norm": 2.0034731382521214, + "learning_rate": 9.99845073900237e-06, + "loss": 0.4823, + "step": 1300 + }, + { + "epoch": 0.04, + "grad_norm": 4.508827950842665, + "learning_rate": 9.998439025014728e-06, + "loss": 0.4904, + "step": 1301 + }, + { + "epoch": 0.04, + "grad_norm": 2.0004185324815817, + "learning_rate": 9.998427266915807e-06, + "loss": 0.4791, + "step": 1302 + }, + { + "epoch": 0.04, + "grad_norm": 2.6038379207812685, + "learning_rate": 9.998415464705709e-06, + "loss": 0.6811, + "step": 1303 + }, + { + "epoch": 0.04, + "grad_norm": 2.4329175411669235, + "learning_rate": 9.99840361838454e-06, + "loss": 0.5177, + "step": 1304 + }, + { + "epoch": 0.04, + "grad_norm": 2.8294807967097264, + "learning_rate": 9.998391727952403e-06, + "loss": 0.4937, + "step": 1305 + }, + { + "epoch": 0.04, + "grad_norm": 2.1909204260999906, + "learning_rate": 9.998379793409402e-06, + "loss": 0.529, + "step": 1306 + }, + { + "epoch": 0.04, + "grad_norm": 2.2449219433633707, + "learning_rate": 9.998367814755646e-06, + "loss": 0.4972, + "step": 1307 + }, + { + "epoch": 0.04, + "grad_norm": 2.187251790114967, + "learning_rate": 9.99835579199124e-06, + "loss": 0.5314, + "step": 1308 + }, + { + "epoch": 0.04, + "grad_norm": 2.262979536174458, + "learning_rate": 9.998343725116285e-06, + "loss": 0.5693, + "step": 1309 + }, + { + "epoch": 0.04, + "grad_norm": 2.585178796159689, + "learning_rate": 9.998331614130891e-06, + "loss": 0.4981, + "step": 1310 + }, + { + "epoch": 0.04, + "grad_norm": 2.136701522801022, + "learning_rate": 9.998319459035168e-06, + "loss": 0.482, + "step": 1311 + }, + { + "epoch": 0.04, + "grad_norm": 2.1101711091096336, + "learning_rate": 9.99830725982922e-06, + "loss": 0.5278, + "step": 1312 + }, + { + "epoch": 0.04, + "grad_norm": 2.4906248539538014, + "learning_rate": 9.998295016513153e-06, + "loss": 0.5218, + "step": 1313 + }, + { + "epoch": 0.04, + "grad_norm": 2.4745081696887525, + "learning_rate": 9.998282729087078e-06, + "loss": 0.51, + "step": 1314 + }, + { + "epoch": 0.04, + "grad_norm": 2.6027042972984233, + "learning_rate": 9.998270397551104e-06, + "loss": 0.5192, + "step": 1315 + }, + { + "epoch": 0.04, + "grad_norm": 2.0001958000992226, + "learning_rate": 9.998258021905336e-06, + "loss": 0.531, + "step": 1316 + }, + { + "epoch": 0.04, + "grad_norm": 2.161250320780201, + "learning_rate": 9.998245602149888e-06, + "loss": 0.565, + "step": 1317 + }, + { + "epoch": 0.04, + "grad_norm": 2.2357028577480973, + "learning_rate": 9.998233138284865e-06, + "loss": 0.4607, + "step": 1318 + }, + { + "epoch": 0.04, + "grad_norm": 2.16228521070335, + "learning_rate": 9.998220630310379e-06, + "loss": 0.4629, + "step": 1319 + }, + { + "epoch": 0.04, + "grad_norm": 1.0823136011283265, + "learning_rate": 9.99820807822654e-06, + "loss": 0.5441, + "step": 1320 + }, + { + "epoch": 0.04, + "grad_norm": 2.218201043956567, + "learning_rate": 9.998195482033462e-06, + "loss": 0.5008, + "step": 1321 + }, + { + "epoch": 0.04, + "grad_norm": 2.0891234784661905, + "learning_rate": 9.998182841731252e-06, + "loss": 0.5067, + "step": 1322 + }, + { + "epoch": 0.04, + "grad_norm": 2.2130710677189698, + "learning_rate": 9.998170157320022e-06, + "loss": 0.5484, + "step": 1323 + }, + { + "epoch": 0.04, + "grad_norm": 2.174245849450159, + "learning_rate": 9.998157428799887e-06, + "loss": 0.5165, + "step": 1324 + }, + { + "epoch": 0.04, + "grad_norm": 2.9798136569517015, + "learning_rate": 9.998144656170956e-06, + "loss": 0.5063, + "step": 1325 + }, + { + "epoch": 0.04, + "grad_norm": 2.0977152533093197, + "learning_rate": 9.998131839433342e-06, + "loss": 0.4793, + "step": 1326 + }, + { + "epoch": 0.04, + "grad_norm": 1.1039719110765345, + "learning_rate": 9.99811897858716e-06, + "loss": 0.5599, + "step": 1327 + }, + { + "epoch": 0.04, + "grad_norm": 2.3594942325266124, + "learning_rate": 9.998106073632523e-06, + "loss": 0.5083, + "step": 1328 + }, + { + "epoch": 0.04, + "grad_norm": 3.12269394745947, + "learning_rate": 9.998093124569543e-06, + "loss": 0.5119, + "step": 1329 + }, + { + "epoch": 0.04, + "grad_norm": 2.078143342289411, + "learning_rate": 9.998080131398338e-06, + "loss": 0.5183, + "step": 1330 + }, + { + "epoch": 0.04, + "grad_norm": 2.2450039970455857, + "learning_rate": 9.99806709411902e-06, + "loss": 0.5111, + "step": 1331 + }, + { + "epoch": 0.04, + "grad_norm": 2.0321751586639465, + "learning_rate": 9.998054012731702e-06, + "loss": 0.517, + "step": 1332 + }, + { + "epoch": 0.04, + "grad_norm": 2.053081143072184, + "learning_rate": 9.998040887236505e-06, + "loss": 0.5177, + "step": 1333 + }, + { + "epoch": 0.04, + "grad_norm": 2.091018204767946, + "learning_rate": 9.998027717633539e-06, + "loss": 0.4756, + "step": 1334 + }, + { + "epoch": 0.04, + "grad_norm": 2.660433262922176, + "learning_rate": 9.998014503922924e-06, + "loss": 0.4932, + "step": 1335 + }, + { + "epoch": 0.04, + "grad_norm": 2.3137869231517336, + "learning_rate": 9.998001246104777e-06, + "loss": 0.4658, + "step": 1336 + }, + { + "epoch": 0.04, + "grad_norm": 2.2471500058508664, + "learning_rate": 9.997987944179211e-06, + "loss": 0.4653, + "step": 1337 + }, + { + "epoch": 0.04, + "grad_norm": 2.822263144461826, + "learning_rate": 9.997974598146348e-06, + "loss": 0.5362, + "step": 1338 + }, + { + "epoch": 0.04, + "grad_norm": 2.066371900933471, + "learning_rate": 9.997961208006302e-06, + "loss": 0.5144, + "step": 1339 + }, + { + "epoch": 0.04, + "grad_norm": 1.1676680186675914, + "learning_rate": 9.997947773759195e-06, + "loss": 0.5255, + "step": 1340 + }, + { + "epoch": 0.04, + "grad_norm": 2.1580689237779587, + "learning_rate": 9.997934295405142e-06, + "loss": 0.5012, + "step": 1341 + }, + { + "epoch": 0.04, + "grad_norm": 2.0297781308341696, + "learning_rate": 9.997920772944262e-06, + "loss": 0.49, + "step": 1342 + }, + { + "epoch": 0.04, + "grad_norm": 2.188924511612295, + "learning_rate": 9.99790720637668e-06, + "loss": 0.5417, + "step": 1343 + }, + { + "epoch": 0.04, + "grad_norm": 2.3743310876256225, + "learning_rate": 9.997893595702507e-06, + "loss": 0.5729, + "step": 1344 + }, + { + "epoch": 0.04, + "grad_norm": 2.42965897478836, + "learning_rate": 9.997879940921871e-06, + "loss": 0.5437, + "step": 1345 + }, + { + "epoch": 0.04, + "grad_norm": 2.3615553142250905, + "learning_rate": 9.997866242034887e-06, + "loss": 0.4629, + "step": 1346 + }, + { + "epoch": 0.04, + "grad_norm": 2.0501499961166045, + "learning_rate": 9.997852499041678e-06, + "loss": 0.4772, + "step": 1347 + }, + { + "epoch": 0.04, + "grad_norm": 2.274596658859864, + "learning_rate": 9.997838711942366e-06, + "loss": 0.499, + "step": 1348 + }, + { + "epoch": 0.04, + "grad_norm": 2.0378253837128875, + "learning_rate": 9.997824880737073e-06, + "loss": 0.4937, + "step": 1349 + }, + { + "epoch": 0.04, + "grad_norm": 2.1082236123708444, + "learning_rate": 9.997811005425918e-06, + "loss": 0.4813, + "step": 1350 + }, + { + "epoch": 0.04, + "grad_norm": 2.529931335802892, + "learning_rate": 9.997797086009026e-06, + "loss": 0.51, + "step": 1351 + }, + { + "epoch": 0.04, + "grad_norm": 2.244104250758986, + "learning_rate": 9.99778312248652e-06, + "loss": 0.5134, + "step": 1352 + }, + { + "epoch": 0.04, + "grad_norm": 2.272364504422478, + "learning_rate": 9.997769114858523e-06, + "loss": 0.4896, + "step": 1353 + }, + { + "epoch": 0.04, + "grad_norm": 2.0395368055072183, + "learning_rate": 9.997755063125155e-06, + "loss": 0.5426, + "step": 1354 + }, + { + "epoch": 0.04, + "grad_norm": 2.2105759116312957, + "learning_rate": 9.997740967286545e-06, + "loss": 0.5183, + "step": 1355 + }, + { + "epoch": 0.04, + "grad_norm": 2.1019210950938647, + "learning_rate": 9.997726827342816e-06, + "loss": 0.5268, + "step": 1356 + }, + { + "epoch": 0.04, + "grad_norm": 1.996599655388312, + "learning_rate": 9.997712643294093e-06, + "loss": 0.4389, + "step": 1357 + }, + { + "epoch": 0.04, + "grad_norm": 2.030868778776754, + "learning_rate": 9.997698415140499e-06, + "loss": 0.4897, + "step": 1358 + }, + { + "epoch": 0.04, + "grad_norm": 2.084075609254249, + "learning_rate": 9.997684142882161e-06, + "loss": 0.4445, + "step": 1359 + }, + { + "epoch": 0.04, + "grad_norm": 2.276522118400389, + "learning_rate": 9.997669826519205e-06, + "loss": 0.4821, + "step": 1360 + }, + { + "epoch": 0.04, + "grad_norm": 2.4119872497887482, + "learning_rate": 9.997655466051759e-06, + "loss": 0.5303, + "step": 1361 + }, + { + "epoch": 0.04, + "grad_norm": 2.2009993369010328, + "learning_rate": 9.997641061479946e-06, + "loss": 0.5197, + "step": 1362 + }, + { + "epoch": 0.04, + "grad_norm": 2.039155174123141, + "learning_rate": 9.997626612803897e-06, + "loss": 0.4813, + "step": 1363 + }, + { + "epoch": 0.04, + "grad_norm": 2.908159268655018, + "learning_rate": 9.997612120023737e-06, + "loss": 0.5187, + "step": 1364 + }, + { + "epoch": 0.04, + "grad_norm": 1.8420281496090474, + "learning_rate": 9.997597583139593e-06, + "loss": 0.5267, + "step": 1365 + }, + { + "epoch": 0.04, + "grad_norm": 2.2025636838128277, + "learning_rate": 9.997583002151597e-06, + "loss": 0.5098, + "step": 1366 + }, + { + "epoch": 0.04, + "grad_norm": 1.8780862344602198, + "learning_rate": 9.997568377059876e-06, + "loss": 0.5323, + "step": 1367 + }, + { + "epoch": 0.04, + "grad_norm": 1.875695467890045, + "learning_rate": 9.997553707864555e-06, + "loss": 0.4916, + "step": 1368 + }, + { + "epoch": 0.04, + "grad_norm": 1.8263579811067432, + "learning_rate": 9.99753899456577e-06, + "loss": 0.4839, + "step": 1369 + }, + { + "epoch": 0.04, + "grad_norm": 1.7919777115908266, + "learning_rate": 9.99752423716365e-06, + "loss": 0.5107, + "step": 1370 + }, + { + "epoch": 0.04, + "grad_norm": 2.098337136236248, + "learning_rate": 9.997509435658318e-06, + "loss": 0.468, + "step": 1371 + }, + { + "epoch": 0.04, + "grad_norm": 2.0060360608797634, + "learning_rate": 9.997494590049914e-06, + "loss": 0.4882, + "step": 1372 + }, + { + "epoch": 0.04, + "grad_norm": 2.25712676819738, + "learning_rate": 9.997479700338563e-06, + "loss": 0.5018, + "step": 1373 + }, + { + "epoch": 0.04, + "grad_norm": 2.004839862732765, + "learning_rate": 9.997464766524399e-06, + "loss": 0.5036, + "step": 1374 + }, + { + "epoch": 0.04, + "grad_norm": 2.007380852144738, + "learning_rate": 9.997449788607551e-06, + "loss": 0.4926, + "step": 1375 + }, + { + "epoch": 0.04, + "grad_norm": 2.0061959525887443, + "learning_rate": 9.997434766588156e-06, + "loss": 0.4937, + "step": 1376 + }, + { + "epoch": 0.04, + "grad_norm": 2.0701731381515605, + "learning_rate": 9.997419700466343e-06, + "loss": 0.4949, + "step": 1377 + }, + { + "epoch": 0.04, + "grad_norm": 2.4290257866219953, + "learning_rate": 9.997404590242244e-06, + "loss": 0.5027, + "step": 1378 + }, + { + "epoch": 0.04, + "grad_norm": 2.0505509051775093, + "learning_rate": 9.997389435915996e-06, + "loss": 0.5053, + "step": 1379 + }, + { + "epoch": 0.04, + "grad_norm": 2.174253907769365, + "learning_rate": 9.997374237487729e-06, + "loss": 0.5122, + "step": 1380 + }, + { + "epoch": 0.04, + "grad_norm": 2.0476606805832587, + "learning_rate": 9.99735899495758e-06, + "loss": 0.5691, + "step": 1381 + }, + { + "epoch": 0.04, + "grad_norm": 2.053304321651982, + "learning_rate": 9.997343708325682e-06, + "loss": 0.5301, + "step": 1382 + }, + { + "epoch": 0.04, + "grad_norm": 3.149213038227992, + "learning_rate": 9.99732837759217e-06, + "loss": 0.5187, + "step": 1383 + }, + { + "epoch": 0.04, + "grad_norm": 2.0087629055310448, + "learning_rate": 9.997313002757181e-06, + "loss": 0.4828, + "step": 1384 + }, + { + "epoch": 0.04, + "grad_norm": 2.3014857954244, + "learning_rate": 9.997297583820849e-06, + "loss": 0.499, + "step": 1385 + }, + { + "epoch": 0.04, + "grad_norm": 1.9764442102781168, + "learning_rate": 9.99728212078331e-06, + "loss": 0.5075, + "step": 1386 + }, + { + "epoch": 0.04, + "grad_norm": 2.0489868518932304, + "learning_rate": 9.997266613644701e-06, + "loss": 0.5247, + "step": 1387 + }, + { + "epoch": 0.04, + "grad_norm": 2.0553927334641755, + "learning_rate": 9.997251062405158e-06, + "loss": 0.5078, + "step": 1388 + }, + { + "epoch": 0.04, + "grad_norm": 4.657062011923158, + "learning_rate": 9.99723546706482e-06, + "loss": 0.5209, + "step": 1389 + }, + { + "epoch": 0.04, + "grad_norm": 2.1633890722226488, + "learning_rate": 9.997219827623823e-06, + "loss": 0.5294, + "step": 1390 + }, + { + "epoch": 0.04, + "grad_norm": 1.1079795630265674, + "learning_rate": 9.997204144082306e-06, + "loss": 0.519, + "step": 1391 + }, + { + "epoch": 0.04, + "grad_norm": 1.9137407968624687, + "learning_rate": 9.997188416440406e-06, + "loss": 0.5092, + "step": 1392 + }, + { + "epoch": 0.04, + "grad_norm": 2.048152235596892, + "learning_rate": 9.997172644698264e-06, + "loss": 0.4957, + "step": 1393 + }, + { + "epoch": 0.04, + "grad_norm": 2.240825730237096, + "learning_rate": 9.997156828856016e-06, + "loss": 0.5311, + "step": 1394 + }, + { + "epoch": 0.04, + "grad_norm": 2.1254938982041676, + "learning_rate": 9.997140968913806e-06, + "loss": 0.5274, + "step": 1395 + }, + { + "epoch": 0.04, + "grad_norm": 1.9821081655075992, + "learning_rate": 9.99712506487177e-06, + "loss": 0.4813, + "step": 1396 + }, + { + "epoch": 0.04, + "grad_norm": 2.0932251855302253, + "learning_rate": 9.99710911673005e-06, + "loss": 0.478, + "step": 1397 + }, + { + "epoch": 0.04, + "grad_norm": 1.9590046868619155, + "learning_rate": 9.997093124488787e-06, + "loss": 0.4681, + "step": 1398 + }, + { + "epoch": 0.04, + "grad_norm": 1.9086869534648983, + "learning_rate": 9.997077088148122e-06, + "loss": 0.526, + "step": 1399 + }, + { + "epoch": 0.04, + "grad_norm": 2.086838493571413, + "learning_rate": 9.997061007708196e-06, + "loss": 0.5327, + "step": 1400 + }, + { + "epoch": 0.04, + "grad_norm": 1.7455706397494968, + "learning_rate": 9.997044883169151e-06, + "loss": 0.4728, + "step": 1401 + }, + { + "epoch": 0.04, + "grad_norm": 2.2208989832536212, + "learning_rate": 9.99702871453113e-06, + "loss": 0.5238, + "step": 1402 + }, + { + "epoch": 0.04, + "grad_norm": 1.821782559493978, + "learning_rate": 9.997012501794273e-06, + "loss": 0.4849, + "step": 1403 + }, + { + "epoch": 0.04, + "grad_norm": 1.9136273460146702, + "learning_rate": 9.996996244958728e-06, + "loss": 0.5159, + "step": 1404 + }, + { + "epoch": 0.04, + "grad_norm": 1.8418302700428288, + "learning_rate": 9.996979944024634e-06, + "loss": 0.5034, + "step": 1405 + }, + { + "epoch": 0.04, + "grad_norm": 2.110126068745162, + "learning_rate": 9.996963598992137e-06, + "loss": 0.535, + "step": 1406 + }, + { + "epoch": 0.04, + "grad_norm": 7.349688338922731, + "learning_rate": 9.996947209861382e-06, + "loss": 0.4775, + "step": 1407 + }, + { + "epoch": 0.04, + "grad_norm": 1.972620172181636, + "learning_rate": 9.996930776632511e-06, + "loss": 0.5328, + "step": 1408 + }, + { + "epoch": 0.04, + "grad_norm": 1.9347051727406002, + "learning_rate": 9.99691429930567e-06, + "loss": 0.5037, + "step": 1409 + }, + { + "epoch": 0.04, + "grad_norm": 1.9513599093678304, + "learning_rate": 9.996897777881007e-06, + "loss": 0.5039, + "step": 1410 + }, + { + "epoch": 0.04, + "grad_norm": 2.018351606739926, + "learning_rate": 9.996881212358665e-06, + "loss": 0.4724, + "step": 1411 + }, + { + "epoch": 0.04, + "grad_norm": 1.8044263239732385, + "learning_rate": 9.99686460273879e-06, + "loss": 0.4662, + "step": 1412 + }, + { + "epoch": 0.04, + "grad_norm": 1.7826999929621976, + "learning_rate": 9.996847949021529e-06, + "loss": 0.5236, + "step": 1413 + }, + { + "epoch": 0.04, + "grad_norm": 1.9467659649583182, + "learning_rate": 9.996831251207031e-06, + "loss": 0.4858, + "step": 1414 + }, + { + "epoch": 0.04, + "grad_norm": 1.83839251150796, + "learning_rate": 9.99681450929544e-06, + "loss": 0.4716, + "step": 1415 + }, + { + "epoch": 0.04, + "grad_norm": 2.018534137128427, + "learning_rate": 9.996797723286908e-06, + "loss": 0.486, + "step": 1416 + }, + { + "epoch": 0.04, + "grad_norm": 2.162680588005274, + "learning_rate": 9.99678089318158e-06, + "loss": 0.5306, + "step": 1417 + }, + { + "epoch": 0.04, + "grad_norm": 2.057944254452887, + "learning_rate": 9.996764018979603e-06, + "loss": 0.4873, + "step": 1418 + }, + { + "epoch": 0.04, + "grad_norm": 1.8475961202991071, + "learning_rate": 9.99674710068113e-06, + "loss": 0.4827, + "step": 1419 + }, + { + "epoch": 0.04, + "grad_norm": 1.9106963886903585, + "learning_rate": 9.996730138286307e-06, + "loss": 0.499, + "step": 1420 + }, + { + "epoch": 0.04, + "grad_norm": 1.9946565121371969, + "learning_rate": 9.996713131795287e-06, + "loss": 0.5523, + "step": 1421 + }, + { + "epoch": 0.04, + "grad_norm": 1.876662361099636, + "learning_rate": 9.996696081208218e-06, + "loss": 0.5303, + "step": 1422 + }, + { + "epoch": 0.04, + "grad_norm": 1.9277578601315444, + "learning_rate": 9.996678986525247e-06, + "loss": 0.5665, + "step": 1423 + }, + { + "epoch": 0.04, + "grad_norm": 2.1680459956956617, + "learning_rate": 9.996661847746532e-06, + "loss": 0.4647, + "step": 1424 + }, + { + "epoch": 0.04, + "grad_norm": 1.8472051279136024, + "learning_rate": 9.99664466487222e-06, + "loss": 0.5247, + "step": 1425 + }, + { + "epoch": 0.04, + "grad_norm": 2.053761714077594, + "learning_rate": 9.996627437902465e-06, + "loss": 0.5608, + "step": 1426 + }, + { + "epoch": 0.04, + "grad_norm": 2.033927160014579, + "learning_rate": 9.996610166837416e-06, + "loss": 0.5099, + "step": 1427 + }, + { + "epoch": 0.04, + "grad_norm": 1.8614014355422288, + "learning_rate": 9.996592851677225e-06, + "loss": 0.5328, + "step": 1428 + }, + { + "epoch": 0.04, + "grad_norm": 2.082003235323375, + "learning_rate": 9.99657549242205e-06, + "loss": 0.5153, + "step": 1429 + }, + { + "epoch": 0.04, + "grad_norm": 1.8963410179644251, + "learning_rate": 9.996558089072039e-06, + "loss": 0.4699, + "step": 1430 + }, + { + "epoch": 0.04, + "grad_norm": 2.295048507351842, + "learning_rate": 9.996540641627347e-06, + "loss": 0.4944, + "step": 1431 + }, + { + "epoch": 0.04, + "grad_norm": 2.6575825724472524, + "learning_rate": 9.996523150088129e-06, + "loss": 0.4841, + "step": 1432 + }, + { + "epoch": 0.04, + "grad_norm": 1.9888242108843215, + "learning_rate": 9.99650561445454e-06, + "loss": 0.5362, + "step": 1433 + }, + { + "epoch": 0.04, + "grad_norm": 5.5788930797680845, + "learning_rate": 9.996488034726732e-06, + "loss": 0.4635, + "step": 1434 + }, + { + "epoch": 0.04, + "grad_norm": 1.9403237767490247, + "learning_rate": 9.996470410904862e-06, + "loss": 0.4659, + "step": 1435 + }, + { + "epoch": 0.04, + "grad_norm": 1.8637562227327884, + "learning_rate": 9.996452742989085e-06, + "loss": 0.4848, + "step": 1436 + }, + { + "epoch": 0.04, + "grad_norm": 2.113928569551113, + "learning_rate": 9.996435030979556e-06, + "loss": 0.5031, + "step": 1437 + }, + { + "epoch": 0.04, + "grad_norm": 1.8333326030311328, + "learning_rate": 9.996417274876435e-06, + "loss": 0.4803, + "step": 1438 + }, + { + "epoch": 0.04, + "grad_norm": 2.2090275771415873, + "learning_rate": 9.996399474679875e-06, + "loss": 0.5122, + "step": 1439 + }, + { + "epoch": 0.04, + "grad_norm": 1.862865782816604, + "learning_rate": 9.996381630390033e-06, + "loss": 0.499, + "step": 1440 + }, + { + "epoch": 0.04, + "grad_norm": 2.3653806501182055, + "learning_rate": 9.996363742007069e-06, + "loss": 0.4734, + "step": 1441 + }, + { + "epoch": 0.04, + "grad_norm": 1.8450612997004334, + "learning_rate": 9.99634580953114e-06, + "loss": 0.4923, + "step": 1442 + }, + { + "epoch": 0.04, + "grad_norm": 2.451063040527425, + "learning_rate": 9.996327832962402e-06, + "loss": 0.5986, + "step": 1443 + }, + { + "epoch": 0.04, + "grad_norm": 1.9998175784122654, + "learning_rate": 9.996309812301018e-06, + "loss": 0.5022, + "step": 1444 + }, + { + "epoch": 0.04, + "grad_norm": 1.8856471476460268, + "learning_rate": 9.996291747547142e-06, + "loss": 0.4838, + "step": 1445 + }, + { + "epoch": 0.04, + "grad_norm": 1.848427487960376, + "learning_rate": 9.996273638700937e-06, + "loss": 0.5226, + "step": 1446 + }, + { + "epoch": 0.04, + "grad_norm": 1.904688333567376, + "learning_rate": 9.996255485762563e-06, + "loss": 0.5244, + "step": 1447 + }, + { + "epoch": 0.04, + "grad_norm": 2.0043326003993647, + "learning_rate": 9.996237288732179e-06, + "loss": 0.5366, + "step": 1448 + }, + { + "epoch": 0.04, + "grad_norm": 1.9721717957710638, + "learning_rate": 9.996219047609943e-06, + "loss": 0.5074, + "step": 1449 + }, + { + "epoch": 0.04, + "grad_norm": 2.269732879084015, + "learning_rate": 9.996200762396019e-06, + "loss": 0.4788, + "step": 1450 + }, + { + "epoch": 0.04, + "grad_norm": 1.8848716319708914, + "learning_rate": 9.996182433090568e-06, + "loss": 0.5115, + "step": 1451 + }, + { + "epoch": 0.04, + "grad_norm": 2.141053528353127, + "learning_rate": 9.996164059693753e-06, + "loss": 0.4761, + "step": 1452 + }, + { + "epoch": 0.04, + "grad_norm": 1.9856028979815061, + "learning_rate": 9.996145642205733e-06, + "loss": 0.4553, + "step": 1453 + }, + { + "epoch": 0.04, + "grad_norm": 2.470622025815937, + "learning_rate": 9.996127180626674e-06, + "loss": 0.4561, + "step": 1454 + }, + { + "epoch": 0.04, + "grad_norm": 3.2748849519021137, + "learning_rate": 9.996108674956735e-06, + "loss": 0.5432, + "step": 1455 + }, + { + "epoch": 0.04, + "grad_norm": 2.0784408556001495, + "learning_rate": 9.996090125196083e-06, + "loss": 0.4766, + "step": 1456 + }, + { + "epoch": 0.04, + "grad_norm": 2.222560310446972, + "learning_rate": 9.99607153134488e-06, + "loss": 0.532, + "step": 1457 + }, + { + "epoch": 0.04, + "grad_norm": 2.402851578593894, + "learning_rate": 9.996052893403291e-06, + "loss": 0.4921, + "step": 1458 + }, + { + "epoch": 0.04, + "grad_norm": 2.1027826048043816, + "learning_rate": 9.99603421137148e-06, + "loss": 0.5242, + "step": 1459 + }, + { + "epoch": 0.04, + "grad_norm": 2.030293839113943, + "learning_rate": 9.996015485249609e-06, + "loss": 0.49, + "step": 1460 + }, + { + "epoch": 0.04, + "grad_norm": 1.9238485485086287, + "learning_rate": 9.995996715037848e-06, + "loss": 0.5266, + "step": 1461 + }, + { + "epoch": 0.04, + "grad_norm": 2.000694294113113, + "learning_rate": 9.99597790073636e-06, + "loss": 0.5323, + "step": 1462 + }, + { + "epoch": 0.04, + "grad_norm": 1.9137072486027902, + "learning_rate": 9.995959042345312e-06, + "loss": 0.4753, + "step": 1463 + }, + { + "epoch": 0.04, + "grad_norm": 1.285891555709676, + "learning_rate": 9.99594013986487e-06, + "loss": 0.6168, + "step": 1464 + }, + { + "epoch": 0.04, + "grad_norm": 1.9929667381445673, + "learning_rate": 9.995921193295202e-06, + "loss": 0.4997, + "step": 1465 + }, + { + "epoch": 0.04, + "grad_norm": 2.5641623928394304, + "learning_rate": 9.995902202636472e-06, + "loss": 0.5247, + "step": 1466 + }, + { + "epoch": 0.04, + "grad_norm": 1.9756532365690134, + "learning_rate": 9.995883167888851e-06, + "loss": 0.5156, + "step": 1467 + }, + { + "epoch": 0.04, + "grad_norm": 2.3234785624972254, + "learning_rate": 9.995864089052504e-06, + "loss": 0.4878, + "step": 1468 + }, + { + "epoch": 0.04, + "grad_norm": 1.9084480002864426, + "learning_rate": 9.995844966127603e-06, + "loss": 0.4666, + "step": 1469 + }, + { + "epoch": 0.04, + "grad_norm": 1.9395282295900453, + "learning_rate": 9.995825799114313e-06, + "loss": 0.5134, + "step": 1470 + }, + { + "epoch": 0.04, + "grad_norm": 2.3969388458470955, + "learning_rate": 9.995806588012806e-06, + "loss": 0.515, + "step": 1471 + }, + { + "epoch": 0.04, + "grad_norm": 1.9482197103228291, + "learning_rate": 9.99578733282325e-06, + "loss": 0.5113, + "step": 1472 + }, + { + "epoch": 0.04, + "grad_norm": 2.0222414257856385, + "learning_rate": 9.995768033545816e-06, + "loss": 0.463, + "step": 1473 + }, + { + "epoch": 0.04, + "grad_norm": 2.0088584622181873, + "learning_rate": 9.995748690180673e-06, + "loss": 0.5031, + "step": 1474 + }, + { + "epoch": 0.04, + "grad_norm": 1.8906420919317777, + "learning_rate": 9.995729302727992e-06, + "loss": 0.4648, + "step": 1475 + }, + { + "epoch": 0.04, + "grad_norm": 1.9129140040798933, + "learning_rate": 9.995709871187946e-06, + "loss": 0.4874, + "step": 1476 + }, + { + "epoch": 0.04, + "grad_norm": 2.0348858759341186, + "learning_rate": 9.995690395560704e-06, + "loss": 0.4561, + "step": 1477 + }, + { + "epoch": 0.04, + "grad_norm": 1.8386552123105984, + "learning_rate": 9.99567087584644e-06, + "loss": 0.5189, + "step": 1478 + }, + { + "epoch": 0.04, + "grad_norm": 1.824022830133996, + "learning_rate": 9.995651312045322e-06, + "loss": 0.5194, + "step": 1479 + }, + { + "epoch": 0.04, + "grad_norm": 1.716537233908352, + "learning_rate": 9.99563170415753e-06, + "loss": 0.4844, + "step": 1480 + }, + { + "epoch": 0.04, + "grad_norm": 2.2156462427774755, + "learning_rate": 9.995612052183229e-06, + "loss": 0.4971, + "step": 1481 + }, + { + "epoch": 0.04, + "grad_norm": 1.7534304849113225, + "learning_rate": 9.995592356122598e-06, + "loss": 0.4501, + "step": 1482 + }, + { + "epoch": 0.04, + "grad_norm": 1.8730661187201723, + "learning_rate": 9.995572615975808e-06, + "loss": 0.4432, + "step": 1483 + }, + { + "epoch": 0.04, + "grad_norm": 2.6963890415905496, + "learning_rate": 9.995552831743037e-06, + "loss": 0.4943, + "step": 1484 + }, + { + "epoch": 0.04, + "grad_norm": 2.932543940201432, + "learning_rate": 9.995533003424454e-06, + "loss": 0.4524, + "step": 1485 + }, + { + "epoch": 0.04, + "grad_norm": 2.001901759291378, + "learning_rate": 9.995513131020237e-06, + "loss": 0.4878, + "step": 1486 + }, + { + "epoch": 0.04, + "grad_norm": 1.7841271298517245, + "learning_rate": 9.995493214530562e-06, + "loss": 0.5027, + "step": 1487 + }, + { + "epoch": 0.04, + "grad_norm": 2.1154631256509906, + "learning_rate": 9.995473253955603e-06, + "loss": 0.5299, + "step": 1488 + }, + { + "epoch": 0.04, + "grad_norm": 4.4741528276878375, + "learning_rate": 9.99545324929554e-06, + "loss": 0.4937, + "step": 1489 + }, + { + "epoch": 0.04, + "grad_norm": 1.908778481000744, + "learning_rate": 9.995433200550544e-06, + "loss": 0.5231, + "step": 1490 + }, + { + "epoch": 0.04, + "grad_norm": 1.6842505716227214, + "learning_rate": 9.995413107720795e-06, + "loss": 0.4822, + "step": 1491 + }, + { + "epoch": 0.04, + "grad_norm": 1.895156269202581, + "learning_rate": 9.99539297080647e-06, + "loss": 0.5604, + "step": 1492 + }, + { + "epoch": 0.04, + "grad_norm": 2.165103062610577, + "learning_rate": 9.995372789807746e-06, + "loss": 0.4981, + "step": 1493 + }, + { + "epoch": 0.04, + "grad_norm": 2.202369069397467, + "learning_rate": 9.995352564724802e-06, + "loss": 0.5072, + "step": 1494 + }, + { + "epoch": 0.04, + "grad_norm": 1.7997941042722192, + "learning_rate": 9.995332295557818e-06, + "loss": 0.5351, + "step": 1495 + }, + { + "epoch": 0.04, + "grad_norm": 1.8005176448058817, + "learning_rate": 9.99531198230697e-06, + "loss": 0.4872, + "step": 1496 + }, + { + "epoch": 0.04, + "grad_norm": 1.9002703403137178, + "learning_rate": 9.995291624972438e-06, + "loss": 0.5092, + "step": 1497 + }, + { + "epoch": 0.04, + "grad_norm": 2.301977110559295, + "learning_rate": 9.995271223554402e-06, + "loss": 0.5493, + "step": 1498 + }, + { + "epoch": 0.04, + "grad_norm": 1.7334363838910427, + "learning_rate": 9.995250778053041e-06, + "loss": 0.4883, + "step": 1499 + }, + { + "epoch": 0.04, + "grad_norm": 1.8890711549840795, + "learning_rate": 9.995230288468538e-06, + "loss": 0.5126, + "step": 1500 + }, + { + "epoch": 0.04, + "grad_norm": 2.4927339636490764, + "learning_rate": 9.995209754801071e-06, + "loss": 0.4772, + "step": 1501 + }, + { + "epoch": 0.04, + "grad_norm": 3.956834713230083, + "learning_rate": 9.995189177050825e-06, + "loss": 0.4785, + "step": 1502 + }, + { + "epoch": 0.04, + "grad_norm": 1.3223065951115327, + "learning_rate": 9.995168555217977e-06, + "loss": 0.6226, + "step": 1503 + }, + { + "epoch": 0.04, + "grad_norm": 1.8470942759454017, + "learning_rate": 9.995147889302712e-06, + "loss": 0.5187, + "step": 1504 + }, + { + "epoch": 0.04, + "grad_norm": 1.8660973688634583, + "learning_rate": 9.99512717930521e-06, + "loss": 0.4756, + "step": 1505 + }, + { + "epoch": 0.04, + "grad_norm": 3.0092752190702443, + "learning_rate": 9.995106425225657e-06, + "loss": 0.4741, + "step": 1506 + }, + { + "epoch": 0.04, + "grad_norm": 2.2059101368582095, + "learning_rate": 9.995085627064235e-06, + "loss": 0.5199, + "step": 1507 + }, + { + "epoch": 0.04, + "grad_norm": 2.011590620185854, + "learning_rate": 9.995064784821125e-06, + "loss": 0.4484, + "step": 1508 + }, + { + "epoch": 0.04, + "grad_norm": 1.872662337687947, + "learning_rate": 9.995043898496512e-06, + "loss": 0.4703, + "step": 1509 + }, + { + "epoch": 0.04, + "grad_norm": 2.9115220858858573, + "learning_rate": 9.995022968090583e-06, + "loss": 0.4814, + "step": 1510 + }, + { + "epoch": 0.04, + "grad_norm": 2.0608314095801066, + "learning_rate": 9.995001993603521e-06, + "loss": 0.5197, + "step": 1511 + }, + { + "epoch": 0.04, + "grad_norm": 1.9079517539047124, + "learning_rate": 9.994980975035511e-06, + "loss": 0.4706, + "step": 1512 + }, + { + "epoch": 0.04, + "grad_norm": 1.9770957842755081, + "learning_rate": 9.994959912386738e-06, + "loss": 0.5254, + "step": 1513 + }, + { + "epoch": 0.04, + "grad_norm": 2.353186361636589, + "learning_rate": 9.994938805657389e-06, + "loss": 0.4678, + "step": 1514 + }, + { + "epoch": 0.04, + "grad_norm": 1.878568801974294, + "learning_rate": 9.994917654847648e-06, + "loss": 0.4744, + "step": 1515 + }, + { + "epoch": 0.04, + "grad_norm": 1.329549239479055, + "learning_rate": 9.994896459957703e-06, + "loss": 0.5876, + "step": 1516 + }, + { + "epoch": 0.04, + "grad_norm": 1.8584389091145006, + "learning_rate": 9.994875220987743e-06, + "loss": 0.5132, + "step": 1517 + }, + { + "epoch": 0.04, + "grad_norm": 2.6077128318869516, + "learning_rate": 9.994853937937954e-06, + "loss": 0.4959, + "step": 1518 + }, + { + "epoch": 0.04, + "grad_norm": 1.7743873479426706, + "learning_rate": 9.994832610808522e-06, + "loss": 0.5032, + "step": 1519 + }, + { + "epoch": 0.04, + "grad_norm": 2.0987215145455584, + "learning_rate": 9.994811239599639e-06, + "loss": 0.5328, + "step": 1520 + }, + { + "epoch": 0.04, + "grad_norm": 1.8544447759414062, + "learning_rate": 9.994789824311491e-06, + "loss": 0.4732, + "step": 1521 + }, + { + "epoch": 0.04, + "grad_norm": 1.6869785448997714, + "learning_rate": 9.994768364944264e-06, + "loss": 0.483, + "step": 1522 + }, + { + "epoch": 0.04, + "grad_norm": 1.7614291425683766, + "learning_rate": 9.994746861498154e-06, + "loss": 0.562, + "step": 1523 + }, + { + "epoch": 0.04, + "grad_norm": 2.033474352239148, + "learning_rate": 9.994725313973348e-06, + "loss": 0.4855, + "step": 1524 + }, + { + "epoch": 0.04, + "grad_norm": 1.8338099529137242, + "learning_rate": 9.994703722370034e-06, + "loss": 0.4803, + "step": 1525 + }, + { + "epoch": 0.04, + "grad_norm": 1.1036296207160494, + "learning_rate": 9.994682086688405e-06, + "loss": 0.6354, + "step": 1526 + }, + { + "epoch": 0.04, + "grad_norm": 2.0766753909428206, + "learning_rate": 9.994660406928651e-06, + "loss": 0.5551, + "step": 1527 + }, + { + "epoch": 0.04, + "grad_norm": 1.8087307069901934, + "learning_rate": 9.994638683090962e-06, + "loss": 0.4862, + "step": 1528 + }, + { + "epoch": 0.04, + "grad_norm": 1.7576537966015164, + "learning_rate": 9.994616915175534e-06, + "loss": 0.4774, + "step": 1529 + }, + { + "epoch": 0.04, + "grad_norm": 1.8499349075263463, + "learning_rate": 9.994595103182556e-06, + "loss": 0.4724, + "step": 1530 + }, + { + "epoch": 0.04, + "grad_norm": 2.0991814915437965, + "learning_rate": 9.994573247112218e-06, + "loss": 0.5231, + "step": 1531 + }, + { + "epoch": 0.04, + "grad_norm": 1.9073531390262954, + "learning_rate": 9.994551346964717e-06, + "loss": 0.5206, + "step": 1532 + }, + { + "epoch": 0.04, + "grad_norm": 1.940660799948957, + "learning_rate": 9.994529402740248e-06, + "loss": 0.4823, + "step": 1533 + }, + { + "epoch": 0.04, + "grad_norm": 1.989373123769444, + "learning_rate": 9.994507414438997e-06, + "loss": 0.5242, + "step": 1534 + }, + { + "epoch": 0.04, + "grad_norm": 1.1963606597200145, + "learning_rate": 9.994485382061165e-06, + "loss": 0.5928, + "step": 1535 + }, + { + "epoch": 0.04, + "grad_norm": 1.9601846996206842, + "learning_rate": 9.994463305606944e-06, + "loss": 0.5442, + "step": 1536 + }, + { + "epoch": 0.04, + "grad_norm": 1.9314384002216658, + "learning_rate": 9.994441185076529e-06, + "loss": 0.4514, + "step": 1537 + }, + { + "epoch": 0.04, + "grad_norm": 1.9305515761005603, + "learning_rate": 9.994419020470114e-06, + "loss": 0.5126, + "step": 1538 + }, + { + "epoch": 0.04, + "grad_norm": 2.0050296155354577, + "learning_rate": 9.994396811787897e-06, + "loss": 0.497, + "step": 1539 + }, + { + "epoch": 0.04, + "grad_norm": 2.441179940589015, + "learning_rate": 9.994374559030074e-06, + "loss": 0.4772, + "step": 1540 + }, + { + "epoch": 0.04, + "grad_norm": 2.2553074873369794, + "learning_rate": 9.994352262196839e-06, + "loss": 0.5306, + "step": 1541 + }, + { + "epoch": 0.04, + "grad_norm": 1.911349670907581, + "learning_rate": 9.994329921288388e-06, + "loss": 0.4937, + "step": 1542 + }, + { + "epoch": 0.04, + "grad_norm": 1.7956023417899971, + "learning_rate": 9.994307536304922e-06, + "loss": 0.479, + "step": 1543 + }, + { + "epoch": 0.04, + "grad_norm": 2.268483963023476, + "learning_rate": 9.994285107246635e-06, + "loss": 0.4582, + "step": 1544 + }, + { + "epoch": 0.04, + "grad_norm": 2.006719298377162, + "learning_rate": 9.994262634113728e-06, + "loss": 0.4517, + "step": 1545 + }, + { + "epoch": 0.04, + "grad_norm": 1.818675814413188, + "learning_rate": 9.994240116906397e-06, + "loss": 0.507, + "step": 1546 + }, + { + "epoch": 0.04, + "grad_norm": 1.810957005564031, + "learning_rate": 9.994217555624841e-06, + "loss": 0.455, + "step": 1547 + }, + { + "epoch": 0.04, + "grad_norm": 2.166684157036481, + "learning_rate": 9.994194950269262e-06, + "loss": 0.4903, + "step": 1548 + }, + { + "epoch": 0.04, + "grad_norm": 2.1466000505589413, + "learning_rate": 9.994172300839855e-06, + "loss": 0.5174, + "step": 1549 + }, + { + "epoch": 0.04, + "grad_norm": 1.8284684193414078, + "learning_rate": 9.994149607336823e-06, + "loss": 0.4565, + "step": 1550 + }, + { + "epoch": 0.04, + "grad_norm": 1.9232034936082765, + "learning_rate": 9.994126869760365e-06, + "loss": 0.4855, + "step": 1551 + }, + { + "epoch": 0.05, + "grad_norm": 2.0240258469825325, + "learning_rate": 9.994104088110682e-06, + "loss": 0.542, + "step": 1552 + }, + { + "epoch": 0.05, + "grad_norm": 2.3197607074292805, + "learning_rate": 9.994081262387976e-06, + "loss": 0.4757, + "step": 1553 + }, + { + "epoch": 0.05, + "grad_norm": 2.0402552713780397, + "learning_rate": 9.994058392592446e-06, + "loss": 0.5093, + "step": 1554 + }, + { + "epoch": 0.05, + "grad_norm": 1.8445232633993185, + "learning_rate": 9.994035478724296e-06, + "loss": 0.4719, + "step": 1555 + }, + { + "epoch": 0.05, + "grad_norm": 1.7630101176280364, + "learning_rate": 9.994012520783727e-06, + "loss": 0.5192, + "step": 1556 + }, + { + "epoch": 0.05, + "grad_norm": 1.922985358398672, + "learning_rate": 9.993989518770942e-06, + "loss": 0.4879, + "step": 1557 + }, + { + "epoch": 0.05, + "grad_norm": 1.7921638056085816, + "learning_rate": 9.993966472686145e-06, + "loss": 0.4832, + "step": 1558 + }, + { + "epoch": 0.05, + "grad_norm": 1.7991550318765095, + "learning_rate": 9.993943382529539e-06, + "loss": 0.4626, + "step": 1559 + }, + { + "epoch": 0.05, + "grad_norm": 1.875916777933599, + "learning_rate": 9.993920248301326e-06, + "loss": 0.4984, + "step": 1560 + }, + { + "epoch": 0.05, + "grad_norm": 2.1399146081715084, + "learning_rate": 9.993897070001711e-06, + "loss": 0.5288, + "step": 1561 + }, + { + "epoch": 0.05, + "grad_norm": 2.2501928013698973, + "learning_rate": 9.993873847630899e-06, + "loss": 0.494, + "step": 1562 + }, + { + "epoch": 0.05, + "grad_norm": 1.319465008521282, + "learning_rate": 9.993850581189095e-06, + "loss": 0.6432, + "step": 1563 + }, + { + "epoch": 0.05, + "grad_norm": 1.9172066350461292, + "learning_rate": 9.993827270676507e-06, + "loss": 0.5101, + "step": 1564 + }, + { + "epoch": 0.05, + "grad_norm": 1.992318920516795, + "learning_rate": 9.993803916093335e-06, + "loss": 0.5123, + "step": 1565 + }, + { + "epoch": 0.05, + "grad_norm": 1.9687751495247248, + "learning_rate": 9.993780517439789e-06, + "loss": 0.5103, + "step": 1566 + }, + { + "epoch": 0.05, + "grad_norm": 2.013628055033968, + "learning_rate": 9.993757074716073e-06, + "loss": 0.4798, + "step": 1567 + }, + { + "epoch": 0.05, + "grad_norm": 2.098085446151014, + "learning_rate": 9.993733587922397e-06, + "loss": 0.5304, + "step": 1568 + }, + { + "epoch": 0.05, + "grad_norm": 1.8763665337201527, + "learning_rate": 9.993710057058966e-06, + "loss": 0.4628, + "step": 1569 + }, + { + "epoch": 0.05, + "grad_norm": 1.8038308100571565, + "learning_rate": 9.993686482125989e-06, + "loss": 0.465, + "step": 1570 + }, + { + "epoch": 0.05, + "grad_norm": 2.412795717897965, + "learning_rate": 9.993662863123673e-06, + "loss": 0.4583, + "step": 1571 + }, + { + "epoch": 0.05, + "grad_norm": 1.9595099399745473, + "learning_rate": 9.993639200052228e-06, + "loss": 0.5043, + "step": 1572 + }, + { + "epoch": 0.05, + "grad_norm": 2.627645157109467, + "learning_rate": 9.993615492911858e-06, + "loss": 0.4595, + "step": 1573 + }, + { + "epoch": 0.05, + "grad_norm": 2.094334085690106, + "learning_rate": 9.99359174170278e-06, + "loss": 0.4899, + "step": 1574 + }, + { + "epoch": 0.05, + "grad_norm": 1.9161488594802236, + "learning_rate": 9.993567946425198e-06, + "loss": 0.4956, + "step": 1575 + }, + { + "epoch": 0.05, + "grad_norm": 2.3492209815420195, + "learning_rate": 9.993544107079324e-06, + "loss": 0.499, + "step": 1576 + }, + { + "epoch": 0.05, + "grad_norm": 2.162168579118214, + "learning_rate": 9.993520223665367e-06, + "loss": 0.4741, + "step": 1577 + }, + { + "epoch": 0.05, + "grad_norm": 1.9353792713615667, + "learning_rate": 9.99349629618354e-06, + "loss": 0.5191, + "step": 1578 + }, + { + "epoch": 0.05, + "grad_norm": 2.212071987961546, + "learning_rate": 9.99347232463405e-06, + "loss": 0.5103, + "step": 1579 + }, + { + "epoch": 0.05, + "grad_norm": 1.7458878684686698, + "learning_rate": 9.993448309017112e-06, + "loss": 0.4703, + "step": 1580 + }, + { + "epoch": 0.05, + "grad_norm": 2.000935781874721, + "learning_rate": 9.99342424933294e-06, + "loss": 0.4671, + "step": 1581 + }, + { + "epoch": 0.05, + "grad_norm": 2.0927188380736244, + "learning_rate": 9.993400145581741e-06, + "loss": 0.5196, + "step": 1582 + }, + { + "epoch": 0.05, + "grad_norm": 2.3043860966107435, + "learning_rate": 9.993375997763732e-06, + "loss": 0.5124, + "step": 1583 + }, + { + "epoch": 0.05, + "grad_norm": 2.918680155874492, + "learning_rate": 9.993351805879124e-06, + "loss": 0.4597, + "step": 1584 + }, + { + "epoch": 0.05, + "grad_norm": 3.023472410598688, + "learning_rate": 9.993327569928131e-06, + "loss": 0.5261, + "step": 1585 + }, + { + "epoch": 0.05, + "grad_norm": 1.8510509866006617, + "learning_rate": 9.993303289910966e-06, + "loss": 0.5315, + "step": 1586 + }, + { + "epoch": 0.05, + "grad_norm": 1.9172750544533748, + "learning_rate": 9.993278965827844e-06, + "loss": 0.4624, + "step": 1587 + }, + { + "epoch": 0.05, + "grad_norm": 2.0041529571812853, + "learning_rate": 9.99325459767898e-06, + "loss": 0.4812, + "step": 1588 + }, + { + "epoch": 0.05, + "grad_norm": 1.4179527349127496, + "learning_rate": 9.993230185464589e-06, + "loss": 0.6039, + "step": 1589 + }, + { + "epoch": 0.05, + "grad_norm": 1.8804172696919095, + "learning_rate": 9.993205729184887e-06, + "loss": 0.4493, + "step": 1590 + }, + { + "epoch": 0.05, + "grad_norm": 1.9388428291036246, + "learning_rate": 9.993181228840087e-06, + "loss": 0.5339, + "step": 1591 + }, + { + "epoch": 0.05, + "grad_norm": 1.9566983468554835, + "learning_rate": 9.99315668443041e-06, + "loss": 0.5191, + "step": 1592 + }, + { + "epoch": 0.05, + "grad_norm": 1.86036720173394, + "learning_rate": 9.993132095956068e-06, + "loss": 0.4692, + "step": 1593 + }, + { + "epoch": 0.05, + "grad_norm": 1.8983914670603939, + "learning_rate": 9.99310746341728e-06, + "loss": 0.4904, + "step": 1594 + }, + { + "epoch": 0.05, + "grad_norm": 2.152817251361655, + "learning_rate": 9.993082786814265e-06, + "loss": 0.4901, + "step": 1595 + }, + { + "epoch": 0.05, + "grad_norm": 2.3952505052637703, + "learning_rate": 9.993058066147237e-06, + "loss": 0.4919, + "step": 1596 + }, + { + "epoch": 0.05, + "grad_norm": 1.915255904358426, + "learning_rate": 9.993033301416418e-06, + "loss": 0.4941, + "step": 1597 + }, + { + "epoch": 0.05, + "grad_norm": 1.8895366277897256, + "learning_rate": 9.993008492622022e-06, + "loss": 0.5, + "step": 1598 + }, + { + "epoch": 0.05, + "grad_norm": 1.706963192267258, + "learning_rate": 9.992983639764272e-06, + "loss": 0.4936, + "step": 1599 + }, + { + "epoch": 0.05, + "grad_norm": 1.8036404065918394, + "learning_rate": 9.992958742843386e-06, + "loss": 0.4858, + "step": 1600 + }, + { + "epoch": 0.05, + "grad_norm": 1.9453602578119522, + "learning_rate": 9.992933801859583e-06, + "loss": 0.5214, + "step": 1601 + }, + { + "epoch": 0.05, + "grad_norm": 1.8741503851218742, + "learning_rate": 9.992908816813084e-06, + "loss": 0.5256, + "step": 1602 + }, + { + "epoch": 0.05, + "grad_norm": 2.715047249387063, + "learning_rate": 9.99288378770411e-06, + "loss": 0.5255, + "step": 1603 + }, + { + "epoch": 0.05, + "grad_norm": 2.116636237981968, + "learning_rate": 9.992858714532883e-06, + "loss": 0.4565, + "step": 1604 + }, + { + "epoch": 0.05, + "grad_norm": 2.2239616009478387, + "learning_rate": 9.99283359729962e-06, + "loss": 0.4982, + "step": 1605 + }, + { + "epoch": 0.05, + "grad_norm": 7.056154062952148, + "learning_rate": 9.992808436004547e-06, + "loss": 0.4879, + "step": 1606 + }, + { + "epoch": 0.05, + "grad_norm": 1.9431632087953843, + "learning_rate": 9.992783230647882e-06, + "loss": 0.4882, + "step": 1607 + }, + { + "epoch": 0.05, + "grad_norm": 1.8508018955080585, + "learning_rate": 9.99275798122985e-06, + "loss": 0.5168, + "step": 1608 + }, + { + "epoch": 0.05, + "grad_norm": 1.9278193449442909, + "learning_rate": 9.992732687750674e-06, + "loss": 0.4759, + "step": 1609 + }, + { + "epoch": 0.05, + "grad_norm": 1.8685799475629288, + "learning_rate": 9.992707350210577e-06, + "loss": 0.5435, + "step": 1610 + }, + { + "epoch": 0.05, + "grad_norm": 1.3722262698029932, + "learning_rate": 9.992681968609781e-06, + "loss": 0.6067, + "step": 1611 + }, + { + "epoch": 0.05, + "grad_norm": 2.0562725093946472, + "learning_rate": 9.992656542948512e-06, + "loss": 0.4319, + "step": 1612 + }, + { + "epoch": 0.05, + "grad_norm": 1.9476124046655108, + "learning_rate": 9.992631073226994e-06, + "loss": 0.4866, + "step": 1613 + }, + { + "epoch": 0.05, + "grad_norm": 1.8787311372493638, + "learning_rate": 9.992605559445451e-06, + "loss": 0.5155, + "step": 1614 + }, + { + "epoch": 0.05, + "grad_norm": 1.8086289679811745, + "learning_rate": 9.992580001604108e-06, + "loss": 0.4834, + "step": 1615 + }, + { + "epoch": 0.05, + "grad_norm": 2.2413691250374743, + "learning_rate": 9.992554399703192e-06, + "loss": 0.521, + "step": 1616 + }, + { + "epoch": 0.05, + "grad_norm": 2.012825867432741, + "learning_rate": 9.992528753742927e-06, + "loss": 0.4448, + "step": 1617 + }, + { + "epoch": 0.05, + "grad_norm": 2.0216595307992296, + "learning_rate": 9.992503063723542e-06, + "loss": 0.4885, + "step": 1618 + }, + { + "epoch": 0.05, + "grad_norm": 2.041742711373071, + "learning_rate": 9.99247732964526e-06, + "loss": 0.5287, + "step": 1619 + }, + { + "epoch": 0.05, + "grad_norm": 2.129927411208594, + "learning_rate": 9.992451551508313e-06, + "loss": 0.5246, + "step": 1620 + }, + { + "epoch": 0.05, + "grad_norm": 1.9669293428007142, + "learning_rate": 9.992425729312923e-06, + "loss": 0.4527, + "step": 1621 + }, + { + "epoch": 0.05, + "grad_norm": 1.9577640255986355, + "learning_rate": 9.99239986305932e-06, + "loss": 0.5055, + "step": 1622 + }, + { + "epoch": 0.05, + "grad_norm": 2.0255503365514502, + "learning_rate": 9.992373952747737e-06, + "loss": 0.5171, + "step": 1623 + }, + { + "epoch": 0.05, + "grad_norm": 2.014254806193443, + "learning_rate": 9.992347998378395e-06, + "loss": 0.4782, + "step": 1624 + }, + { + "epoch": 0.05, + "grad_norm": 3.080946055690579, + "learning_rate": 9.992321999951527e-06, + "loss": 0.4775, + "step": 1625 + }, + { + "epoch": 0.05, + "grad_norm": 2.0500614234098897, + "learning_rate": 9.992295957467364e-06, + "loss": 0.441, + "step": 1626 + }, + { + "epoch": 0.05, + "grad_norm": 1.6545676397708269, + "learning_rate": 9.992269870926132e-06, + "loss": 0.4994, + "step": 1627 + }, + { + "epoch": 0.05, + "grad_norm": 2.1430761600037997, + "learning_rate": 9.992243740328063e-06, + "loss": 0.4533, + "step": 1628 + }, + { + "epoch": 0.05, + "grad_norm": 1.8729702149954042, + "learning_rate": 9.992217565673389e-06, + "loss": 0.5002, + "step": 1629 + }, + { + "epoch": 0.05, + "grad_norm": 1.9590107811440605, + "learning_rate": 9.992191346962337e-06, + "loss": 0.5222, + "step": 1630 + }, + { + "epoch": 0.05, + "grad_norm": 1.7965367183807055, + "learning_rate": 9.992165084195143e-06, + "loss": 0.5001, + "step": 1631 + }, + { + "epoch": 0.05, + "grad_norm": 1.955354919058818, + "learning_rate": 9.992138777372035e-06, + "loss": 0.4823, + "step": 1632 + }, + { + "epoch": 0.05, + "grad_norm": 1.7961058855904501, + "learning_rate": 9.992112426493247e-06, + "loss": 0.5001, + "step": 1633 + }, + { + "epoch": 0.05, + "grad_norm": 1.6982865040350055, + "learning_rate": 9.992086031559013e-06, + "loss": 0.4578, + "step": 1634 + }, + { + "epoch": 0.05, + "grad_norm": 1.9851508166337462, + "learning_rate": 9.992059592569566e-06, + "loss": 0.5326, + "step": 1635 + }, + { + "epoch": 0.05, + "grad_norm": 2.0403001390119293, + "learning_rate": 9.992033109525135e-06, + "loss": 0.4866, + "step": 1636 + }, + { + "epoch": 0.05, + "grad_norm": 2.150594230471658, + "learning_rate": 9.992006582425957e-06, + "loss": 0.5127, + "step": 1637 + }, + { + "epoch": 0.05, + "grad_norm": 2.25662217045318, + "learning_rate": 9.991980011272264e-06, + "loss": 0.4782, + "step": 1638 + }, + { + "epoch": 0.05, + "grad_norm": 1.8587266302577647, + "learning_rate": 9.991953396064293e-06, + "loss": 0.5008, + "step": 1639 + }, + { + "epoch": 0.05, + "grad_norm": 1.9481821101172212, + "learning_rate": 9.99192673680228e-06, + "loss": 0.4663, + "step": 1640 + }, + { + "epoch": 0.05, + "grad_norm": 1.8509265752724995, + "learning_rate": 9.991900033486457e-06, + "loss": 0.5302, + "step": 1641 + }, + { + "epoch": 0.05, + "grad_norm": 1.750386720967024, + "learning_rate": 9.99187328611706e-06, + "loss": 0.4571, + "step": 1642 + }, + { + "epoch": 0.05, + "grad_norm": 2.158756302598063, + "learning_rate": 9.991846494694327e-06, + "loss": 0.5236, + "step": 1643 + }, + { + "epoch": 0.05, + "grad_norm": 1.9662691890876873, + "learning_rate": 9.99181965921849e-06, + "loss": 0.5136, + "step": 1644 + }, + { + "epoch": 0.05, + "grad_norm": 1.9785877894587671, + "learning_rate": 9.991792779689792e-06, + "loss": 0.4663, + "step": 1645 + }, + { + "epoch": 0.05, + "grad_norm": 1.815918400671707, + "learning_rate": 9.991765856108469e-06, + "loss": 0.4737, + "step": 1646 + }, + { + "epoch": 0.05, + "grad_norm": 1.9782988180738024, + "learning_rate": 9.991738888474754e-06, + "loss": 0.479, + "step": 1647 + }, + { + "epoch": 0.05, + "grad_norm": 2.2083345299844623, + "learning_rate": 9.991711876788889e-06, + "loss": 0.4933, + "step": 1648 + }, + { + "epoch": 0.05, + "grad_norm": 2.009950679909498, + "learning_rate": 9.991684821051112e-06, + "loss": 0.4703, + "step": 1649 + }, + { + "epoch": 0.05, + "grad_norm": 2.201121881119306, + "learning_rate": 9.99165772126166e-06, + "loss": 0.4477, + "step": 1650 + }, + { + "epoch": 0.05, + "grad_norm": 2.017080684845592, + "learning_rate": 9.991630577420774e-06, + "loss": 0.4725, + "step": 1651 + }, + { + "epoch": 0.05, + "grad_norm": 2.010520932791612, + "learning_rate": 9.991603389528693e-06, + "loss": 0.5032, + "step": 1652 + }, + { + "epoch": 0.05, + "grad_norm": 1.993545382734086, + "learning_rate": 9.991576157585656e-06, + "loss": 0.4974, + "step": 1653 + }, + { + "epoch": 0.05, + "grad_norm": 1.8328074894518458, + "learning_rate": 9.991548881591904e-06, + "loss": 0.4489, + "step": 1654 + }, + { + "epoch": 0.05, + "grad_norm": 3.3448931577681438, + "learning_rate": 9.991521561547679e-06, + "loss": 0.5147, + "step": 1655 + }, + { + "epoch": 0.05, + "grad_norm": 1.878805431765491, + "learning_rate": 9.991494197453219e-06, + "loss": 0.4729, + "step": 1656 + }, + { + "epoch": 0.05, + "grad_norm": 2.015300952202089, + "learning_rate": 9.991466789308768e-06, + "loss": 0.4797, + "step": 1657 + }, + { + "epoch": 0.05, + "grad_norm": 2.2353348885385462, + "learning_rate": 9.991439337114567e-06, + "loss": 0.4829, + "step": 1658 + }, + { + "epoch": 0.05, + "grad_norm": 2.0694063447870277, + "learning_rate": 9.99141184087086e-06, + "loss": 0.4958, + "step": 1659 + }, + { + "epoch": 0.05, + "grad_norm": 1.7837020618551354, + "learning_rate": 9.991384300577885e-06, + "loss": 0.4809, + "step": 1660 + }, + { + "epoch": 0.05, + "grad_norm": 2.3108550695170256, + "learning_rate": 9.991356716235892e-06, + "loss": 0.5411, + "step": 1661 + }, + { + "epoch": 0.05, + "grad_norm": 1.9773358049304712, + "learning_rate": 9.991329087845117e-06, + "loss": 0.4383, + "step": 1662 + }, + { + "epoch": 0.05, + "grad_norm": 1.875879441802633, + "learning_rate": 9.99130141540581e-06, + "loss": 0.5385, + "step": 1663 + }, + { + "epoch": 0.05, + "grad_norm": 2.309234603287908, + "learning_rate": 9.991273698918211e-06, + "loss": 0.4903, + "step": 1664 + }, + { + "epoch": 0.05, + "grad_norm": 1.8431194641905495, + "learning_rate": 9.991245938382567e-06, + "loss": 0.4877, + "step": 1665 + }, + { + "epoch": 0.05, + "grad_norm": 2.1375591902069613, + "learning_rate": 9.991218133799123e-06, + "loss": 0.5099, + "step": 1666 + }, + { + "epoch": 0.05, + "grad_norm": 1.8622538516469247, + "learning_rate": 9.991190285168122e-06, + "loss": 0.4989, + "step": 1667 + }, + { + "epoch": 0.05, + "grad_norm": 1.2473314058567422, + "learning_rate": 9.991162392489812e-06, + "loss": 0.541, + "step": 1668 + }, + { + "epoch": 0.05, + "grad_norm": 2.0197155190031375, + "learning_rate": 9.991134455764439e-06, + "loss": 0.4391, + "step": 1669 + }, + { + "epoch": 0.05, + "grad_norm": 1.939436845528641, + "learning_rate": 9.99110647499225e-06, + "loss": 0.4675, + "step": 1670 + }, + { + "epoch": 0.05, + "grad_norm": 1.6947253226331604, + "learning_rate": 9.991078450173488e-06, + "loss": 0.4914, + "step": 1671 + }, + { + "epoch": 0.05, + "grad_norm": 1.8531143049285304, + "learning_rate": 9.991050381308407e-06, + "loss": 0.461, + "step": 1672 + }, + { + "epoch": 0.05, + "grad_norm": 1.8257326670888725, + "learning_rate": 9.991022268397248e-06, + "loss": 0.4854, + "step": 1673 + }, + { + "epoch": 0.05, + "grad_norm": 2.8104680378041187, + "learning_rate": 9.990994111440262e-06, + "loss": 0.4757, + "step": 1674 + }, + { + "epoch": 0.05, + "grad_norm": 2.0940570269853542, + "learning_rate": 9.990965910437699e-06, + "loss": 0.5151, + "step": 1675 + }, + { + "epoch": 0.05, + "grad_norm": 1.9393727281501048, + "learning_rate": 9.990937665389807e-06, + "loss": 0.5907, + "step": 1676 + }, + { + "epoch": 0.05, + "grad_norm": 1.9656889238862152, + "learning_rate": 9.990909376296834e-06, + "loss": 0.5015, + "step": 1677 + }, + { + "epoch": 0.05, + "grad_norm": 1.9803619983453735, + "learning_rate": 9.990881043159032e-06, + "loss": 0.4451, + "step": 1678 + }, + { + "epoch": 0.05, + "grad_norm": 1.8254622983788562, + "learning_rate": 9.990852665976648e-06, + "loss": 0.4888, + "step": 1679 + }, + { + "epoch": 0.05, + "grad_norm": 1.697054213486508, + "learning_rate": 9.990824244749933e-06, + "loss": 0.4635, + "step": 1680 + }, + { + "epoch": 0.05, + "grad_norm": 2.305782350434265, + "learning_rate": 9.99079577947914e-06, + "loss": 0.5068, + "step": 1681 + }, + { + "epoch": 0.05, + "grad_norm": 2.0275591229685785, + "learning_rate": 9.990767270164517e-06, + "loss": 0.4919, + "step": 1682 + }, + { + "epoch": 0.05, + "grad_norm": 1.8122077290060117, + "learning_rate": 9.990738716806319e-06, + "loss": 0.5207, + "step": 1683 + }, + { + "epoch": 0.05, + "grad_norm": 1.803152606151325, + "learning_rate": 9.990710119404796e-06, + "loss": 0.456, + "step": 1684 + }, + { + "epoch": 0.05, + "grad_norm": 2.210228188247733, + "learning_rate": 9.9906814779602e-06, + "loss": 0.5467, + "step": 1685 + }, + { + "epoch": 0.05, + "grad_norm": 2.0699027635667973, + "learning_rate": 9.990652792472786e-06, + "loss": 0.4916, + "step": 1686 + }, + { + "epoch": 0.05, + "grad_norm": 1.950489032334518, + "learning_rate": 9.990624062942806e-06, + "loss": 0.4956, + "step": 1687 + }, + { + "epoch": 0.05, + "grad_norm": 1.6887348113367326, + "learning_rate": 9.990595289370512e-06, + "loss": 0.4751, + "step": 1688 + }, + { + "epoch": 0.05, + "grad_norm": 1.7678729167547536, + "learning_rate": 9.99056647175616e-06, + "loss": 0.4787, + "step": 1689 + }, + { + "epoch": 0.05, + "grad_norm": 1.9791085657679965, + "learning_rate": 9.990537610100003e-06, + "loss": 0.5411, + "step": 1690 + }, + { + "epoch": 0.05, + "grad_norm": 1.7524038599656915, + "learning_rate": 9.990508704402298e-06, + "loss": 0.4622, + "step": 1691 + }, + { + "epoch": 0.05, + "grad_norm": 2.0585289984603454, + "learning_rate": 9.990479754663296e-06, + "loss": 0.5289, + "step": 1692 + }, + { + "epoch": 0.05, + "grad_norm": 1.8008117013594838, + "learning_rate": 9.990450760883256e-06, + "loss": 0.4774, + "step": 1693 + }, + { + "epoch": 0.05, + "grad_norm": 2.2258881004731124, + "learning_rate": 9.990421723062432e-06, + "loss": 0.4832, + "step": 1694 + }, + { + "epoch": 0.05, + "grad_norm": 2.0711260450685556, + "learning_rate": 9.990392641201081e-06, + "loss": 0.4957, + "step": 1695 + }, + { + "epoch": 0.05, + "grad_norm": 2.20752861976997, + "learning_rate": 9.99036351529946e-06, + "loss": 0.4912, + "step": 1696 + }, + { + "epoch": 0.05, + "grad_norm": 3.0113485684314183, + "learning_rate": 9.990334345357826e-06, + "loss": 0.4718, + "step": 1697 + }, + { + "epoch": 0.05, + "grad_norm": 2.1780101385545043, + "learning_rate": 9.990305131376435e-06, + "loss": 0.4544, + "step": 1698 + }, + { + "epoch": 0.05, + "grad_norm": 1.781786269044877, + "learning_rate": 9.990275873355546e-06, + "loss": 0.4554, + "step": 1699 + }, + { + "epoch": 0.05, + "grad_norm": 1.8579123118460097, + "learning_rate": 9.990246571295419e-06, + "loss": 0.4533, + "step": 1700 + }, + { + "epoch": 0.05, + "grad_norm": 1.8798583485678897, + "learning_rate": 9.990217225196307e-06, + "loss": 0.4904, + "step": 1701 + }, + { + "epoch": 0.05, + "grad_norm": 1.9698910441055577, + "learning_rate": 9.990187835058475e-06, + "loss": 0.5175, + "step": 1702 + }, + { + "epoch": 0.05, + "grad_norm": 1.7209981489536856, + "learning_rate": 9.990158400882181e-06, + "loss": 0.494, + "step": 1703 + }, + { + "epoch": 0.05, + "grad_norm": 1.9171131003548516, + "learning_rate": 9.990128922667683e-06, + "loss": 0.4847, + "step": 1704 + }, + { + "epoch": 0.05, + "grad_norm": 1.790856341281311, + "learning_rate": 9.99009940041524e-06, + "loss": 0.5011, + "step": 1705 + }, + { + "epoch": 0.05, + "grad_norm": 1.7040547404316682, + "learning_rate": 9.990069834125117e-06, + "loss": 0.4467, + "step": 1706 + }, + { + "epoch": 0.05, + "grad_norm": 2.3849162912126056, + "learning_rate": 9.99004022379757e-06, + "loss": 0.4838, + "step": 1707 + }, + { + "epoch": 0.05, + "grad_norm": 1.7651107991987274, + "learning_rate": 9.990010569432864e-06, + "loss": 0.4776, + "step": 1708 + }, + { + "epoch": 0.05, + "grad_norm": 1.5249074830909204, + "learning_rate": 9.98998087103126e-06, + "loss": 0.5816, + "step": 1709 + }, + { + "epoch": 0.05, + "grad_norm": 1.9834517557961755, + "learning_rate": 9.989951128593019e-06, + "loss": 0.473, + "step": 1710 + }, + { + "epoch": 0.05, + "grad_norm": 2.1181706562347027, + "learning_rate": 9.989921342118403e-06, + "loss": 0.5069, + "step": 1711 + }, + { + "epoch": 0.05, + "grad_norm": 2.254354920238539, + "learning_rate": 9.989891511607676e-06, + "loss": 0.4329, + "step": 1712 + }, + { + "epoch": 0.05, + "grad_norm": 1.8797136200876157, + "learning_rate": 9.989861637061102e-06, + "loss": 0.4736, + "step": 1713 + }, + { + "epoch": 0.05, + "grad_norm": 1.9486474924044814, + "learning_rate": 9.989831718478943e-06, + "loss": 0.5422, + "step": 1714 + }, + { + "epoch": 0.05, + "grad_norm": 12.825538346014728, + "learning_rate": 9.989801755861463e-06, + "loss": 0.4804, + "step": 1715 + }, + { + "epoch": 0.05, + "grad_norm": 1.7856465673600006, + "learning_rate": 9.98977174920893e-06, + "loss": 0.4838, + "step": 1716 + }, + { + "epoch": 0.05, + "grad_norm": 1.7243213323382451, + "learning_rate": 9.989741698521603e-06, + "loss": 0.4877, + "step": 1717 + }, + { + "epoch": 0.05, + "grad_norm": 1.6933441574472496, + "learning_rate": 9.989711603799753e-06, + "loss": 0.4635, + "step": 1718 + }, + { + "epoch": 0.05, + "grad_norm": 1.8679279157933661, + "learning_rate": 9.98968146504364e-06, + "loss": 0.5137, + "step": 1719 + }, + { + "epoch": 0.05, + "grad_norm": 2.648611161082564, + "learning_rate": 9.989651282253533e-06, + "loss": 0.5249, + "step": 1720 + }, + { + "epoch": 0.05, + "grad_norm": 2.407440357249139, + "learning_rate": 9.9896210554297e-06, + "loss": 0.4663, + "step": 1721 + }, + { + "epoch": 0.05, + "grad_norm": 2.4601173777675003, + "learning_rate": 9.989590784572403e-06, + "loss": 0.468, + "step": 1722 + }, + { + "epoch": 0.05, + "grad_norm": 1.8375830899337322, + "learning_rate": 9.989560469681915e-06, + "loss": 0.4931, + "step": 1723 + }, + { + "epoch": 0.05, + "grad_norm": 4.459511232147399, + "learning_rate": 9.9895301107585e-06, + "loss": 0.5046, + "step": 1724 + }, + { + "epoch": 0.05, + "grad_norm": 1.884297993779501, + "learning_rate": 9.989499707802424e-06, + "loss": 0.4312, + "step": 1725 + }, + { + "epoch": 0.05, + "grad_norm": 2.0345207731399095, + "learning_rate": 9.989469260813957e-06, + "loss": 0.492, + "step": 1726 + }, + { + "epoch": 0.05, + "grad_norm": 1.3587928163854321, + "learning_rate": 9.989438769793372e-06, + "loss": 0.6398, + "step": 1727 + }, + { + "epoch": 0.05, + "grad_norm": 1.791644869044641, + "learning_rate": 9.989408234740934e-06, + "loss": 0.4797, + "step": 1728 + }, + { + "epoch": 0.05, + "grad_norm": 1.9855430711351851, + "learning_rate": 9.989377655656912e-06, + "loss": 0.4767, + "step": 1729 + }, + { + "epoch": 0.05, + "grad_norm": 1.8756984238289633, + "learning_rate": 9.989347032541576e-06, + "loss": 0.5042, + "step": 1730 + }, + { + "epoch": 0.05, + "grad_norm": 1.9224330631455022, + "learning_rate": 9.989316365395199e-06, + "loss": 0.5197, + "step": 1731 + }, + { + "epoch": 0.05, + "grad_norm": 1.8433459144694468, + "learning_rate": 9.989285654218048e-06, + "loss": 0.5134, + "step": 1732 + }, + { + "epoch": 0.05, + "grad_norm": 2.2069446109370956, + "learning_rate": 9.989254899010396e-06, + "loss": 0.5006, + "step": 1733 + }, + { + "epoch": 0.05, + "grad_norm": 1.6555099789164378, + "learning_rate": 9.989224099772514e-06, + "loss": 0.489, + "step": 1734 + }, + { + "epoch": 0.05, + "grad_norm": 1.7774691786932288, + "learning_rate": 9.989193256504675e-06, + "loss": 0.4737, + "step": 1735 + }, + { + "epoch": 0.05, + "grad_norm": 2.053281573228551, + "learning_rate": 9.98916236920715e-06, + "loss": 0.5192, + "step": 1736 + }, + { + "epoch": 0.05, + "grad_norm": 1.8671204301261815, + "learning_rate": 9.989131437880211e-06, + "loss": 0.5213, + "step": 1737 + }, + { + "epoch": 0.05, + "grad_norm": 1.7491284016805648, + "learning_rate": 9.989100462524132e-06, + "loss": 0.5256, + "step": 1738 + }, + { + "epoch": 0.05, + "grad_norm": 1.660503001570367, + "learning_rate": 9.989069443139187e-06, + "loss": 0.5186, + "step": 1739 + }, + { + "epoch": 0.05, + "grad_norm": 2.499108187377407, + "learning_rate": 9.989038379725649e-06, + "loss": 0.5009, + "step": 1740 + }, + { + "epoch": 0.05, + "grad_norm": 1.8786748930981125, + "learning_rate": 9.98900727228379e-06, + "loss": 0.4721, + "step": 1741 + }, + { + "epoch": 0.05, + "grad_norm": 1.6566367823871688, + "learning_rate": 9.98897612081389e-06, + "loss": 0.4533, + "step": 1742 + }, + { + "epoch": 0.05, + "grad_norm": 1.6742243789032731, + "learning_rate": 9.988944925316216e-06, + "loss": 0.4666, + "step": 1743 + }, + { + "epoch": 0.05, + "grad_norm": 1.8673488078863278, + "learning_rate": 9.988913685791051e-06, + "loss": 0.4461, + "step": 1744 + }, + { + "epoch": 0.05, + "grad_norm": 2.0266246295529546, + "learning_rate": 9.988882402238665e-06, + "loss": 0.4864, + "step": 1745 + }, + { + "epoch": 0.05, + "grad_norm": 1.8449059442281968, + "learning_rate": 9.988851074659338e-06, + "loss": 0.4726, + "step": 1746 + }, + { + "epoch": 0.05, + "grad_norm": 1.863605660252147, + "learning_rate": 9.988819703053344e-06, + "loss": 0.519, + "step": 1747 + }, + { + "epoch": 0.05, + "grad_norm": 1.962820577285024, + "learning_rate": 9.988788287420961e-06, + "loss": 0.4867, + "step": 1748 + }, + { + "epoch": 0.05, + "grad_norm": 1.7970642037305276, + "learning_rate": 9.988756827762466e-06, + "loss": 0.4686, + "step": 1749 + }, + { + "epoch": 0.05, + "grad_norm": 1.8523307313775987, + "learning_rate": 9.988725324078136e-06, + "loss": 0.4362, + "step": 1750 + }, + { + "epoch": 0.05, + "grad_norm": 1.8131130369625987, + "learning_rate": 9.988693776368252e-06, + "loss": 0.5088, + "step": 1751 + }, + { + "epoch": 0.05, + "grad_norm": 2.7951533379771973, + "learning_rate": 9.988662184633088e-06, + "loss": 0.4583, + "step": 1752 + }, + { + "epoch": 0.05, + "grad_norm": 1.7461050588076823, + "learning_rate": 9.988630548872927e-06, + "loss": 0.4407, + "step": 1753 + }, + { + "epoch": 0.05, + "grad_norm": 2.6823746741844827, + "learning_rate": 9.988598869088044e-06, + "loss": 0.472, + "step": 1754 + }, + { + "epoch": 0.05, + "grad_norm": 2.2489737614505687, + "learning_rate": 9.98856714527872e-06, + "loss": 0.4764, + "step": 1755 + }, + { + "epoch": 0.05, + "grad_norm": 3.7527428492063617, + "learning_rate": 9.988535377445237e-06, + "loss": 0.4881, + "step": 1756 + }, + { + "epoch": 0.05, + "grad_norm": 2.451247407954173, + "learning_rate": 9.988503565587874e-06, + "loss": 0.4716, + "step": 1757 + }, + { + "epoch": 0.05, + "grad_norm": 2.056755444731157, + "learning_rate": 9.988471709706912e-06, + "loss": 0.5401, + "step": 1758 + }, + { + "epoch": 0.05, + "grad_norm": 2.261385340687533, + "learning_rate": 9.98843980980263e-06, + "loss": 0.4353, + "step": 1759 + }, + { + "epoch": 0.05, + "grad_norm": 2.424318646899155, + "learning_rate": 9.988407865875314e-06, + "loss": 0.5179, + "step": 1760 + }, + { + "epoch": 0.05, + "grad_norm": 2.0816839656428834, + "learning_rate": 9.98837587792524e-06, + "loss": 0.4868, + "step": 1761 + }, + { + "epoch": 0.05, + "grad_norm": 2.1464307891690164, + "learning_rate": 9.988343845952697e-06, + "loss": 0.5079, + "step": 1762 + }, + { + "epoch": 0.05, + "grad_norm": 1.9578406158216184, + "learning_rate": 9.988311769957962e-06, + "loss": 0.4814, + "step": 1763 + }, + { + "epoch": 0.05, + "grad_norm": 1.7682712546253665, + "learning_rate": 9.988279649941321e-06, + "loss": 0.4742, + "step": 1764 + }, + { + "epoch": 0.05, + "grad_norm": 2.1325826316072396, + "learning_rate": 9.988247485903058e-06, + "loss": 0.5053, + "step": 1765 + }, + { + "epoch": 0.05, + "grad_norm": 2.1712310866937425, + "learning_rate": 9.988215277843454e-06, + "loss": 0.448, + "step": 1766 + }, + { + "epoch": 0.05, + "grad_norm": 1.9270798571688499, + "learning_rate": 9.988183025762794e-06, + "loss": 0.4474, + "step": 1767 + }, + { + "epoch": 0.05, + "grad_norm": 1.673085762764628, + "learning_rate": 9.988150729661366e-06, + "loss": 0.4524, + "step": 1768 + }, + { + "epoch": 0.05, + "grad_norm": 1.6967584056071965, + "learning_rate": 9.988118389539449e-06, + "loss": 0.4862, + "step": 1769 + }, + { + "epoch": 0.05, + "grad_norm": 1.8884023710561446, + "learning_rate": 9.988086005397335e-06, + "loss": 0.4811, + "step": 1770 + }, + { + "epoch": 0.05, + "grad_norm": 2.479595503515533, + "learning_rate": 9.988053577235306e-06, + "loss": 0.5356, + "step": 1771 + }, + { + "epoch": 0.05, + "grad_norm": 2.0087904296960137, + "learning_rate": 9.988021105053648e-06, + "loss": 0.5088, + "step": 1772 + }, + { + "epoch": 0.05, + "grad_norm": 2.0944575818024207, + "learning_rate": 9.987988588852648e-06, + "loss": 0.4667, + "step": 1773 + }, + { + "epoch": 0.05, + "grad_norm": 1.723979292739801, + "learning_rate": 9.987956028632595e-06, + "loss": 0.5075, + "step": 1774 + }, + { + "epoch": 0.05, + "grad_norm": 2.697867300173741, + "learning_rate": 9.987923424393774e-06, + "loss": 0.4831, + "step": 1775 + }, + { + "epoch": 0.05, + "grad_norm": 2.579185643729277, + "learning_rate": 9.987890776136473e-06, + "loss": 0.5006, + "step": 1776 + }, + { + "epoch": 0.05, + "grad_norm": 2.090033006162696, + "learning_rate": 9.987858083860981e-06, + "loss": 0.5195, + "step": 1777 + }, + { + "epoch": 0.05, + "grad_norm": 1.9519002858705967, + "learning_rate": 9.987825347567586e-06, + "loss": 0.5023, + "step": 1778 + }, + { + "epoch": 0.05, + "grad_norm": 2.0406286571811556, + "learning_rate": 9.987792567256578e-06, + "loss": 0.5055, + "step": 1779 + }, + { + "epoch": 0.05, + "grad_norm": 2.0403096614625174, + "learning_rate": 9.987759742928243e-06, + "loss": 0.5463, + "step": 1780 + }, + { + "epoch": 0.05, + "grad_norm": 2.325792822126985, + "learning_rate": 9.987726874582876e-06, + "loss": 0.5184, + "step": 1781 + }, + { + "epoch": 0.05, + "grad_norm": 1.8430915875461822, + "learning_rate": 9.987693962220761e-06, + "loss": 0.4621, + "step": 1782 + }, + { + "epoch": 0.05, + "grad_norm": 1.920477500264551, + "learning_rate": 9.987661005842194e-06, + "loss": 0.4697, + "step": 1783 + }, + { + "epoch": 0.05, + "grad_norm": 1.263791814077483, + "learning_rate": 9.987628005447461e-06, + "loss": 0.5794, + "step": 1784 + }, + { + "epoch": 0.05, + "grad_norm": 1.8455592946863915, + "learning_rate": 9.987594961036856e-06, + "loss": 0.5274, + "step": 1785 + }, + { + "epoch": 0.05, + "grad_norm": 1.8852528514047575, + "learning_rate": 9.98756187261067e-06, + "loss": 0.4664, + "step": 1786 + }, + { + "epoch": 0.05, + "grad_norm": 1.055349552873875, + "learning_rate": 9.987528740169198e-06, + "loss": 0.564, + "step": 1787 + }, + { + "epoch": 0.05, + "grad_norm": 1.8499889204410942, + "learning_rate": 9.987495563712725e-06, + "loss": 0.5171, + "step": 1788 + }, + { + "epoch": 0.05, + "grad_norm": 1.8749575596319126, + "learning_rate": 9.987462343241552e-06, + "loss": 0.4969, + "step": 1789 + }, + { + "epoch": 0.05, + "grad_norm": 1.870518609750105, + "learning_rate": 9.987429078755966e-06, + "loss": 0.5085, + "step": 1790 + }, + { + "epoch": 0.05, + "grad_norm": 1.8993957038693736, + "learning_rate": 9.987395770256265e-06, + "loss": 0.5153, + "step": 1791 + }, + { + "epoch": 0.05, + "grad_norm": 1.7585224645717816, + "learning_rate": 9.98736241774274e-06, + "loss": 0.4591, + "step": 1792 + }, + { + "epoch": 0.05, + "grad_norm": 1.8685058807837367, + "learning_rate": 9.987329021215686e-06, + "loss": 0.4639, + "step": 1793 + }, + { + "epoch": 0.05, + "grad_norm": 1.777993285847745, + "learning_rate": 9.987295580675398e-06, + "loss": 0.4615, + "step": 1794 + }, + { + "epoch": 0.05, + "grad_norm": 1.669342302571882, + "learning_rate": 9.987262096122172e-06, + "loss": 0.4732, + "step": 1795 + }, + { + "epoch": 0.05, + "grad_norm": 2.6878384236640884, + "learning_rate": 9.987228567556303e-06, + "loss": 0.4591, + "step": 1796 + }, + { + "epoch": 0.05, + "grad_norm": 1.8063106593182836, + "learning_rate": 9.987194994978085e-06, + "loss": 0.5095, + "step": 1797 + }, + { + "epoch": 0.05, + "grad_norm": 1.9131909715603541, + "learning_rate": 9.987161378387817e-06, + "loss": 0.4681, + "step": 1798 + }, + { + "epoch": 0.05, + "grad_norm": 1.7217681788417298, + "learning_rate": 9.987127717785794e-06, + "loss": 0.4592, + "step": 1799 + }, + { + "epoch": 0.05, + "grad_norm": 1.2284275490371626, + "learning_rate": 9.987094013172314e-06, + "loss": 0.5983, + "step": 1800 + }, + { + "epoch": 0.05, + "grad_norm": 1.8209707536559492, + "learning_rate": 9.987060264547675e-06, + "loss": 0.5044, + "step": 1801 + }, + { + "epoch": 0.05, + "grad_norm": 1.845430748932118, + "learning_rate": 9.987026471912173e-06, + "loss": 0.5464, + "step": 1802 + }, + { + "epoch": 0.05, + "grad_norm": 1.9075114431208198, + "learning_rate": 9.986992635266105e-06, + "loss": 0.4741, + "step": 1803 + }, + { + "epoch": 0.05, + "grad_norm": 1.902884428222026, + "learning_rate": 9.986958754609775e-06, + "loss": 0.5029, + "step": 1804 + }, + { + "epoch": 0.05, + "grad_norm": 1.7175164712503663, + "learning_rate": 9.986924829943476e-06, + "loss": 0.4649, + "step": 1805 + }, + { + "epoch": 0.05, + "grad_norm": 1.892563359034796, + "learning_rate": 9.986890861267512e-06, + "loss": 0.4711, + "step": 1806 + }, + { + "epoch": 0.05, + "grad_norm": 1.8806720277041842, + "learning_rate": 9.986856848582181e-06, + "loss": 0.4455, + "step": 1807 + }, + { + "epoch": 0.05, + "grad_norm": 1.8599982436812994, + "learning_rate": 9.98682279188778e-06, + "loss": 0.4869, + "step": 1808 + }, + { + "epoch": 0.05, + "grad_norm": 1.8549640876921216, + "learning_rate": 9.986788691184615e-06, + "loss": 0.45, + "step": 1809 + }, + { + "epoch": 0.05, + "grad_norm": 1.9699902888387357, + "learning_rate": 9.986754546472986e-06, + "loss": 0.5043, + "step": 1810 + }, + { + "epoch": 0.05, + "grad_norm": 1.918213061290646, + "learning_rate": 9.98672035775319e-06, + "loss": 0.4539, + "step": 1811 + }, + { + "epoch": 0.05, + "grad_norm": 2.0144634754304316, + "learning_rate": 9.986686125025531e-06, + "loss": 0.4831, + "step": 1812 + }, + { + "epoch": 0.05, + "grad_norm": 1.7536317478496062, + "learning_rate": 9.986651848290315e-06, + "loss": 0.462, + "step": 1813 + }, + { + "epoch": 0.05, + "grad_norm": 1.8673872807021556, + "learning_rate": 9.986617527547837e-06, + "loss": 0.4448, + "step": 1814 + }, + { + "epoch": 0.05, + "grad_norm": 1.9681307567635902, + "learning_rate": 9.986583162798406e-06, + "loss": 0.464, + "step": 1815 + }, + { + "epoch": 0.05, + "grad_norm": 3.424543345395034, + "learning_rate": 9.986548754042324e-06, + "loss": 0.4965, + "step": 1816 + }, + { + "epoch": 0.05, + "grad_norm": 1.906654847375938, + "learning_rate": 9.986514301279894e-06, + "loss": 0.4586, + "step": 1817 + }, + { + "epoch": 0.05, + "grad_norm": 1.688730348872464, + "learning_rate": 9.98647980451142e-06, + "loss": 0.4453, + "step": 1818 + }, + { + "epoch": 0.05, + "grad_norm": 1.8553899549547748, + "learning_rate": 9.986445263737204e-06, + "loss": 0.473, + "step": 1819 + }, + { + "epoch": 0.05, + "grad_norm": 2.5100832015242274, + "learning_rate": 9.986410678957555e-06, + "loss": 0.5128, + "step": 1820 + }, + { + "epoch": 0.05, + "grad_norm": 1.4769481218086946, + "learning_rate": 9.986376050172778e-06, + "loss": 0.5653, + "step": 1821 + }, + { + "epoch": 0.05, + "grad_norm": 1.8726393709134597, + "learning_rate": 9.986341377383175e-06, + "loss": 0.4589, + "step": 1822 + }, + { + "epoch": 0.05, + "grad_norm": 1.9450472731461925, + "learning_rate": 9.986306660589053e-06, + "loss": 0.4797, + "step": 1823 + }, + { + "epoch": 0.05, + "grad_norm": 1.876541436706856, + "learning_rate": 9.986271899790722e-06, + "loss": 0.5011, + "step": 1824 + }, + { + "epoch": 0.05, + "grad_norm": 2.3078729185927567, + "learning_rate": 9.986237094988484e-06, + "loss": 0.5121, + "step": 1825 + }, + { + "epoch": 0.05, + "grad_norm": 2.050269938327175, + "learning_rate": 9.98620224618265e-06, + "loss": 0.4748, + "step": 1826 + }, + { + "epoch": 0.05, + "grad_norm": 1.9089606952307703, + "learning_rate": 9.986167353373525e-06, + "loss": 0.4832, + "step": 1827 + }, + { + "epoch": 0.05, + "grad_norm": 1.7699155885675542, + "learning_rate": 9.986132416561418e-06, + "loss": 0.4597, + "step": 1828 + }, + { + "epoch": 0.05, + "grad_norm": 1.78056132694489, + "learning_rate": 9.986097435746638e-06, + "loss": 0.5064, + "step": 1829 + }, + { + "epoch": 0.05, + "grad_norm": 1.7652571019683498, + "learning_rate": 9.986062410929491e-06, + "loss": 0.4837, + "step": 1830 + }, + { + "epoch": 0.05, + "grad_norm": 2.0448104350357905, + "learning_rate": 9.98602734211029e-06, + "loss": 0.498, + "step": 1831 + }, + { + "epoch": 0.05, + "grad_norm": 1.6962950203009888, + "learning_rate": 9.98599222928934e-06, + "loss": 0.4609, + "step": 1832 + }, + { + "epoch": 0.05, + "grad_norm": 1.189564597434773, + "learning_rate": 9.985957072466955e-06, + "loss": 0.5951, + "step": 1833 + }, + { + "epoch": 0.05, + "grad_norm": 2.0263686104569025, + "learning_rate": 9.985921871643443e-06, + "loss": 0.4749, + "step": 1834 + }, + { + "epoch": 0.05, + "grad_norm": 1.8120818293413623, + "learning_rate": 9.985886626819115e-06, + "loss": 0.4916, + "step": 1835 + }, + { + "epoch": 0.05, + "grad_norm": 1.893075694559597, + "learning_rate": 9.985851337994282e-06, + "loss": 0.4992, + "step": 1836 + }, + { + "epoch": 0.05, + "grad_norm": 2.265800387531855, + "learning_rate": 9.985816005169256e-06, + "loss": 0.4718, + "step": 1837 + }, + { + "epoch": 0.05, + "grad_norm": 2.176866761654589, + "learning_rate": 9.985780628344348e-06, + "loss": 0.5123, + "step": 1838 + }, + { + "epoch": 0.05, + "grad_norm": 1.818671970232332, + "learning_rate": 9.985745207519872e-06, + "loss": 0.4395, + "step": 1839 + }, + { + "epoch": 0.05, + "grad_norm": 1.848554942745401, + "learning_rate": 9.985709742696138e-06, + "loss": 0.4709, + "step": 1840 + }, + { + "epoch": 0.05, + "grad_norm": 2.096592871336934, + "learning_rate": 9.985674233873461e-06, + "loss": 0.5015, + "step": 1841 + }, + { + "epoch": 0.05, + "grad_norm": 1.7965262384746312, + "learning_rate": 9.985638681052152e-06, + "loss": 0.4541, + "step": 1842 + }, + { + "epoch": 0.05, + "grad_norm": 1.866209977941699, + "learning_rate": 9.985603084232529e-06, + "loss": 0.477, + "step": 1843 + }, + { + "epoch": 0.05, + "grad_norm": 1.760299363708251, + "learning_rate": 9.985567443414902e-06, + "loss": 0.4737, + "step": 1844 + }, + { + "epoch": 0.05, + "grad_norm": 1.822546814771955, + "learning_rate": 9.985531758599586e-06, + "loss": 0.5111, + "step": 1845 + }, + { + "epoch": 0.05, + "grad_norm": 1.8939062876508321, + "learning_rate": 9.985496029786898e-06, + "loss": 0.4978, + "step": 1846 + }, + { + "epoch": 0.05, + "grad_norm": 1.643822829932943, + "learning_rate": 9.985460256977153e-06, + "loss": 0.4795, + "step": 1847 + }, + { + "epoch": 0.05, + "grad_norm": 1.9252246290478643, + "learning_rate": 9.985424440170664e-06, + "loss": 0.4681, + "step": 1848 + }, + { + "epoch": 0.05, + "grad_norm": 2.8122076750921474, + "learning_rate": 9.985388579367749e-06, + "loss": 0.4664, + "step": 1849 + }, + { + "epoch": 0.05, + "grad_norm": 1.757771326788681, + "learning_rate": 9.985352674568726e-06, + "loss": 0.46, + "step": 1850 + }, + { + "epoch": 0.05, + "grad_norm": 1.7019506889851896, + "learning_rate": 9.98531672577391e-06, + "loss": 0.4982, + "step": 1851 + }, + { + "epoch": 0.05, + "grad_norm": 2.069294025829276, + "learning_rate": 9.98528073298362e-06, + "loss": 0.4765, + "step": 1852 + }, + { + "epoch": 0.05, + "grad_norm": 1.7896396254686364, + "learning_rate": 9.98524469619817e-06, + "loss": 0.4409, + "step": 1853 + }, + { + "epoch": 0.05, + "grad_norm": 1.8270634974219189, + "learning_rate": 9.98520861541788e-06, + "loss": 0.5026, + "step": 1854 + }, + { + "epoch": 0.05, + "grad_norm": 1.9784906520511893, + "learning_rate": 9.98517249064307e-06, + "loss": 0.4821, + "step": 1855 + }, + { + "epoch": 0.05, + "grad_norm": 1.9032763131585533, + "learning_rate": 9.985136321874057e-06, + "loss": 0.4818, + "step": 1856 + }, + { + "epoch": 0.05, + "grad_norm": 1.6544668347977478, + "learning_rate": 9.985100109111161e-06, + "loss": 0.4453, + "step": 1857 + }, + { + "epoch": 0.05, + "grad_norm": 1.7470116251300787, + "learning_rate": 9.985063852354702e-06, + "loss": 0.459, + "step": 1858 + }, + { + "epoch": 0.05, + "grad_norm": 1.651202569003155, + "learning_rate": 9.985027551604998e-06, + "loss": 0.4905, + "step": 1859 + }, + { + "epoch": 0.05, + "grad_norm": 2.1922786778575256, + "learning_rate": 9.98499120686237e-06, + "loss": 0.4631, + "step": 1860 + }, + { + "epoch": 0.05, + "grad_norm": 1.934529274072151, + "learning_rate": 9.984954818127139e-06, + "loss": 0.4898, + "step": 1861 + }, + { + "epoch": 0.05, + "grad_norm": 1.9378640351763392, + "learning_rate": 9.98491838539963e-06, + "loss": 0.4874, + "step": 1862 + }, + { + "epoch": 0.05, + "grad_norm": 1.6884200102509872, + "learning_rate": 9.984881908680157e-06, + "loss": 0.4621, + "step": 1863 + }, + { + "epoch": 0.05, + "grad_norm": 3.463947973610225, + "learning_rate": 9.984845387969046e-06, + "loss": 0.4941, + "step": 1864 + }, + { + "epoch": 0.05, + "grad_norm": 1.7717426904958207, + "learning_rate": 9.98480882326662e-06, + "loss": 0.4766, + "step": 1865 + }, + { + "epoch": 0.05, + "grad_norm": 1.7515958195729353, + "learning_rate": 9.984772214573203e-06, + "loss": 0.4591, + "step": 1866 + }, + { + "epoch": 0.05, + "grad_norm": 1.873138833768853, + "learning_rate": 9.984735561889114e-06, + "loss": 0.5377, + "step": 1867 + }, + { + "epoch": 0.05, + "grad_norm": 1.7419687421041872, + "learning_rate": 9.984698865214678e-06, + "loss": 0.4898, + "step": 1868 + }, + { + "epoch": 0.05, + "grad_norm": 1.9161831037747978, + "learning_rate": 9.984662124550219e-06, + "loss": 0.4702, + "step": 1869 + }, + { + "epoch": 0.05, + "grad_norm": 1.7790595845195105, + "learning_rate": 9.984625339896064e-06, + "loss": 0.5067, + "step": 1870 + }, + { + "epoch": 0.05, + "grad_norm": 1.7205931013424844, + "learning_rate": 9.984588511252532e-06, + "loss": 0.4936, + "step": 1871 + }, + { + "epoch": 0.05, + "grad_norm": 2.104567277734665, + "learning_rate": 9.984551638619952e-06, + "loss": 0.4417, + "step": 1872 + }, + { + "epoch": 0.05, + "grad_norm": 1.886404806531488, + "learning_rate": 9.98451472199865e-06, + "loss": 0.4585, + "step": 1873 + }, + { + "epoch": 0.05, + "grad_norm": 1.8590854946070745, + "learning_rate": 9.984477761388949e-06, + "loss": 0.4508, + "step": 1874 + }, + { + "epoch": 0.05, + "grad_norm": 1.7825527749459589, + "learning_rate": 9.984440756791177e-06, + "loss": 0.484, + "step": 1875 + }, + { + "epoch": 0.05, + "grad_norm": 1.8501207339861832, + "learning_rate": 9.984403708205659e-06, + "loss": 0.4516, + "step": 1876 + }, + { + "epoch": 0.05, + "grad_norm": 1.8719951383938134, + "learning_rate": 9.984366615632724e-06, + "loss": 0.5011, + "step": 1877 + }, + { + "epoch": 0.05, + "grad_norm": 2.034248209202781, + "learning_rate": 9.984329479072697e-06, + "loss": 0.4818, + "step": 1878 + }, + { + "epoch": 0.05, + "grad_norm": 2.3084523064918194, + "learning_rate": 9.984292298525908e-06, + "loss": 0.4953, + "step": 1879 + }, + { + "epoch": 0.05, + "grad_norm": 1.8178997973648885, + "learning_rate": 9.984255073992684e-06, + "loss": 0.4475, + "step": 1880 + }, + { + "epoch": 0.05, + "grad_norm": 1.741068935346577, + "learning_rate": 9.984217805473355e-06, + "loss": 0.4764, + "step": 1881 + }, + { + "epoch": 0.05, + "grad_norm": 1.839211461529139, + "learning_rate": 9.984180492968248e-06, + "loss": 0.5026, + "step": 1882 + }, + { + "epoch": 0.05, + "grad_norm": 1.8262095485983185, + "learning_rate": 9.984143136477691e-06, + "loss": 0.4693, + "step": 1883 + }, + { + "epoch": 0.05, + "grad_norm": 1.8669959370060119, + "learning_rate": 9.984105736002017e-06, + "loss": 0.4698, + "step": 1884 + }, + { + "epoch": 0.05, + "grad_norm": 2.261571978998654, + "learning_rate": 9.984068291541555e-06, + "loss": 0.4785, + "step": 1885 + }, + { + "epoch": 0.05, + "grad_norm": 1.9176829539528413, + "learning_rate": 9.984030803096633e-06, + "loss": 0.493, + "step": 1886 + }, + { + "epoch": 0.05, + "grad_norm": 1.62915806443978, + "learning_rate": 9.983993270667586e-06, + "loss": 0.4492, + "step": 1887 + }, + { + "epoch": 0.05, + "grad_norm": 2.1658112642805993, + "learning_rate": 9.983955694254743e-06, + "loss": 0.4605, + "step": 1888 + }, + { + "epoch": 0.05, + "grad_norm": 2.1877804109402694, + "learning_rate": 9.983918073858434e-06, + "loss": 0.4945, + "step": 1889 + }, + { + "epoch": 0.05, + "grad_norm": 1.662787920757515, + "learning_rate": 9.983880409478994e-06, + "loss": 0.466, + "step": 1890 + }, + { + "epoch": 0.05, + "grad_norm": 1.7440660402024681, + "learning_rate": 9.983842701116756e-06, + "loss": 0.4506, + "step": 1891 + }, + { + "epoch": 0.05, + "grad_norm": 1.8901352764138875, + "learning_rate": 9.983804948772047e-06, + "loss": 0.4327, + "step": 1892 + }, + { + "epoch": 0.05, + "grad_norm": 1.8411929876168405, + "learning_rate": 9.983767152445206e-06, + "loss": 0.5237, + "step": 1893 + }, + { + "epoch": 0.05, + "grad_norm": 2.199547036592023, + "learning_rate": 9.983729312136564e-06, + "loss": 0.4259, + "step": 1894 + }, + { + "epoch": 0.05, + "grad_norm": 2.157078710869039, + "learning_rate": 9.983691427846456e-06, + "loss": 0.5121, + "step": 1895 + }, + { + "epoch": 0.05, + "grad_norm": 1.9892812561009783, + "learning_rate": 9.983653499575216e-06, + "loss": 0.4413, + "step": 1896 + }, + { + "epoch": 0.06, + "grad_norm": 1.6351334841676843, + "learning_rate": 9.983615527323178e-06, + "loss": 0.49, + "step": 1897 + }, + { + "epoch": 0.06, + "grad_norm": 1.8531662395378827, + "learning_rate": 9.983577511090679e-06, + "loss": 0.4597, + "step": 1898 + }, + { + "epoch": 0.06, + "grad_norm": 1.760178979682625, + "learning_rate": 9.983539450878052e-06, + "loss": 0.4304, + "step": 1899 + }, + { + "epoch": 0.06, + "grad_norm": 1.77633806277831, + "learning_rate": 9.983501346685633e-06, + "loss": 0.4379, + "step": 1900 + }, + { + "epoch": 0.06, + "grad_norm": 1.8338961882113156, + "learning_rate": 9.983463198513761e-06, + "loss": 0.4648, + "step": 1901 + }, + { + "epoch": 0.06, + "grad_norm": 1.8799010157023266, + "learning_rate": 9.983425006362771e-06, + "loss": 0.5108, + "step": 1902 + }, + { + "epoch": 0.06, + "grad_norm": 2.1338903535387894, + "learning_rate": 9.983386770233e-06, + "loss": 0.5143, + "step": 1903 + }, + { + "epoch": 0.06, + "grad_norm": 2.043874321934663, + "learning_rate": 9.983348490124784e-06, + "loss": 0.4805, + "step": 1904 + }, + { + "epoch": 0.06, + "grad_norm": 1.8617761614199604, + "learning_rate": 9.983310166038465e-06, + "loss": 0.5467, + "step": 1905 + }, + { + "epoch": 0.06, + "grad_norm": 1.7425409122629607, + "learning_rate": 9.983271797974377e-06, + "loss": 0.4548, + "step": 1906 + }, + { + "epoch": 0.06, + "grad_norm": 2.2357288992850104, + "learning_rate": 9.98323338593286e-06, + "loss": 0.5008, + "step": 1907 + }, + { + "epoch": 0.06, + "grad_norm": 2.3128169383257418, + "learning_rate": 9.983194929914254e-06, + "loss": 0.4857, + "step": 1908 + }, + { + "epoch": 0.06, + "grad_norm": 1.87287625426338, + "learning_rate": 9.983156429918895e-06, + "loss": 0.4736, + "step": 1909 + }, + { + "epoch": 0.06, + "grad_norm": 1.8110222682108266, + "learning_rate": 9.983117885947127e-06, + "loss": 0.4888, + "step": 1910 + }, + { + "epoch": 0.06, + "grad_norm": 1.939171522497333, + "learning_rate": 9.983079297999288e-06, + "loss": 0.5422, + "step": 1911 + }, + { + "epoch": 0.06, + "grad_norm": 1.757860484757778, + "learning_rate": 9.983040666075719e-06, + "loss": 0.5137, + "step": 1912 + }, + { + "epoch": 0.06, + "grad_norm": 1.9277991102430232, + "learning_rate": 9.983001990176762e-06, + "loss": 0.4744, + "step": 1913 + }, + { + "epoch": 0.06, + "grad_norm": 2.045575731134743, + "learning_rate": 9.982963270302755e-06, + "loss": 0.5164, + "step": 1914 + }, + { + "epoch": 0.06, + "grad_norm": 1.7350721480944593, + "learning_rate": 9.982924506454043e-06, + "loss": 0.4564, + "step": 1915 + }, + { + "epoch": 0.06, + "grad_norm": 1.8269415015954213, + "learning_rate": 9.982885698630967e-06, + "loss": 0.4898, + "step": 1916 + }, + { + "epoch": 0.06, + "grad_norm": 1.7921732036266864, + "learning_rate": 9.982846846833869e-06, + "loss": 0.4606, + "step": 1917 + }, + { + "epoch": 0.06, + "grad_norm": 1.962945135505212, + "learning_rate": 9.982807951063094e-06, + "loss": 0.474, + "step": 1918 + }, + { + "epoch": 0.06, + "grad_norm": 1.798243989900394, + "learning_rate": 9.982769011318981e-06, + "loss": 0.4676, + "step": 1919 + }, + { + "epoch": 0.06, + "grad_norm": 2.1204931883954883, + "learning_rate": 9.982730027601879e-06, + "loss": 0.462, + "step": 1920 + }, + { + "epoch": 0.06, + "grad_norm": 1.6935782576730312, + "learning_rate": 9.982690999912126e-06, + "loss": 0.4392, + "step": 1921 + }, + { + "epoch": 0.06, + "grad_norm": 1.6615964405701136, + "learning_rate": 9.98265192825007e-06, + "loss": 0.4743, + "step": 1922 + }, + { + "epoch": 0.06, + "grad_norm": 1.7647955935866482, + "learning_rate": 9.98261281261606e-06, + "loss": 0.4842, + "step": 1923 + }, + { + "epoch": 0.06, + "grad_norm": 2.0869072316295343, + "learning_rate": 9.982573653010433e-06, + "loss": 0.5252, + "step": 1924 + }, + { + "epoch": 0.06, + "grad_norm": 1.7561100359701087, + "learning_rate": 9.982534449433538e-06, + "loss": 0.4465, + "step": 1925 + }, + { + "epoch": 0.06, + "grad_norm": 2.2710622093463675, + "learning_rate": 9.982495201885722e-06, + "loss": 0.4542, + "step": 1926 + }, + { + "epoch": 0.06, + "grad_norm": 1.8343630974629241, + "learning_rate": 9.98245591036733e-06, + "loss": 0.4642, + "step": 1927 + }, + { + "epoch": 0.06, + "grad_norm": 1.8257107565280402, + "learning_rate": 9.98241657487871e-06, + "loss": 0.477, + "step": 1928 + }, + { + "epoch": 0.06, + "grad_norm": 1.6235278947152127, + "learning_rate": 9.982377195420208e-06, + "loss": 0.4376, + "step": 1929 + }, + { + "epoch": 0.06, + "grad_norm": 1.6977763250488374, + "learning_rate": 9.982337771992173e-06, + "loss": 0.4812, + "step": 1930 + }, + { + "epoch": 0.06, + "grad_norm": 1.9750708260274845, + "learning_rate": 9.98229830459495e-06, + "loss": 0.5235, + "step": 1931 + }, + { + "epoch": 0.06, + "grad_norm": 1.9610205650057082, + "learning_rate": 9.982258793228889e-06, + "loss": 0.4679, + "step": 1932 + }, + { + "epoch": 0.06, + "grad_norm": 1.7009095731893533, + "learning_rate": 9.982219237894342e-06, + "loss": 0.4649, + "step": 1933 + }, + { + "epoch": 0.06, + "grad_norm": 1.781515234995365, + "learning_rate": 9.982179638591652e-06, + "loss": 0.4508, + "step": 1934 + }, + { + "epoch": 0.06, + "grad_norm": 1.8913613645321634, + "learning_rate": 9.982139995321173e-06, + "loss": 0.4849, + "step": 1935 + }, + { + "epoch": 0.06, + "grad_norm": 1.936481872614449, + "learning_rate": 9.982100308083253e-06, + "loss": 0.4744, + "step": 1936 + }, + { + "epoch": 0.06, + "grad_norm": 1.9684876718944286, + "learning_rate": 9.982060576878243e-06, + "loss": 0.4719, + "step": 1937 + }, + { + "epoch": 0.06, + "grad_norm": 3.800892959640944, + "learning_rate": 9.982020801706491e-06, + "loss": 0.469, + "step": 1938 + }, + { + "epoch": 0.06, + "grad_norm": 2.7185258859328787, + "learning_rate": 9.981980982568352e-06, + "loss": 0.5001, + "step": 1939 + }, + { + "epoch": 0.06, + "grad_norm": 1.9937776407470476, + "learning_rate": 9.981941119464176e-06, + "loss": 0.4744, + "step": 1940 + }, + { + "epoch": 0.06, + "grad_norm": 1.9702209971672187, + "learning_rate": 9.981901212394312e-06, + "loss": 0.4822, + "step": 1941 + }, + { + "epoch": 0.06, + "grad_norm": 1.9503724738323576, + "learning_rate": 9.981861261359116e-06, + "loss": 0.551, + "step": 1942 + }, + { + "epoch": 0.06, + "grad_norm": 1.6450153154423965, + "learning_rate": 9.98182126635894e-06, + "loss": 0.5185, + "step": 1943 + }, + { + "epoch": 0.06, + "grad_norm": 2.0176959879249794, + "learning_rate": 9.981781227394134e-06, + "loss": 0.461, + "step": 1944 + }, + { + "epoch": 0.06, + "grad_norm": 2.0275696620422443, + "learning_rate": 9.981741144465056e-06, + "loss": 0.456, + "step": 1945 + }, + { + "epoch": 0.06, + "grad_norm": 1.7985838535639425, + "learning_rate": 9.981701017572055e-06, + "loss": 0.5368, + "step": 1946 + }, + { + "epoch": 0.06, + "grad_norm": 1.8407455024792425, + "learning_rate": 9.981660846715488e-06, + "loss": 0.5012, + "step": 1947 + }, + { + "epoch": 0.06, + "grad_norm": 1.8122107494287762, + "learning_rate": 9.98162063189571e-06, + "loss": 0.4342, + "step": 1948 + }, + { + "epoch": 0.06, + "grad_norm": 1.8574140475890644, + "learning_rate": 9.981580373113073e-06, + "loss": 0.435, + "step": 1949 + }, + { + "epoch": 0.06, + "grad_norm": 1.3586952344845387, + "learning_rate": 9.981540070367935e-06, + "loss": 0.5639, + "step": 1950 + }, + { + "epoch": 0.06, + "grad_norm": 1.7824697574609039, + "learning_rate": 9.981499723660648e-06, + "loss": 0.4998, + "step": 1951 + }, + { + "epoch": 0.06, + "grad_norm": 1.903502308354929, + "learning_rate": 9.981459332991574e-06, + "loss": 0.4988, + "step": 1952 + }, + { + "epoch": 0.06, + "grad_norm": 1.5666746851480784, + "learning_rate": 9.981418898361063e-06, + "loss": 0.4409, + "step": 1953 + }, + { + "epoch": 0.06, + "grad_norm": 1.770662835885975, + "learning_rate": 9.981378419769478e-06, + "loss": 0.4275, + "step": 1954 + }, + { + "epoch": 0.06, + "grad_norm": 1.7428523463172756, + "learning_rate": 9.981337897217171e-06, + "loss": 0.4817, + "step": 1955 + }, + { + "epoch": 0.06, + "grad_norm": 1.8259340755612639, + "learning_rate": 9.981297330704504e-06, + "loss": 0.469, + "step": 1956 + }, + { + "epoch": 0.06, + "grad_norm": 1.915808810879009, + "learning_rate": 9.98125672023183e-06, + "loss": 0.4852, + "step": 1957 + }, + { + "epoch": 0.06, + "grad_norm": 1.7243907010663475, + "learning_rate": 9.981216065799512e-06, + "loss": 0.4813, + "step": 1958 + }, + { + "epoch": 0.06, + "grad_norm": 1.7525273191516366, + "learning_rate": 9.981175367407907e-06, + "loss": 0.4792, + "step": 1959 + }, + { + "epoch": 0.06, + "grad_norm": 1.7995109229626578, + "learning_rate": 9.981134625057374e-06, + "loss": 0.5048, + "step": 1960 + }, + { + "epoch": 0.06, + "grad_norm": 1.7276608232068804, + "learning_rate": 9.981093838748273e-06, + "loss": 0.4665, + "step": 1961 + }, + { + "epoch": 0.06, + "grad_norm": 2.07966146065734, + "learning_rate": 9.981053008480963e-06, + "loss": 0.5041, + "step": 1962 + }, + { + "epoch": 0.06, + "grad_norm": 2.1802381172592344, + "learning_rate": 9.981012134255806e-06, + "loss": 0.4615, + "step": 1963 + }, + { + "epoch": 0.06, + "grad_norm": 1.7404374931823337, + "learning_rate": 9.98097121607316e-06, + "loss": 0.4618, + "step": 1964 + }, + { + "epoch": 0.06, + "grad_norm": 2.0722283139425013, + "learning_rate": 9.980930253933388e-06, + "loss": 0.5164, + "step": 1965 + }, + { + "epoch": 0.06, + "grad_norm": 1.7910438766298002, + "learning_rate": 9.98088924783685e-06, + "loss": 0.5086, + "step": 1966 + }, + { + "epoch": 0.06, + "grad_norm": 1.655112034304843, + "learning_rate": 9.980848197783912e-06, + "loss": 0.4739, + "step": 1967 + }, + { + "epoch": 0.06, + "grad_norm": 1.705986663367253, + "learning_rate": 9.980807103774933e-06, + "loss": 0.5162, + "step": 1968 + }, + { + "epoch": 0.06, + "grad_norm": 1.701746038765338, + "learning_rate": 9.980765965810275e-06, + "loss": 0.4545, + "step": 1969 + }, + { + "epoch": 0.06, + "grad_norm": 1.6864085336042058, + "learning_rate": 9.980724783890303e-06, + "loss": 0.4742, + "step": 1970 + }, + { + "epoch": 0.06, + "grad_norm": 1.5936809548996347, + "learning_rate": 9.980683558015378e-06, + "loss": 0.4388, + "step": 1971 + }, + { + "epoch": 0.06, + "grad_norm": 1.9331930457381346, + "learning_rate": 9.980642288185866e-06, + "loss": 0.4482, + "step": 1972 + }, + { + "epoch": 0.06, + "grad_norm": 1.719176747256159, + "learning_rate": 9.98060097440213e-06, + "loss": 0.5024, + "step": 1973 + }, + { + "epoch": 0.06, + "grad_norm": 1.8156923491305366, + "learning_rate": 9.980559616664538e-06, + "loss": 0.4697, + "step": 1974 + }, + { + "epoch": 0.06, + "grad_norm": 2.2205833952794127, + "learning_rate": 9.98051821497345e-06, + "loss": 0.5454, + "step": 1975 + }, + { + "epoch": 0.06, + "grad_norm": 1.939642631836731, + "learning_rate": 9.980476769329233e-06, + "loss": 0.5082, + "step": 1976 + }, + { + "epoch": 0.06, + "grad_norm": 2.2633562723078713, + "learning_rate": 9.980435279732254e-06, + "loss": 0.465, + "step": 1977 + }, + { + "epoch": 0.06, + "grad_norm": 1.7553150479658315, + "learning_rate": 9.98039374618288e-06, + "loss": 0.4767, + "step": 1978 + }, + { + "epoch": 0.06, + "grad_norm": 2.059181734409249, + "learning_rate": 9.980352168681472e-06, + "loss": 0.4594, + "step": 1979 + }, + { + "epoch": 0.06, + "grad_norm": 1.7056047687578493, + "learning_rate": 9.980310547228404e-06, + "loss": 0.476, + "step": 1980 + }, + { + "epoch": 0.06, + "grad_norm": 1.3205147365832537, + "learning_rate": 9.980268881824039e-06, + "loss": 0.6091, + "step": 1981 + }, + { + "epoch": 0.06, + "grad_norm": 2.2324616613934944, + "learning_rate": 9.980227172468746e-06, + "loss": 0.4402, + "step": 1982 + }, + { + "epoch": 0.06, + "grad_norm": 1.7235654999585561, + "learning_rate": 9.980185419162894e-06, + "loss": 0.4563, + "step": 1983 + }, + { + "epoch": 0.06, + "grad_norm": 1.6769523690334853, + "learning_rate": 9.980143621906849e-06, + "loss": 0.4572, + "step": 1984 + }, + { + "epoch": 0.06, + "grad_norm": 1.6805216043392772, + "learning_rate": 9.980101780700981e-06, + "loss": 0.441, + "step": 1985 + }, + { + "epoch": 0.06, + "grad_norm": 2.172814812362866, + "learning_rate": 9.98005989554566e-06, + "loss": 0.4417, + "step": 1986 + }, + { + "epoch": 0.06, + "grad_norm": 1.7707337059137478, + "learning_rate": 9.980017966441256e-06, + "loss": 0.4445, + "step": 1987 + }, + { + "epoch": 0.06, + "grad_norm": 1.8775916192361874, + "learning_rate": 9.979975993388137e-06, + "loss": 0.5064, + "step": 1988 + }, + { + "epoch": 0.06, + "grad_norm": 2.0027545844141215, + "learning_rate": 9.979933976386677e-06, + "loss": 0.5412, + "step": 1989 + }, + { + "epoch": 0.06, + "grad_norm": 1.8622648978633431, + "learning_rate": 9.979891915437242e-06, + "loss": 0.4785, + "step": 1990 + }, + { + "epoch": 0.06, + "grad_norm": 1.6681902344255164, + "learning_rate": 9.979849810540206e-06, + "loss": 0.4978, + "step": 1991 + }, + { + "epoch": 0.06, + "grad_norm": 1.6674550640843988, + "learning_rate": 9.979807661695941e-06, + "loss": 0.4998, + "step": 1992 + }, + { + "epoch": 0.06, + "grad_norm": 1.5935907955345028, + "learning_rate": 9.979765468904817e-06, + "loss": 0.4486, + "step": 1993 + }, + { + "epoch": 0.06, + "grad_norm": 1.191218676843555, + "learning_rate": 9.97972323216721e-06, + "loss": 0.5517, + "step": 1994 + }, + { + "epoch": 0.06, + "grad_norm": 1.6850251478791773, + "learning_rate": 9.979680951483488e-06, + "loss": 0.5153, + "step": 1995 + }, + { + "epoch": 0.06, + "grad_norm": 2.0883412871964646, + "learning_rate": 9.979638626854026e-06, + "loss": 0.4815, + "step": 1996 + }, + { + "epoch": 0.06, + "grad_norm": 1.0447210985321458, + "learning_rate": 9.979596258279198e-06, + "loss": 0.5787, + "step": 1997 + }, + { + "epoch": 0.06, + "grad_norm": 1.6352518889456804, + "learning_rate": 9.979553845759379e-06, + "loss": 0.4771, + "step": 1998 + }, + { + "epoch": 0.06, + "grad_norm": 1.8503635441569743, + "learning_rate": 9.979511389294942e-06, + "loss": 0.473, + "step": 1999 + }, + { + "epoch": 0.06, + "grad_norm": 1.902655875206386, + "learning_rate": 9.97946888888626e-06, + "loss": 0.4692, + "step": 2000 + }, + { + "epoch": 0.06, + "grad_norm": 3.636141310841412, + "learning_rate": 9.979426344533712e-06, + "loss": 0.4433, + "step": 2001 + }, + { + "epoch": 0.06, + "grad_norm": 1.8385920637555864, + "learning_rate": 9.979383756237671e-06, + "loss": 0.5135, + "step": 2002 + }, + { + "epoch": 0.06, + "grad_norm": 1.7987128457105295, + "learning_rate": 9.979341123998512e-06, + "loss": 0.4516, + "step": 2003 + }, + { + "epoch": 0.06, + "grad_norm": 1.838398085131164, + "learning_rate": 9.979298447816613e-06, + "loss": 0.5007, + "step": 2004 + }, + { + "epoch": 0.06, + "grad_norm": 1.8328782378753627, + "learning_rate": 9.979255727692349e-06, + "loss": 0.4373, + "step": 2005 + }, + { + "epoch": 0.06, + "grad_norm": 2.087477863449056, + "learning_rate": 9.979212963626098e-06, + "loss": 0.5105, + "step": 2006 + }, + { + "epoch": 0.06, + "grad_norm": 2.0550920883332986, + "learning_rate": 9.979170155618239e-06, + "loss": 0.4823, + "step": 2007 + }, + { + "epoch": 0.06, + "grad_norm": 1.8900085339974575, + "learning_rate": 9.979127303669148e-06, + "loss": 0.4609, + "step": 2008 + }, + { + "epoch": 0.06, + "grad_norm": 1.7454118329875379, + "learning_rate": 9.9790844077792e-06, + "loss": 0.4584, + "step": 2009 + }, + { + "epoch": 0.06, + "grad_norm": 1.9836410452870101, + "learning_rate": 9.97904146794878e-06, + "loss": 0.4864, + "step": 2010 + }, + { + "epoch": 0.06, + "grad_norm": 2.049125092978683, + "learning_rate": 9.978998484178265e-06, + "loss": 0.4341, + "step": 2011 + }, + { + "epoch": 0.06, + "grad_norm": 1.7438450756454793, + "learning_rate": 9.97895545646803e-06, + "loss": 0.4487, + "step": 2012 + }, + { + "epoch": 0.06, + "grad_norm": 1.610366092089184, + "learning_rate": 9.978912384818459e-06, + "loss": 0.4564, + "step": 2013 + }, + { + "epoch": 0.06, + "grad_norm": 1.6689814404006542, + "learning_rate": 9.978869269229931e-06, + "loss": 0.4486, + "step": 2014 + }, + { + "epoch": 0.06, + "grad_norm": 2.048771143167139, + "learning_rate": 9.978826109702826e-06, + "loss": 0.4678, + "step": 2015 + }, + { + "epoch": 0.06, + "grad_norm": 1.8715661604074196, + "learning_rate": 9.978782906237527e-06, + "loss": 0.5048, + "step": 2016 + }, + { + "epoch": 0.06, + "grad_norm": 1.853926615868925, + "learning_rate": 9.978739658834413e-06, + "loss": 0.4825, + "step": 2017 + }, + { + "epoch": 0.06, + "grad_norm": 1.7013352937793107, + "learning_rate": 9.978696367493866e-06, + "loss": 0.4925, + "step": 2018 + }, + { + "epoch": 0.06, + "grad_norm": 1.9881677142609333, + "learning_rate": 9.978653032216268e-06, + "loss": 0.4664, + "step": 2019 + }, + { + "epoch": 0.06, + "grad_norm": 1.7812409487305225, + "learning_rate": 9.978609653002001e-06, + "loss": 0.424, + "step": 2020 + }, + { + "epoch": 0.06, + "grad_norm": 1.734927423415657, + "learning_rate": 9.97856622985145e-06, + "loss": 0.6277, + "step": 2021 + }, + { + "epoch": 0.06, + "grad_norm": 1.9670503723016335, + "learning_rate": 9.978522762764997e-06, + "loss": 0.4597, + "step": 2022 + }, + { + "epoch": 0.06, + "grad_norm": 1.9439000947290404, + "learning_rate": 9.978479251743024e-06, + "loss": 0.4927, + "step": 2023 + }, + { + "epoch": 0.06, + "grad_norm": 1.8489456385057113, + "learning_rate": 9.978435696785918e-06, + "loss": 0.4454, + "step": 2024 + }, + { + "epoch": 0.06, + "grad_norm": 1.677348785870781, + "learning_rate": 9.978392097894062e-06, + "loss": 0.4983, + "step": 2025 + }, + { + "epoch": 0.06, + "grad_norm": 1.9049921301041022, + "learning_rate": 9.978348455067836e-06, + "loss": 0.4505, + "step": 2026 + }, + { + "epoch": 0.06, + "grad_norm": 2.106165028154172, + "learning_rate": 9.978304768307633e-06, + "loss": 0.4465, + "step": 2027 + }, + { + "epoch": 0.06, + "grad_norm": 1.842875655595036, + "learning_rate": 9.978261037613837e-06, + "loss": 0.4456, + "step": 2028 + }, + { + "epoch": 0.06, + "grad_norm": 1.7873418298404726, + "learning_rate": 9.978217262986829e-06, + "loss": 0.4488, + "step": 2029 + }, + { + "epoch": 0.06, + "grad_norm": 1.851616050243143, + "learning_rate": 9.978173444427001e-06, + "loss": 0.4826, + "step": 2030 + }, + { + "epoch": 0.06, + "grad_norm": 1.8310096978577017, + "learning_rate": 9.978129581934735e-06, + "loss": 0.4667, + "step": 2031 + }, + { + "epoch": 0.06, + "grad_norm": 2.0882299642361764, + "learning_rate": 9.978085675510421e-06, + "loss": 0.4674, + "step": 2032 + }, + { + "epoch": 0.06, + "grad_norm": 2.4024925200781255, + "learning_rate": 9.978041725154446e-06, + "loss": 0.4779, + "step": 2033 + }, + { + "epoch": 0.06, + "grad_norm": 2.022685590848383, + "learning_rate": 9.977997730867197e-06, + "loss": 0.4589, + "step": 2034 + }, + { + "epoch": 0.06, + "grad_norm": 1.8800939887554433, + "learning_rate": 9.977953692649065e-06, + "loss": 0.4672, + "step": 2035 + }, + { + "epoch": 0.06, + "grad_norm": 1.7846127181541995, + "learning_rate": 9.977909610500434e-06, + "loss": 0.5253, + "step": 2036 + }, + { + "epoch": 0.06, + "grad_norm": 1.9670095005813946, + "learning_rate": 9.977865484421696e-06, + "loss": 0.4596, + "step": 2037 + }, + { + "epoch": 0.06, + "grad_norm": 2.2797631606932063, + "learning_rate": 9.977821314413238e-06, + "loss": 0.4818, + "step": 2038 + }, + { + "epoch": 0.06, + "grad_norm": 2.314619891162864, + "learning_rate": 9.977777100475455e-06, + "loss": 0.4745, + "step": 2039 + }, + { + "epoch": 0.06, + "grad_norm": 1.9111186069332748, + "learning_rate": 9.977732842608732e-06, + "loss": 0.4511, + "step": 2040 + }, + { + "epoch": 0.06, + "grad_norm": 1.8402669539774141, + "learning_rate": 9.977688540813462e-06, + "loss": 0.4847, + "step": 2041 + }, + { + "epoch": 0.06, + "grad_norm": 1.8517647744112653, + "learning_rate": 9.977644195090036e-06, + "loss": 0.4631, + "step": 2042 + }, + { + "epoch": 0.06, + "grad_norm": 2.075264389768318, + "learning_rate": 9.977599805438844e-06, + "loss": 0.4374, + "step": 2043 + }, + { + "epoch": 0.06, + "grad_norm": 1.9373051443151126, + "learning_rate": 9.977555371860279e-06, + "loss": 0.438, + "step": 2044 + }, + { + "epoch": 0.06, + "grad_norm": 1.8296862652733252, + "learning_rate": 9.977510894354731e-06, + "loss": 0.4218, + "step": 2045 + }, + { + "epoch": 0.06, + "grad_norm": 1.8208174630911895, + "learning_rate": 9.977466372922595e-06, + "loss": 0.446, + "step": 2046 + }, + { + "epoch": 0.06, + "grad_norm": 2.762655340669196, + "learning_rate": 9.977421807564264e-06, + "loss": 0.4914, + "step": 2047 + }, + { + "epoch": 0.06, + "grad_norm": 1.7497642999118626, + "learning_rate": 9.977377198280129e-06, + "loss": 0.4448, + "step": 2048 + }, + { + "epoch": 0.06, + "grad_norm": 1.863696609444054, + "learning_rate": 9.977332545070587e-06, + "loss": 0.4583, + "step": 2049 + }, + { + "epoch": 0.06, + "grad_norm": 2.478297857834589, + "learning_rate": 9.977287847936028e-06, + "loss": 0.4607, + "step": 2050 + }, + { + "epoch": 0.06, + "grad_norm": 1.6654735150141808, + "learning_rate": 9.97724310687685e-06, + "loss": 0.5008, + "step": 2051 + }, + { + "epoch": 0.06, + "grad_norm": 1.9818817678760268, + "learning_rate": 9.977198321893446e-06, + "loss": 0.4759, + "step": 2052 + }, + { + "epoch": 0.06, + "grad_norm": 1.739102537341465, + "learning_rate": 9.977153492986211e-06, + "loss": 0.4503, + "step": 2053 + }, + { + "epoch": 0.06, + "grad_norm": 1.931997408182547, + "learning_rate": 9.977108620155542e-06, + "loss": 0.4576, + "step": 2054 + }, + { + "epoch": 0.06, + "grad_norm": 1.9946360928554625, + "learning_rate": 9.977063703401833e-06, + "loss": 0.479, + "step": 2055 + }, + { + "epoch": 0.06, + "grad_norm": 1.6723213052397896, + "learning_rate": 9.977018742725482e-06, + "loss": 0.4277, + "step": 2056 + }, + { + "epoch": 0.06, + "grad_norm": 1.699531205973995, + "learning_rate": 9.976973738126884e-06, + "loss": 0.4735, + "step": 2057 + }, + { + "epoch": 0.06, + "grad_norm": 2.18344734415284, + "learning_rate": 9.97692868960644e-06, + "loss": 0.4505, + "step": 2058 + }, + { + "epoch": 0.06, + "grad_norm": 1.8493358813811438, + "learning_rate": 9.976883597164544e-06, + "loss": 0.464, + "step": 2059 + }, + { + "epoch": 0.06, + "grad_norm": 1.875970235127219, + "learning_rate": 9.976838460801597e-06, + "loss": 0.5366, + "step": 2060 + }, + { + "epoch": 0.06, + "grad_norm": 1.829515118887746, + "learning_rate": 9.976793280517993e-06, + "loss": 0.476, + "step": 2061 + }, + { + "epoch": 0.06, + "grad_norm": 1.781626518482754, + "learning_rate": 9.976748056314134e-06, + "loss": 0.4824, + "step": 2062 + }, + { + "epoch": 0.06, + "grad_norm": 1.7350105146570305, + "learning_rate": 9.976702788190418e-06, + "loss": 0.4533, + "step": 2063 + }, + { + "epoch": 0.06, + "grad_norm": 1.8295828951771589, + "learning_rate": 9.976657476147245e-06, + "loss": 0.4654, + "step": 2064 + }, + { + "epoch": 0.06, + "grad_norm": 2.0833432763770197, + "learning_rate": 9.976612120185013e-06, + "loss": 0.4972, + "step": 2065 + }, + { + "epoch": 0.06, + "grad_norm": 2.370452019589151, + "learning_rate": 9.976566720304125e-06, + "loss": 0.4867, + "step": 2066 + }, + { + "epoch": 0.06, + "grad_norm": 1.750616272784027, + "learning_rate": 9.976521276504982e-06, + "loss": 0.4577, + "step": 2067 + }, + { + "epoch": 0.06, + "grad_norm": 1.8978788955552663, + "learning_rate": 9.976475788787982e-06, + "loss": 0.5033, + "step": 2068 + }, + { + "epoch": 0.06, + "grad_norm": 1.7163703980432927, + "learning_rate": 9.976430257153528e-06, + "loss": 0.4987, + "step": 2069 + }, + { + "epoch": 0.06, + "grad_norm": 2.3743569424986903, + "learning_rate": 9.976384681602023e-06, + "loss": 0.4394, + "step": 2070 + }, + { + "epoch": 0.06, + "grad_norm": 2.109372937850115, + "learning_rate": 9.976339062133866e-06, + "loss": 0.4423, + "step": 2071 + }, + { + "epoch": 0.06, + "grad_norm": 1.8612342850569683, + "learning_rate": 9.976293398749463e-06, + "loss": 0.4866, + "step": 2072 + }, + { + "epoch": 0.06, + "grad_norm": 1.950152703330604, + "learning_rate": 9.976247691449215e-06, + "loss": 0.4609, + "step": 2073 + }, + { + "epoch": 0.06, + "grad_norm": 2.0531532067517326, + "learning_rate": 9.976201940233525e-06, + "loss": 0.4565, + "step": 2074 + }, + { + "epoch": 0.06, + "grad_norm": 1.6852895741386713, + "learning_rate": 9.9761561451028e-06, + "loss": 0.4802, + "step": 2075 + }, + { + "epoch": 0.06, + "grad_norm": 1.6620162473567541, + "learning_rate": 9.976110306057439e-06, + "loss": 0.499, + "step": 2076 + }, + { + "epoch": 0.06, + "grad_norm": 1.7657517452807765, + "learning_rate": 9.97606442309785e-06, + "loss": 0.4744, + "step": 2077 + }, + { + "epoch": 0.06, + "grad_norm": 1.855486202228872, + "learning_rate": 9.976018496224438e-06, + "loss": 0.4948, + "step": 2078 + }, + { + "epoch": 0.06, + "grad_norm": 2.5138560522165556, + "learning_rate": 9.97597252543761e-06, + "loss": 0.4439, + "step": 2079 + }, + { + "epoch": 0.06, + "grad_norm": 1.871045558415501, + "learning_rate": 9.975926510737765e-06, + "loss": 0.4788, + "step": 2080 + }, + { + "epoch": 0.06, + "grad_norm": 1.689429157372301, + "learning_rate": 9.975880452125315e-06, + "loss": 0.4618, + "step": 2081 + }, + { + "epoch": 0.06, + "grad_norm": 1.6727152445648765, + "learning_rate": 9.975834349600665e-06, + "loss": 0.442, + "step": 2082 + }, + { + "epoch": 0.06, + "grad_norm": 1.7226334749121628, + "learning_rate": 9.975788203164223e-06, + "loss": 0.4892, + "step": 2083 + }, + { + "epoch": 0.06, + "grad_norm": 1.8506603322146509, + "learning_rate": 9.975742012816393e-06, + "loss": 0.5039, + "step": 2084 + }, + { + "epoch": 0.06, + "grad_norm": 1.8443546329855793, + "learning_rate": 9.975695778557585e-06, + "loss": 0.4471, + "step": 2085 + }, + { + "epoch": 0.06, + "grad_norm": 2.445837157028649, + "learning_rate": 9.975649500388207e-06, + "loss": 0.4296, + "step": 2086 + }, + { + "epoch": 0.06, + "grad_norm": 2.3987265934624133, + "learning_rate": 9.975603178308667e-06, + "loss": 0.4691, + "step": 2087 + }, + { + "epoch": 0.06, + "grad_norm": 2.0557726305994257, + "learning_rate": 9.975556812319373e-06, + "loss": 0.5169, + "step": 2088 + }, + { + "epoch": 0.06, + "grad_norm": 3.279753042384695, + "learning_rate": 9.975510402420736e-06, + "loss": 0.4797, + "step": 2089 + }, + { + "epoch": 0.06, + "grad_norm": 2.7924361244626317, + "learning_rate": 9.975463948613162e-06, + "loss": 0.4882, + "step": 2090 + }, + { + "epoch": 0.06, + "grad_norm": 1.831441924435789, + "learning_rate": 9.975417450897065e-06, + "loss": 0.4753, + "step": 2091 + }, + { + "epoch": 0.06, + "grad_norm": 2.2306912030883237, + "learning_rate": 9.975370909272854e-06, + "loss": 0.4635, + "step": 2092 + }, + { + "epoch": 0.06, + "grad_norm": 1.8456576617576084, + "learning_rate": 9.97532432374094e-06, + "loss": 0.4837, + "step": 2093 + }, + { + "epoch": 0.06, + "grad_norm": 2.643251548108856, + "learning_rate": 9.975277694301734e-06, + "loss": 0.4736, + "step": 2094 + }, + { + "epoch": 0.06, + "grad_norm": 1.7561672294413604, + "learning_rate": 9.975231020955646e-06, + "loss": 0.4804, + "step": 2095 + }, + { + "epoch": 0.06, + "grad_norm": 1.8553034594469053, + "learning_rate": 9.97518430370309e-06, + "loss": 0.4563, + "step": 2096 + }, + { + "epoch": 0.06, + "grad_norm": 2.1568078122810648, + "learning_rate": 9.975137542544476e-06, + "loss": 0.4668, + "step": 2097 + }, + { + "epoch": 0.06, + "grad_norm": 1.9481697286900623, + "learning_rate": 9.97509073748022e-06, + "loss": 0.5003, + "step": 2098 + }, + { + "epoch": 0.06, + "grad_norm": 1.7925823063206126, + "learning_rate": 9.975043888510732e-06, + "loss": 0.4506, + "step": 2099 + }, + { + "epoch": 0.06, + "grad_norm": 2.796150078078796, + "learning_rate": 9.974996995636425e-06, + "loss": 0.4496, + "step": 2100 + }, + { + "epoch": 0.06, + "grad_norm": 2.0301072232477786, + "learning_rate": 9.974950058857716e-06, + "loss": 0.4758, + "step": 2101 + }, + { + "epoch": 0.06, + "grad_norm": 2.059343516533024, + "learning_rate": 9.974903078175019e-06, + "loss": 0.4871, + "step": 2102 + }, + { + "epoch": 0.06, + "grad_norm": 1.8068600728911501, + "learning_rate": 9.974856053588744e-06, + "loss": 0.4796, + "step": 2103 + }, + { + "epoch": 0.06, + "grad_norm": 1.75177063711296, + "learning_rate": 9.974808985099311e-06, + "loss": 0.4534, + "step": 2104 + }, + { + "epoch": 0.06, + "grad_norm": 1.7150215031645142, + "learning_rate": 9.974761872707132e-06, + "loss": 0.4224, + "step": 2105 + }, + { + "epoch": 0.06, + "grad_norm": 2.177224221379077, + "learning_rate": 9.974714716412626e-06, + "loss": 0.4976, + "step": 2106 + }, + { + "epoch": 0.06, + "grad_norm": 1.6473024929886806, + "learning_rate": 9.974667516216206e-06, + "loss": 0.4277, + "step": 2107 + }, + { + "epoch": 0.06, + "grad_norm": 1.841888894274937, + "learning_rate": 9.97462027211829e-06, + "loss": 0.4782, + "step": 2108 + }, + { + "epoch": 0.06, + "grad_norm": 1.806283558717888, + "learning_rate": 9.974572984119297e-06, + "loss": 0.4698, + "step": 2109 + }, + { + "epoch": 0.06, + "grad_norm": 1.881752831802436, + "learning_rate": 9.97452565221964e-06, + "loss": 0.4803, + "step": 2110 + }, + { + "epoch": 0.06, + "grad_norm": 2.1548739892508677, + "learning_rate": 9.97447827641974e-06, + "loss": 0.5174, + "step": 2111 + }, + { + "epoch": 0.06, + "grad_norm": 5.4927054420273915, + "learning_rate": 9.974430856720015e-06, + "loss": 0.5005, + "step": 2112 + }, + { + "epoch": 0.06, + "grad_norm": 1.8007869685977467, + "learning_rate": 9.97438339312088e-06, + "loss": 0.5015, + "step": 2113 + }, + { + "epoch": 0.06, + "grad_norm": 1.8331427673684304, + "learning_rate": 9.974335885622758e-06, + "loss": 0.4837, + "step": 2114 + }, + { + "epoch": 0.06, + "grad_norm": 1.8699660592758425, + "learning_rate": 9.974288334226066e-06, + "loss": 0.4709, + "step": 2115 + }, + { + "epoch": 0.06, + "grad_norm": 1.745968739667521, + "learning_rate": 9.974240738931224e-06, + "loss": 0.5053, + "step": 2116 + }, + { + "epoch": 0.06, + "grad_norm": 1.195611215678551, + "learning_rate": 9.974193099738654e-06, + "loss": 0.574, + "step": 2117 + }, + { + "epoch": 0.06, + "grad_norm": 13.720233613180628, + "learning_rate": 9.974145416648773e-06, + "loss": 0.4781, + "step": 2118 + }, + { + "epoch": 0.06, + "grad_norm": 2.0854716895843244, + "learning_rate": 9.974097689662003e-06, + "loss": 0.4633, + "step": 2119 + }, + { + "epoch": 0.06, + "grad_norm": 1.923248755926441, + "learning_rate": 9.974049918778768e-06, + "loss": 0.4796, + "step": 2120 + }, + { + "epoch": 0.06, + "grad_norm": 2.9297680779739417, + "learning_rate": 9.974002103999486e-06, + "loss": 0.4569, + "step": 2121 + }, + { + "epoch": 0.06, + "grad_norm": 1.7080263459257021, + "learning_rate": 9.97395424532458e-06, + "loss": 0.4624, + "step": 2122 + }, + { + "epoch": 0.06, + "grad_norm": 1.9406210036432194, + "learning_rate": 9.973906342754473e-06, + "loss": 0.4444, + "step": 2123 + }, + { + "epoch": 0.06, + "grad_norm": 1.9872697788094278, + "learning_rate": 9.973858396289588e-06, + "loss": 0.4607, + "step": 2124 + }, + { + "epoch": 0.06, + "grad_norm": 1.756027946939009, + "learning_rate": 9.973810405930347e-06, + "loss": 0.4465, + "step": 2125 + }, + { + "epoch": 0.06, + "grad_norm": 1.961606698296437, + "learning_rate": 9.973762371677173e-06, + "loss": 0.476, + "step": 2126 + }, + { + "epoch": 0.06, + "grad_norm": 1.9790259329015862, + "learning_rate": 9.973714293530492e-06, + "loss": 0.4831, + "step": 2127 + }, + { + "epoch": 0.06, + "grad_norm": 1.8114479993738644, + "learning_rate": 9.973666171490727e-06, + "loss": 0.4439, + "step": 2128 + }, + { + "epoch": 0.06, + "grad_norm": 1.973761268780145, + "learning_rate": 9.973618005558303e-06, + "loss": 0.4292, + "step": 2129 + }, + { + "epoch": 0.06, + "grad_norm": 1.8298036139783542, + "learning_rate": 9.973569795733644e-06, + "loss": 0.4579, + "step": 2130 + }, + { + "epoch": 0.06, + "grad_norm": 1.8340929777272037, + "learning_rate": 9.973521542017177e-06, + "loss": 0.4874, + "step": 2131 + }, + { + "epoch": 0.06, + "grad_norm": 1.952434816778662, + "learning_rate": 9.973473244409327e-06, + "loss": 0.4341, + "step": 2132 + }, + { + "epoch": 0.06, + "grad_norm": 1.8304668817564265, + "learning_rate": 9.97342490291052e-06, + "loss": 0.4549, + "step": 2133 + }, + { + "epoch": 0.06, + "grad_norm": 1.7391105888262166, + "learning_rate": 9.973376517521186e-06, + "loss": 0.4417, + "step": 2134 + }, + { + "epoch": 0.06, + "grad_norm": 1.9881101884005563, + "learning_rate": 9.973328088241744e-06, + "loss": 0.4803, + "step": 2135 + }, + { + "epoch": 0.06, + "grad_norm": 1.7658994286929968, + "learning_rate": 9.973279615072632e-06, + "loss": 0.485, + "step": 2136 + }, + { + "epoch": 0.06, + "grad_norm": 1.648695268330297, + "learning_rate": 9.97323109801427e-06, + "loss": 0.4978, + "step": 2137 + }, + { + "epoch": 0.06, + "grad_norm": 1.878173085815734, + "learning_rate": 9.973182537067087e-06, + "loss": 0.4502, + "step": 2138 + }, + { + "epoch": 0.06, + "grad_norm": 1.9205619697218022, + "learning_rate": 9.973133932231514e-06, + "loss": 0.4605, + "step": 2139 + }, + { + "epoch": 0.06, + "grad_norm": 2.053729264546475, + "learning_rate": 9.973085283507977e-06, + "loss": 0.4474, + "step": 2140 + }, + { + "epoch": 0.06, + "grad_norm": 2.3887366955668283, + "learning_rate": 9.973036590896909e-06, + "loss": 0.4787, + "step": 2141 + }, + { + "epoch": 0.06, + "grad_norm": 1.7139460120545718, + "learning_rate": 9.97298785439874e-06, + "loss": 0.4742, + "step": 2142 + }, + { + "epoch": 0.06, + "grad_norm": 1.8472262397138723, + "learning_rate": 9.972939074013894e-06, + "loss": 0.5039, + "step": 2143 + }, + { + "epoch": 0.06, + "grad_norm": 2.3351736682817252, + "learning_rate": 9.972890249742808e-06, + "loss": 0.5095, + "step": 2144 + }, + { + "epoch": 0.06, + "grad_norm": 1.812978311970842, + "learning_rate": 9.97284138158591e-06, + "loss": 0.4705, + "step": 2145 + }, + { + "epoch": 0.06, + "grad_norm": 1.7806004465166534, + "learning_rate": 9.972792469543631e-06, + "loss": 0.5116, + "step": 2146 + }, + { + "epoch": 0.06, + "grad_norm": 1.7399983062675168, + "learning_rate": 9.972743513616403e-06, + "loss": 0.4603, + "step": 2147 + }, + { + "epoch": 0.06, + "grad_norm": 2.21191010849355, + "learning_rate": 9.972694513804659e-06, + "loss": 0.4708, + "step": 2148 + }, + { + "epoch": 0.06, + "grad_norm": 1.7848042112195643, + "learning_rate": 9.97264547010883e-06, + "loss": 0.4612, + "step": 2149 + }, + { + "epoch": 0.06, + "grad_norm": 1.8956824713342122, + "learning_rate": 9.972596382529351e-06, + "loss": 0.4513, + "step": 2150 + }, + { + "epoch": 0.06, + "grad_norm": 2.088067719017445, + "learning_rate": 9.972547251066652e-06, + "loss": 0.4661, + "step": 2151 + }, + { + "epoch": 0.06, + "grad_norm": 2.112942770588152, + "learning_rate": 9.97249807572117e-06, + "loss": 0.4737, + "step": 2152 + }, + { + "epoch": 0.06, + "grad_norm": 1.8836493753640806, + "learning_rate": 9.972448856493337e-06, + "loss": 0.4559, + "step": 2153 + }, + { + "epoch": 0.06, + "grad_norm": 3.665119332525329, + "learning_rate": 9.972399593383588e-06, + "loss": 0.548, + "step": 2154 + }, + { + "epoch": 0.06, + "grad_norm": 2.0571541467662486, + "learning_rate": 9.972350286392356e-06, + "loss": 0.5341, + "step": 2155 + }, + { + "epoch": 0.06, + "grad_norm": 2.4338859718329284, + "learning_rate": 9.97230093552008e-06, + "loss": 0.4206, + "step": 2156 + }, + { + "epoch": 0.06, + "grad_norm": 2.1250128054468744, + "learning_rate": 9.972251540767191e-06, + "loss": 0.4509, + "step": 2157 + }, + { + "epoch": 0.06, + "grad_norm": 1.8318043841541263, + "learning_rate": 9.972202102134128e-06, + "loss": 0.4696, + "step": 2158 + }, + { + "epoch": 0.06, + "grad_norm": 2.0576345571653434, + "learning_rate": 9.972152619621324e-06, + "loss": 0.4576, + "step": 2159 + }, + { + "epoch": 0.06, + "grad_norm": 3.9335426026542653, + "learning_rate": 9.97210309322922e-06, + "loss": 0.4462, + "step": 2160 + }, + { + "epoch": 0.06, + "grad_norm": 1.7873252576783487, + "learning_rate": 9.972053522958252e-06, + "loss": 0.4526, + "step": 2161 + }, + { + "epoch": 0.06, + "grad_norm": 1.902496245983206, + "learning_rate": 9.972003908808854e-06, + "loss": 0.4738, + "step": 2162 + }, + { + "epoch": 0.06, + "grad_norm": 1.9893773483358226, + "learning_rate": 9.971954250781468e-06, + "loss": 0.4945, + "step": 2163 + }, + { + "epoch": 0.06, + "grad_norm": 2.367603852751861, + "learning_rate": 9.97190454887653e-06, + "loss": 0.4594, + "step": 2164 + }, + { + "epoch": 0.06, + "grad_norm": 1.7869829274958142, + "learning_rate": 9.971854803094481e-06, + "loss": 0.4465, + "step": 2165 + }, + { + "epoch": 0.06, + "grad_norm": 1.8030392024623665, + "learning_rate": 9.971805013435756e-06, + "loss": 0.4756, + "step": 2166 + }, + { + "epoch": 0.06, + "grad_norm": 1.848203409150631, + "learning_rate": 9.971755179900798e-06, + "loss": 0.435, + "step": 2167 + }, + { + "epoch": 0.06, + "grad_norm": 1.7229582515038795, + "learning_rate": 9.971705302490043e-06, + "loss": 0.4147, + "step": 2168 + }, + { + "epoch": 0.06, + "grad_norm": 1.750957934529507, + "learning_rate": 9.971655381203935e-06, + "loss": 0.4793, + "step": 2169 + }, + { + "epoch": 0.06, + "grad_norm": 1.6129216878803385, + "learning_rate": 9.971605416042914e-06, + "loss": 0.4245, + "step": 2170 + }, + { + "epoch": 0.06, + "grad_norm": 1.8060886993353327, + "learning_rate": 9.971555407007418e-06, + "loss": 0.4288, + "step": 2171 + }, + { + "epoch": 0.06, + "grad_norm": 1.8319339372136718, + "learning_rate": 9.971505354097891e-06, + "loss": 0.4532, + "step": 2172 + }, + { + "epoch": 0.06, + "grad_norm": 2.9573965443814645, + "learning_rate": 9.971455257314777e-06, + "loss": 0.5346, + "step": 2173 + }, + { + "epoch": 0.06, + "grad_norm": 1.77765211813073, + "learning_rate": 9.971405116658512e-06, + "loss": 0.472, + "step": 2174 + }, + { + "epoch": 0.06, + "grad_norm": 1.869706953932632, + "learning_rate": 9.971354932129543e-06, + "loss": 0.4736, + "step": 2175 + }, + { + "epoch": 0.06, + "grad_norm": 2.1414727603216193, + "learning_rate": 9.971304703728311e-06, + "loss": 0.4507, + "step": 2176 + }, + { + "epoch": 0.06, + "grad_norm": 1.7507093055246623, + "learning_rate": 9.971254431455261e-06, + "loss": 0.4648, + "step": 2177 + }, + { + "epoch": 0.06, + "grad_norm": 1.7490970376563644, + "learning_rate": 9.971204115310833e-06, + "loss": 0.4099, + "step": 2178 + }, + { + "epoch": 0.06, + "grad_norm": 1.936539200829602, + "learning_rate": 9.971153755295476e-06, + "loss": 0.4653, + "step": 2179 + }, + { + "epoch": 0.06, + "grad_norm": 1.602016386560658, + "learning_rate": 9.971103351409633e-06, + "loss": 0.4528, + "step": 2180 + }, + { + "epoch": 0.06, + "grad_norm": 1.2421661751740327, + "learning_rate": 9.971052903653744e-06, + "loss": 0.5627, + "step": 2181 + }, + { + "epoch": 0.06, + "grad_norm": 1.6215533084265148, + "learning_rate": 9.97100241202826e-06, + "loss": 0.4702, + "step": 2182 + }, + { + "epoch": 0.06, + "grad_norm": 2.1334185405310233, + "learning_rate": 9.970951876533625e-06, + "loss": 0.461, + "step": 2183 + }, + { + "epoch": 0.06, + "grad_norm": 1.743738704793325, + "learning_rate": 9.970901297170284e-06, + "loss": 0.4578, + "step": 2184 + }, + { + "epoch": 0.06, + "grad_norm": 1.7078836621625761, + "learning_rate": 9.970850673938684e-06, + "loss": 0.4805, + "step": 2185 + }, + { + "epoch": 0.06, + "grad_norm": 1.9283218892967695, + "learning_rate": 9.97080000683927e-06, + "loss": 0.429, + "step": 2186 + }, + { + "epoch": 0.06, + "grad_norm": 2.21571599770392, + "learning_rate": 9.970749295872493e-06, + "loss": 0.4802, + "step": 2187 + }, + { + "epoch": 0.06, + "grad_norm": 1.8142933988022516, + "learning_rate": 9.970698541038797e-06, + "loss": 0.4655, + "step": 2188 + }, + { + "epoch": 0.06, + "grad_norm": 1.1304417345652498, + "learning_rate": 9.970647742338632e-06, + "loss": 0.6045, + "step": 2189 + }, + { + "epoch": 0.06, + "grad_norm": 17.32167962098194, + "learning_rate": 9.970596899772447e-06, + "loss": 0.4738, + "step": 2190 + }, + { + "epoch": 0.06, + "grad_norm": 2.5082072561030704, + "learning_rate": 9.970546013340687e-06, + "loss": 0.5452, + "step": 2191 + }, + { + "epoch": 0.06, + "grad_norm": 1.9218978793501853, + "learning_rate": 9.970495083043805e-06, + "loss": 0.4921, + "step": 2192 + }, + { + "epoch": 0.06, + "grad_norm": 1.867297373633111, + "learning_rate": 9.970444108882247e-06, + "loss": 0.4536, + "step": 2193 + }, + { + "epoch": 0.06, + "grad_norm": 1.9699188648325459, + "learning_rate": 9.970393090856465e-06, + "loss": 0.4419, + "step": 2194 + }, + { + "epoch": 0.06, + "grad_norm": 2.489096899080414, + "learning_rate": 9.970342028966909e-06, + "loss": 0.4889, + "step": 2195 + }, + { + "epoch": 0.06, + "grad_norm": 3.8368724454161294, + "learning_rate": 9.97029092321403e-06, + "loss": 0.4376, + "step": 2196 + }, + { + "epoch": 0.06, + "grad_norm": 1.016904262230505, + "learning_rate": 9.970239773598279e-06, + "loss": 0.5542, + "step": 2197 + }, + { + "epoch": 0.06, + "grad_norm": 2.327849246736984, + "learning_rate": 9.970188580120105e-06, + "loss": 0.4852, + "step": 2198 + }, + { + "epoch": 0.06, + "grad_norm": 2.122990104665241, + "learning_rate": 9.970137342779962e-06, + "loss": 0.4911, + "step": 2199 + }, + { + "epoch": 0.06, + "grad_norm": 1.7196811584976393, + "learning_rate": 9.970086061578302e-06, + "loss": 0.5115, + "step": 2200 + }, + { + "epoch": 0.06, + "grad_norm": 1.8160652541017983, + "learning_rate": 9.970034736515579e-06, + "loss": 0.4717, + "step": 2201 + }, + { + "epoch": 0.06, + "grad_norm": 1.8536183803679902, + "learning_rate": 9.969983367592243e-06, + "loss": 0.4263, + "step": 2202 + }, + { + "epoch": 0.06, + "grad_norm": 1.7490431573035634, + "learning_rate": 9.969931954808748e-06, + "loss": 0.4608, + "step": 2203 + }, + { + "epoch": 0.06, + "grad_norm": 2.9238102734335305, + "learning_rate": 9.96988049816555e-06, + "loss": 0.4707, + "step": 2204 + }, + { + "epoch": 0.06, + "grad_norm": 1.7827912309890706, + "learning_rate": 9.969828997663099e-06, + "loss": 0.4603, + "step": 2205 + }, + { + "epoch": 0.06, + "grad_norm": 2.299694272897853, + "learning_rate": 9.969777453301855e-06, + "loss": 0.485, + "step": 2206 + }, + { + "epoch": 0.06, + "grad_norm": 1.7170086213632267, + "learning_rate": 9.969725865082268e-06, + "loss": 0.4686, + "step": 2207 + }, + { + "epoch": 0.06, + "grad_norm": 1.7711542998945466, + "learning_rate": 9.969674233004794e-06, + "loss": 0.4822, + "step": 2208 + }, + { + "epoch": 0.06, + "grad_norm": 1.865394489880361, + "learning_rate": 9.969622557069892e-06, + "loss": 0.4395, + "step": 2209 + }, + { + "epoch": 0.06, + "grad_norm": 2.145911201596758, + "learning_rate": 9.969570837278016e-06, + "loss": 0.4636, + "step": 2210 + }, + { + "epoch": 0.06, + "grad_norm": 1.88201710076015, + "learning_rate": 9.96951907362962e-06, + "loss": 0.4609, + "step": 2211 + }, + { + "epoch": 0.06, + "grad_norm": 1.9035793900256484, + "learning_rate": 9.969467266125165e-06, + "loss": 0.5063, + "step": 2212 + }, + { + "epoch": 0.06, + "grad_norm": 1.8997789512563867, + "learning_rate": 9.969415414765105e-06, + "loss": 0.4649, + "step": 2213 + }, + { + "epoch": 0.06, + "grad_norm": 1.6429096597472237, + "learning_rate": 9.9693635195499e-06, + "loss": 0.494, + "step": 2214 + }, + { + "epoch": 0.06, + "grad_norm": 1.6352785781698933, + "learning_rate": 9.969311580480005e-06, + "loss": 0.4644, + "step": 2215 + }, + { + "epoch": 0.06, + "grad_norm": 1.7048784239679253, + "learning_rate": 9.969259597555882e-06, + "loss": 0.4851, + "step": 2216 + }, + { + "epoch": 0.06, + "grad_norm": 2.0083049679429665, + "learning_rate": 9.969207570777988e-06, + "loss": 0.4544, + "step": 2217 + }, + { + "epoch": 0.06, + "grad_norm": 1.8654850825467426, + "learning_rate": 9.969155500146782e-06, + "loss": 0.4848, + "step": 2218 + }, + { + "epoch": 0.06, + "grad_norm": 2.208725518196358, + "learning_rate": 9.969103385662721e-06, + "loss": 0.4745, + "step": 2219 + }, + { + "epoch": 0.06, + "grad_norm": 1.6790432098825745, + "learning_rate": 9.969051227326269e-06, + "loss": 0.4641, + "step": 2220 + }, + { + "epoch": 0.06, + "grad_norm": 2.2253546202956973, + "learning_rate": 9.968999025137886e-06, + "loss": 0.4438, + "step": 2221 + }, + { + "epoch": 0.06, + "grad_norm": 1.7034598975745183, + "learning_rate": 9.96894677909803e-06, + "loss": 0.4597, + "step": 2222 + }, + { + "epoch": 0.06, + "grad_norm": 2.189270989033699, + "learning_rate": 9.968894489207164e-06, + "loss": 0.4898, + "step": 2223 + }, + { + "epoch": 0.06, + "grad_norm": 1.6923824892870656, + "learning_rate": 9.96884215546575e-06, + "loss": 0.4907, + "step": 2224 + }, + { + "epoch": 0.06, + "grad_norm": 1.63223539626723, + "learning_rate": 9.968789777874249e-06, + "loss": 0.4636, + "step": 2225 + }, + { + "epoch": 0.06, + "grad_norm": 1.6712656641988872, + "learning_rate": 9.968737356433122e-06, + "loss": 0.4761, + "step": 2226 + }, + { + "epoch": 0.06, + "grad_norm": 1.239528119100188, + "learning_rate": 9.968684891142833e-06, + "loss": 0.5974, + "step": 2227 + }, + { + "epoch": 0.06, + "grad_norm": 1.6866905777233263, + "learning_rate": 9.968632382003845e-06, + "loss": 0.4648, + "step": 2228 + }, + { + "epoch": 0.06, + "grad_norm": 1.7075876039102222, + "learning_rate": 9.96857982901662e-06, + "loss": 0.4733, + "step": 2229 + }, + { + "epoch": 0.06, + "grad_norm": 1.6640840372673742, + "learning_rate": 9.968527232181624e-06, + "loss": 0.4724, + "step": 2230 + }, + { + "epoch": 0.06, + "grad_norm": 1.5320703403677698, + "learning_rate": 9.96847459149932e-06, + "loss": 0.4545, + "step": 2231 + }, + { + "epoch": 0.06, + "grad_norm": 1.6111338405856621, + "learning_rate": 9.968421906970171e-06, + "loss": 0.4641, + "step": 2232 + }, + { + "epoch": 0.06, + "grad_norm": 1.6131082444237013, + "learning_rate": 9.968369178594645e-06, + "loss": 0.4645, + "step": 2233 + }, + { + "epoch": 0.06, + "grad_norm": 1.6847911682109185, + "learning_rate": 9.968316406373206e-06, + "loss": 0.4716, + "step": 2234 + }, + { + "epoch": 0.06, + "grad_norm": 1.7735535426408884, + "learning_rate": 9.96826359030632e-06, + "loss": 0.5003, + "step": 2235 + }, + { + "epoch": 0.06, + "grad_norm": 2.0084714931477947, + "learning_rate": 9.968210730394452e-06, + "loss": 0.4646, + "step": 2236 + }, + { + "epoch": 0.06, + "grad_norm": 1.696719324857034, + "learning_rate": 9.968157826638069e-06, + "loss": 0.4567, + "step": 2237 + }, + { + "epoch": 0.06, + "grad_norm": 1.6858787478280692, + "learning_rate": 9.968104879037638e-06, + "loss": 0.4407, + "step": 2238 + }, + { + "epoch": 0.06, + "grad_norm": 1.926068293099498, + "learning_rate": 9.968051887593626e-06, + "loss": 0.3942, + "step": 2239 + }, + { + "epoch": 0.06, + "grad_norm": 1.6843012018952688, + "learning_rate": 9.967998852306501e-06, + "loss": 0.4376, + "step": 2240 + }, + { + "epoch": 0.06, + "grad_norm": 1.6699485337987139, + "learning_rate": 9.967945773176732e-06, + "loss": 0.4621, + "step": 2241 + }, + { + "epoch": 0.07, + "grad_norm": 1.725135297219596, + "learning_rate": 9.967892650204787e-06, + "loss": 0.4494, + "step": 2242 + }, + { + "epoch": 0.07, + "grad_norm": 1.8020988623125278, + "learning_rate": 9.967839483391133e-06, + "loss": 0.4314, + "step": 2243 + }, + { + "epoch": 0.07, + "grad_norm": 1.997720086525518, + "learning_rate": 9.967786272736242e-06, + "loss": 0.4292, + "step": 2244 + }, + { + "epoch": 0.07, + "grad_norm": 1.6479400083962774, + "learning_rate": 9.96773301824058e-06, + "loss": 0.452, + "step": 2245 + }, + { + "epoch": 0.07, + "grad_norm": 3.368393292635591, + "learning_rate": 9.967679719904619e-06, + "loss": 0.4574, + "step": 2246 + }, + { + "epoch": 0.07, + "grad_norm": 1.6330750662174933, + "learning_rate": 9.96762637772883e-06, + "loss": 0.4405, + "step": 2247 + }, + { + "epoch": 0.07, + "grad_norm": 1.8512341274952593, + "learning_rate": 9.967572991713684e-06, + "loss": 0.4918, + "step": 2248 + }, + { + "epoch": 0.07, + "grad_norm": 2.5967663839841735, + "learning_rate": 9.96751956185965e-06, + "loss": 0.5368, + "step": 2249 + }, + { + "epoch": 0.07, + "grad_norm": 1.675728795193932, + "learning_rate": 9.967466088167201e-06, + "loss": 0.4584, + "step": 2250 + }, + { + "epoch": 0.07, + "grad_norm": 1.7541228430688067, + "learning_rate": 9.96741257063681e-06, + "loss": 0.4806, + "step": 2251 + }, + { + "epoch": 0.07, + "grad_norm": 1.642464683394851, + "learning_rate": 9.967359009268946e-06, + "loss": 0.4608, + "step": 2252 + }, + { + "epoch": 0.07, + "grad_norm": 1.6286181103809914, + "learning_rate": 9.967305404064085e-06, + "loss": 0.461, + "step": 2253 + }, + { + "epoch": 0.07, + "grad_norm": 1.1472827805482517, + "learning_rate": 9.967251755022697e-06, + "loss": 0.5763, + "step": 2254 + }, + { + "epoch": 0.07, + "grad_norm": 1.7456989391092796, + "learning_rate": 9.96719806214526e-06, + "loss": 0.4492, + "step": 2255 + }, + { + "epoch": 0.07, + "grad_norm": 1.8020574483135534, + "learning_rate": 9.967144325432244e-06, + "loss": 0.4609, + "step": 2256 + }, + { + "epoch": 0.07, + "grad_norm": 1.6176834115418344, + "learning_rate": 9.967090544884122e-06, + "loss": 0.4332, + "step": 2257 + }, + { + "epoch": 0.07, + "grad_norm": 1.7028234957864985, + "learning_rate": 9.967036720501373e-06, + "loss": 0.4651, + "step": 2258 + }, + { + "epoch": 0.07, + "grad_norm": 1.561504766182147, + "learning_rate": 9.96698285228447e-06, + "loss": 0.4395, + "step": 2259 + }, + { + "epoch": 0.07, + "grad_norm": 1.7949915259949387, + "learning_rate": 9.966928940233886e-06, + "loss": 0.431, + "step": 2260 + }, + { + "epoch": 0.07, + "grad_norm": 1.8393664260696687, + "learning_rate": 9.966874984350102e-06, + "loss": 0.4947, + "step": 2261 + }, + { + "epoch": 0.07, + "grad_norm": 1.7917749143945072, + "learning_rate": 9.966820984633589e-06, + "loss": 0.4532, + "step": 2262 + }, + { + "epoch": 0.07, + "grad_norm": 2.0040636481839194, + "learning_rate": 9.966766941084827e-06, + "loss": 0.4898, + "step": 2263 + }, + { + "epoch": 0.07, + "grad_norm": 1.6173209193854168, + "learning_rate": 9.96671285370429e-06, + "loss": 0.5332, + "step": 2264 + }, + { + "epoch": 0.07, + "grad_norm": 2.3228499166238903, + "learning_rate": 9.966658722492459e-06, + "loss": 0.4422, + "step": 2265 + }, + { + "epoch": 0.07, + "grad_norm": 1.8377703015109152, + "learning_rate": 9.96660454744981e-06, + "loss": 0.4803, + "step": 2266 + }, + { + "epoch": 0.07, + "grad_norm": 1.6158244732484193, + "learning_rate": 9.966550328576818e-06, + "loss": 0.4764, + "step": 2267 + }, + { + "epoch": 0.07, + "grad_norm": 1.901031626108363, + "learning_rate": 9.966496065873965e-06, + "loss": 0.4385, + "step": 2268 + }, + { + "epoch": 0.07, + "grad_norm": 1.7616457903444622, + "learning_rate": 9.96644175934173e-06, + "loss": 0.4335, + "step": 2269 + }, + { + "epoch": 0.07, + "grad_norm": 1.784493919551032, + "learning_rate": 9.966387408980591e-06, + "loss": 0.4712, + "step": 2270 + }, + { + "epoch": 0.07, + "grad_norm": 1.5647752909845958, + "learning_rate": 9.96633301479103e-06, + "loss": 0.4542, + "step": 2271 + }, + { + "epoch": 0.07, + "grad_norm": 1.781680162393068, + "learning_rate": 9.966278576773522e-06, + "loss": 0.5421, + "step": 2272 + }, + { + "epoch": 0.07, + "grad_norm": 1.6098915544891035, + "learning_rate": 9.966224094928551e-06, + "loss": 0.4847, + "step": 2273 + }, + { + "epoch": 0.07, + "grad_norm": 1.7055142766053752, + "learning_rate": 9.966169569256598e-06, + "loss": 0.4931, + "step": 2274 + }, + { + "epoch": 0.07, + "grad_norm": 1.6203576361874628, + "learning_rate": 9.966114999758145e-06, + "loss": 0.4456, + "step": 2275 + }, + { + "epoch": 0.07, + "grad_norm": 1.609856931072219, + "learning_rate": 9.966060386433671e-06, + "loss": 0.434, + "step": 2276 + }, + { + "epoch": 0.07, + "grad_norm": 1.7775590279716165, + "learning_rate": 9.966005729283658e-06, + "loss": 0.5012, + "step": 2277 + }, + { + "epoch": 0.07, + "grad_norm": 1.6671339736380308, + "learning_rate": 9.96595102830859e-06, + "loss": 0.4406, + "step": 2278 + }, + { + "epoch": 0.07, + "grad_norm": 1.910928648867774, + "learning_rate": 9.965896283508949e-06, + "loss": 0.4837, + "step": 2279 + }, + { + "epoch": 0.07, + "grad_norm": 1.997264023982475, + "learning_rate": 9.965841494885218e-06, + "loss": 0.4614, + "step": 2280 + }, + { + "epoch": 0.07, + "grad_norm": 1.6783197447483054, + "learning_rate": 9.96578666243788e-06, + "loss": 0.4699, + "step": 2281 + }, + { + "epoch": 0.07, + "grad_norm": 2.3547473020615275, + "learning_rate": 9.965731786167421e-06, + "loss": 0.4739, + "step": 2282 + }, + { + "epoch": 0.07, + "grad_norm": 1.7463682016469295, + "learning_rate": 9.965676866074324e-06, + "loss": 0.4645, + "step": 2283 + }, + { + "epoch": 0.07, + "grad_norm": 1.6920889825369494, + "learning_rate": 9.965621902159074e-06, + "loss": 0.4481, + "step": 2284 + }, + { + "epoch": 0.07, + "grad_norm": 1.8041237185339587, + "learning_rate": 9.965566894422152e-06, + "loss": 0.4847, + "step": 2285 + }, + { + "epoch": 0.07, + "grad_norm": 1.9138336559053446, + "learning_rate": 9.96551184286405e-06, + "loss": 0.4549, + "step": 2286 + }, + { + "epoch": 0.07, + "grad_norm": 1.7055222479152883, + "learning_rate": 9.965456747485252e-06, + "loss": 0.4828, + "step": 2287 + }, + { + "epoch": 0.07, + "grad_norm": 1.7137300215950297, + "learning_rate": 9.96540160828624e-06, + "loss": 0.5123, + "step": 2288 + }, + { + "epoch": 0.07, + "grad_norm": 1.699501498223891, + "learning_rate": 9.965346425267506e-06, + "loss": 0.4345, + "step": 2289 + }, + { + "epoch": 0.07, + "grad_norm": 1.579908985902771, + "learning_rate": 9.965291198429534e-06, + "loss": 0.4833, + "step": 2290 + }, + { + "epoch": 0.07, + "grad_norm": 1.7942883742564015, + "learning_rate": 9.965235927772812e-06, + "loss": 0.4508, + "step": 2291 + }, + { + "epoch": 0.07, + "grad_norm": 2.0824624105160003, + "learning_rate": 9.965180613297828e-06, + "loss": 0.4557, + "step": 2292 + }, + { + "epoch": 0.07, + "grad_norm": 1.6218790588028558, + "learning_rate": 9.96512525500507e-06, + "loss": 0.4777, + "step": 2293 + }, + { + "epoch": 0.07, + "grad_norm": 1.6058414587034622, + "learning_rate": 9.965069852895028e-06, + "loss": 0.4463, + "step": 2294 + }, + { + "epoch": 0.07, + "grad_norm": 1.5871798806303739, + "learning_rate": 9.965014406968188e-06, + "loss": 0.433, + "step": 2295 + }, + { + "epoch": 0.07, + "grad_norm": 1.6423188839608722, + "learning_rate": 9.96495891722504e-06, + "loss": 0.4664, + "step": 2296 + }, + { + "epoch": 0.07, + "grad_norm": 1.8709773682049224, + "learning_rate": 9.964903383666076e-06, + "loss": 0.4618, + "step": 2297 + }, + { + "epoch": 0.07, + "grad_norm": 1.9093341960767691, + "learning_rate": 9.964847806291784e-06, + "loss": 0.5043, + "step": 2298 + }, + { + "epoch": 0.07, + "grad_norm": 1.7166098519549768, + "learning_rate": 9.964792185102653e-06, + "loss": 0.4744, + "step": 2299 + }, + { + "epoch": 0.07, + "grad_norm": 2.2620054081397356, + "learning_rate": 9.96473652009918e-06, + "loss": 0.4683, + "step": 2300 + }, + { + "epoch": 0.07, + "grad_norm": 1.5951013809836823, + "learning_rate": 9.96468081128185e-06, + "loss": 0.4606, + "step": 2301 + }, + { + "epoch": 0.07, + "grad_norm": 1.9513419119815674, + "learning_rate": 9.964625058651156e-06, + "loss": 0.4901, + "step": 2302 + }, + { + "epoch": 0.07, + "grad_norm": 1.6412755807807822, + "learning_rate": 9.964569262207592e-06, + "loss": 0.4693, + "step": 2303 + }, + { + "epoch": 0.07, + "grad_norm": 1.211655340126906, + "learning_rate": 9.96451342195165e-06, + "loss": 0.5884, + "step": 2304 + }, + { + "epoch": 0.07, + "grad_norm": 1.6871127849283674, + "learning_rate": 9.96445753788382e-06, + "loss": 0.4444, + "step": 2305 + }, + { + "epoch": 0.07, + "grad_norm": 1.6995124562691095, + "learning_rate": 9.964401610004597e-06, + "loss": 0.4501, + "step": 2306 + }, + { + "epoch": 0.07, + "grad_norm": 1.6760812420738815, + "learning_rate": 9.964345638314476e-06, + "loss": 0.4999, + "step": 2307 + }, + { + "epoch": 0.07, + "grad_norm": 2.1032884415002786, + "learning_rate": 9.96428962281395e-06, + "loss": 0.436, + "step": 2308 + }, + { + "epoch": 0.07, + "grad_norm": 1.7835106241911372, + "learning_rate": 9.964233563503512e-06, + "loss": 0.461, + "step": 2309 + }, + { + "epoch": 0.07, + "grad_norm": 1.6043807272987851, + "learning_rate": 9.964177460383658e-06, + "loss": 0.4461, + "step": 2310 + }, + { + "epoch": 0.07, + "grad_norm": 2.39469151184607, + "learning_rate": 9.964121313454882e-06, + "loss": 0.4611, + "step": 2311 + }, + { + "epoch": 0.07, + "grad_norm": 1.7546959277492336, + "learning_rate": 9.96406512271768e-06, + "loss": 0.5023, + "step": 2312 + }, + { + "epoch": 0.07, + "grad_norm": 1.8900745700599741, + "learning_rate": 9.96400888817255e-06, + "loss": 0.5144, + "step": 2313 + }, + { + "epoch": 0.07, + "grad_norm": 2.1193954657491862, + "learning_rate": 9.963952609819987e-06, + "loss": 0.4857, + "step": 2314 + }, + { + "epoch": 0.07, + "grad_norm": 1.7618005283236033, + "learning_rate": 9.963896287660487e-06, + "loss": 0.4995, + "step": 2315 + }, + { + "epoch": 0.07, + "grad_norm": 1.6741919451208904, + "learning_rate": 9.963839921694545e-06, + "loss": 0.4654, + "step": 2316 + }, + { + "epoch": 0.07, + "grad_norm": 1.9216989118703667, + "learning_rate": 9.963783511922662e-06, + "loss": 0.4297, + "step": 2317 + }, + { + "epoch": 0.07, + "grad_norm": 1.7691262212051047, + "learning_rate": 9.963727058345333e-06, + "loss": 0.5216, + "step": 2318 + }, + { + "epoch": 0.07, + "grad_norm": 1.7172924916108316, + "learning_rate": 9.963670560963059e-06, + "loss": 0.4634, + "step": 2319 + }, + { + "epoch": 0.07, + "grad_norm": 1.6903076682875289, + "learning_rate": 9.963614019776338e-06, + "loss": 0.4962, + "step": 2320 + }, + { + "epoch": 0.07, + "grad_norm": 2.3468262895825567, + "learning_rate": 9.963557434785667e-06, + "loss": 0.4088, + "step": 2321 + }, + { + "epoch": 0.07, + "grad_norm": 1.8094462332066696, + "learning_rate": 9.963500805991547e-06, + "loss": 0.4488, + "step": 2322 + }, + { + "epoch": 0.07, + "grad_norm": 1.8904230561826976, + "learning_rate": 9.963444133394478e-06, + "loss": 0.4358, + "step": 2323 + }, + { + "epoch": 0.07, + "grad_norm": 1.9671301796620788, + "learning_rate": 9.963387416994958e-06, + "loss": 0.456, + "step": 2324 + }, + { + "epoch": 0.07, + "grad_norm": 1.5963746429504662, + "learning_rate": 9.963330656793491e-06, + "loss": 0.4423, + "step": 2325 + }, + { + "epoch": 0.07, + "grad_norm": 1.7761733401246678, + "learning_rate": 9.963273852790574e-06, + "loss": 0.4705, + "step": 2326 + }, + { + "epoch": 0.07, + "grad_norm": 1.2061458357814894, + "learning_rate": 9.963217004986711e-06, + "loss": 0.5904, + "step": 2327 + }, + { + "epoch": 0.07, + "grad_norm": 1.9105440895540735, + "learning_rate": 9.963160113382403e-06, + "loss": 0.4932, + "step": 2328 + }, + { + "epoch": 0.07, + "grad_norm": 1.7921068567236635, + "learning_rate": 9.96310317797815e-06, + "loss": 0.4442, + "step": 2329 + }, + { + "epoch": 0.07, + "grad_norm": 1.6282433994311336, + "learning_rate": 9.963046198774459e-06, + "loss": 0.4897, + "step": 2330 + }, + { + "epoch": 0.07, + "grad_norm": 1.674393232880987, + "learning_rate": 9.96298917577183e-06, + "loss": 0.4166, + "step": 2331 + }, + { + "epoch": 0.07, + "grad_norm": 2.214432352714782, + "learning_rate": 9.962932108970765e-06, + "loss": 0.4697, + "step": 2332 + }, + { + "epoch": 0.07, + "grad_norm": 1.8544168083254222, + "learning_rate": 9.96287499837177e-06, + "loss": 0.4904, + "step": 2333 + }, + { + "epoch": 0.07, + "grad_norm": 1.7704947426080075, + "learning_rate": 9.962817843975347e-06, + "loss": 0.4582, + "step": 2334 + }, + { + "epoch": 0.07, + "grad_norm": 1.677780988533769, + "learning_rate": 9.962760645782003e-06, + "loss": 0.4663, + "step": 2335 + }, + { + "epoch": 0.07, + "grad_norm": 1.81343332703445, + "learning_rate": 9.96270340379224e-06, + "loss": 0.4616, + "step": 2336 + }, + { + "epoch": 0.07, + "grad_norm": 1.7498873661168455, + "learning_rate": 9.962646118006564e-06, + "loss": 0.4867, + "step": 2337 + }, + { + "epoch": 0.07, + "grad_norm": 1.661137182142362, + "learning_rate": 9.962588788425483e-06, + "loss": 0.4763, + "step": 2338 + }, + { + "epoch": 0.07, + "grad_norm": 1.5852081715757405, + "learning_rate": 9.9625314150495e-06, + "loss": 0.4441, + "step": 2339 + }, + { + "epoch": 0.07, + "grad_norm": 1.119095479262732, + "learning_rate": 9.962473997879123e-06, + "loss": 0.5699, + "step": 2340 + }, + { + "epoch": 0.07, + "grad_norm": 1.8521308108693129, + "learning_rate": 9.962416536914856e-06, + "loss": 0.4997, + "step": 2341 + }, + { + "epoch": 0.07, + "grad_norm": 1.6884913160219395, + "learning_rate": 9.962359032157209e-06, + "loss": 0.4399, + "step": 2342 + }, + { + "epoch": 0.07, + "grad_norm": 1.5857952378600104, + "learning_rate": 9.962301483606689e-06, + "loss": 0.4392, + "step": 2343 + }, + { + "epoch": 0.07, + "grad_norm": 3.355725409177224, + "learning_rate": 9.962243891263804e-06, + "loss": 0.4663, + "step": 2344 + }, + { + "epoch": 0.07, + "grad_norm": 1.8239177142536354, + "learning_rate": 9.962186255129059e-06, + "loss": 0.4882, + "step": 2345 + }, + { + "epoch": 0.07, + "grad_norm": 1.780076427515126, + "learning_rate": 9.962128575202967e-06, + "loss": 0.4587, + "step": 2346 + }, + { + "epoch": 0.07, + "grad_norm": 1.7435079702170733, + "learning_rate": 9.962070851486034e-06, + "loss": 0.4236, + "step": 2347 + }, + { + "epoch": 0.07, + "grad_norm": 1.9269846599741507, + "learning_rate": 9.962013083978773e-06, + "loss": 0.529, + "step": 2348 + }, + { + "epoch": 0.07, + "grad_norm": 1.975446326111129, + "learning_rate": 9.961955272681689e-06, + "loss": 0.487, + "step": 2349 + }, + { + "epoch": 0.07, + "grad_norm": 1.7443348929300244, + "learning_rate": 9.961897417595297e-06, + "loss": 0.4494, + "step": 2350 + }, + { + "epoch": 0.07, + "grad_norm": 1.6218754849950878, + "learning_rate": 9.961839518720104e-06, + "loss": 0.4709, + "step": 2351 + }, + { + "epoch": 0.07, + "grad_norm": 1.6751315893114684, + "learning_rate": 9.961781576056621e-06, + "loss": 0.4462, + "step": 2352 + }, + { + "epoch": 0.07, + "grad_norm": 1.637662443384434, + "learning_rate": 9.961723589605363e-06, + "loss": 0.443, + "step": 2353 + }, + { + "epoch": 0.07, + "grad_norm": 1.7865944242429663, + "learning_rate": 9.961665559366837e-06, + "loss": 0.431, + "step": 2354 + }, + { + "epoch": 0.07, + "grad_norm": 1.8819447454759868, + "learning_rate": 9.96160748534156e-06, + "loss": 0.4457, + "step": 2355 + }, + { + "epoch": 0.07, + "grad_norm": 1.2758891670506396, + "learning_rate": 9.96154936753004e-06, + "loss": 0.5903, + "step": 2356 + }, + { + "epoch": 0.07, + "grad_norm": 1.8180776911505925, + "learning_rate": 9.961491205932791e-06, + "loss": 0.4575, + "step": 2357 + }, + { + "epoch": 0.07, + "grad_norm": 1.8703821453623286, + "learning_rate": 9.961433000550329e-06, + "loss": 0.482, + "step": 2358 + }, + { + "epoch": 0.07, + "grad_norm": 1.7352701351070399, + "learning_rate": 9.961374751383166e-06, + "loss": 0.4808, + "step": 2359 + }, + { + "epoch": 0.07, + "grad_norm": 1.6357564900376562, + "learning_rate": 9.961316458431813e-06, + "loss": 0.4502, + "step": 2360 + }, + { + "epoch": 0.07, + "grad_norm": 2.5305061408908087, + "learning_rate": 9.96125812169679e-06, + "loss": 0.4938, + "step": 2361 + }, + { + "epoch": 0.07, + "grad_norm": 1.7393646107639795, + "learning_rate": 9.961199741178609e-06, + "loss": 0.4831, + "step": 2362 + }, + { + "epoch": 0.07, + "grad_norm": 1.7641654470339487, + "learning_rate": 9.961141316877783e-06, + "loss": 0.4479, + "step": 2363 + }, + { + "epoch": 0.07, + "grad_norm": 1.6941569550377098, + "learning_rate": 9.961082848794833e-06, + "loss": 0.477, + "step": 2364 + }, + { + "epoch": 0.07, + "grad_norm": 1.7685884545694182, + "learning_rate": 9.961024336930268e-06, + "loss": 0.4613, + "step": 2365 + }, + { + "epoch": 0.07, + "grad_norm": 1.728664717872347, + "learning_rate": 9.960965781284611e-06, + "loss": 0.46, + "step": 2366 + }, + { + "epoch": 0.07, + "grad_norm": 1.684365021178453, + "learning_rate": 9.960907181858374e-06, + "loss": 0.4444, + "step": 2367 + }, + { + "epoch": 0.07, + "grad_norm": 1.6755226833534544, + "learning_rate": 9.960848538652077e-06, + "loss": 0.4385, + "step": 2368 + }, + { + "epoch": 0.07, + "grad_norm": 1.9037430928082641, + "learning_rate": 9.960789851666237e-06, + "loss": 0.4701, + "step": 2369 + }, + { + "epoch": 0.07, + "grad_norm": 1.5955885293737702, + "learning_rate": 9.960731120901372e-06, + "loss": 0.4742, + "step": 2370 + }, + { + "epoch": 0.07, + "grad_norm": 1.6842684567324475, + "learning_rate": 9.960672346357999e-06, + "loss": 0.4539, + "step": 2371 + }, + { + "epoch": 0.07, + "grad_norm": 1.8501455625639218, + "learning_rate": 9.960613528036637e-06, + "loss": 0.4579, + "step": 2372 + }, + { + "epoch": 0.07, + "grad_norm": 2.1659028752736034, + "learning_rate": 9.960554665937808e-06, + "loss": 0.4435, + "step": 2373 + }, + { + "epoch": 0.07, + "grad_norm": 1.9179927745069887, + "learning_rate": 9.960495760062025e-06, + "loss": 0.4693, + "step": 2374 + }, + { + "epoch": 0.07, + "grad_norm": 1.6761443908031173, + "learning_rate": 9.960436810409815e-06, + "loss": 0.4848, + "step": 2375 + }, + { + "epoch": 0.07, + "grad_norm": 1.6183739472972156, + "learning_rate": 9.960377816981695e-06, + "loss": 0.472, + "step": 2376 + }, + { + "epoch": 0.07, + "grad_norm": 1.721832806067122, + "learning_rate": 9.960318779778183e-06, + "loss": 0.4355, + "step": 2377 + }, + { + "epoch": 0.07, + "grad_norm": 1.130368110888746, + "learning_rate": 9.960259698799806e-06, + "loss": 0.5784, + "step": 2378 + }, + { + "epoch": 0.07, + "grad_norm": 1.8367055004727033, + "learning_rate": 9.960200574047081e-06, + "loss": 0.4876, + "step": 2379 + }, + { + "epoch": 0.07, + "grad_norm": 1.947292326366326, + "learning_rate": 9.960141405520532e-06, + "loss": 0.5291, + "step": 2380 + }, + { + "epoch": 0.07, + "grad_norm": 2.2232980108440303, + "learning_rate": 9.960082193220677e-06, + "loss": 0.4795, + "step": 2381 + }, + { + "epoch": 0.07, + "grad_norm": 1.831469519706768, + "learning_rate": 9.960022937148045e-06, + "loss": 0.4587, + "step": 2382 + }, + { + "epoch": 0.07, + "grad_norm": 2.0646958996580826, + "learning_rate": 9.959963637303154e-06, + "loss": 0.4798, + "step": 2383 + }, + { + "epoch": 0.07, + "grad_norm": 1.7065877085357206, + "learning_rate": 9.959904293686528e-06, + "loss": 0.469, + "step": 2384 + }, + { + "epoch": 0.07, + "grad_norm": 1.7498675971634694, + "learning_rate": 9.959844906298693e-06, + "loss": 0.4558, + "step": 2385 + }, + { + "epoch": 0.07, + "grad_norm": 3.354143876573392, + "learning_rate": 9.959785475140172e-06, + "loss": 0.4718, + "step": 2386 + }, + { + "epoch": 0.07, + "grad_norm": 1.8823574877388407, + "learning_rate": 9.959726000211487e-06, + "loss": 0.4635, + "step": 2387 + }, + { + "epoch": 0.07, + "grad_norm": 1.9453194518567138, + "learning_rate": 9.959666481513168e-06, + "loss": 0.4945, + "step": 2388 + }, + { + "epoch": 0.07, + "grad_norm": 1.6001068894414237, + "learning_rate": 9.959606919045735e-06, + "loss": 0.4517, + "step": 2389 + }, + { + "epoch": 0.07, + "grad_norm": 1.6418022021038576, + "learning_rate": 9.959547312809717e-06, + "loss": 0.4624, + "step": 2390 + }, + { + "epoch": 0.07, + "grad_norm": 1.9010900685947174, + "learning_rate": 9.959487662805639e-06, + "loss": 0.5171, + "step": 2391 + }, + { + "epoch": 0.07, + "grad_norm": 1.8849538307011071, + "learning_rate": 9.959427969034025e-06, + "loss": 0.435, + "step": 2392 + }, + { + "epoch": 0.07, + "grad_norm": 1.7177171173357546, + "learning_rate": 9.959368231495408e-06, + "loss": 0.4965, + "step": 2393 + }, + { + "epoch": 0.07, + "grad_norm": 1.6318921483661062, + "learning_rate": 9.959308450190308e-06, + "loss": 0.4377, + "step": 2394 + }, + { + "epoch": 0.07, + "grad_norm": 2.149605397786968, + "learning_rate": 9.959248625119257e-06, + "loss": 0.5059, + "step": 2395 + }, + { + "epoch": 0.07, + "grad_norm": 1.7745332043436912, + "learning_rate": 9.959188756282783e-06, + "loss": 0.5015, + "step": 2396 + }, + { + "epoch": 0.07, + "grad_norm": 1.8831444002294355, + "learning_rate": 9.959128843681412e-06, + "loss": 0.4516, + "step": 2397 + }, + { + "epoch": 0.07, + "grad_norm": 1.6458376156902637, + "learning_rate": 9.959068887315674e-06, + "loss": 0.427, + "step": 2398 + }, + { + "epoch": 0.07, + "grad_norm": 1.7670844405749335, + "learning_rate": 9.9590088871861e-06, + "loss": 0.4829, + "step": 2399 + }, + { + "epoch": 0.07, + "grad_norm": 1.8254136703213388, + "learning_rate": 9.958948843293214e-06, + "loss": 0.4768, + "step": 2400 + }, + { + "epoch": 0.07, + "grad_norm": 1.665133998320324, + "learning_rate": 9.958888755637553e-06, + "loss": 0.4801, + "step": 2401 + }, + { + "epoch": 0.07, + "grad_norm": 2.479276909434107, + "learning_rate": 9.958828624219641e-06, + "loss": 0.4883, + "step": 2402 + }, + { + "epoch": 0.07, + "grad_norm": 1.6755608711123602, + "learning_rate": 9.958768449040013e-06, + "loss": 0.4775, + "step": 2403 + }, + { + "epoch": 0.07, + "grad_norm": 1.8091115034945366, + "learning_rate": 9.958708230099197e-06, + "loss": 0.4848, + "step": 2404 + }, + { + "epoch": 0.07, + "grad_norm": 1.8847007834486054, + "learning_rate": 9.958647967397728e-06, + "loss": 0.456, + "step": 2405 + }, + { + "epoch": 0.07, + "grad_norm": 2.0186533140673975, + "learning_rate": 9.958587660936134e-06, + "loss": 0.4428, + "step": 2406 + }, + { + "epoch": 0.07, + "grad_norm": 1.6379137021352925, + "learning_rate": 9.95852731071495e-06, + "loss": 0.4526, + "step": 2407 + }, + { + "epoch": 0.07, + "grad_norm": 1.7953021564042333, + "learning_rate": 9.958466916734706e-06, + "loss": 0.4581, + "step": 2408 + }, + { + "epoch": 0.07, + "grad_norm": 1.693650347337165, + "learning_rate": 9.958406478995938e-06, + "loss": 0.4771, + "step": 2409 + }, + { + "epoch": 0.07, + "grad_norm": 1.7174543401108515, + "learning_rate": 9.958345997499178e-06, + "loss": 0.4649, + "step": 2410 + }, + { + "epoch": 0.07, + "grad_norm": 1.6591587633234737, + "learning_rate": 9.958285472244958e-06, + "loss": 0.4363, + "step": 2411 + }, + { + "epoch": 0.07, + "grad_norm": 1.5521076166619336, + "learning_rate": 9.958224903233813e-06, + "loss": 0.4299, + "step": 2412 + }, + { + "epoch": 0.07, + "grad_norm": 1.5416591176772207, + "learning_rate": 9.95816429046628e-06, + "loss": 0.4451, + "step": 2413 + }, + { + "epoch": 0.07, + "grad_norm": 1.8186168254827308, + "learning_rate": 9.958103633942892e-06, + "loss": 0.4512, + "step": 2414 + }, + { + "epoch": 0.07, + "grad_norm": 1.8536745888849233, + "learning_rate": 9.958042933664186e-06, + "loss": 0.5253, + "step": 2415 + }, + { + "epoch": 0.07, + "grad_norm": 1.6142524837734535, + "learning_rate": 9.957982189630693e-06, + "loss": 0.4518, + "step": 2416 + }, + { + "epoch": 0.07, + "grad_norm": 2.421705730248931, + "learning_rate": 9.957921401842953e-06, + "loss": 0.4683, + "step": 2417 + }, + { + "epoch": 0.07, + "grad_norm": 1.63005443556491, + "learning_rate": 9.957860570301502e-06, + "loss": 0.4314, + "step": 2418 + }, + { + "epoch": 0.07, + "grad_norm": 1.9821032526424158, + "learning_rate": 9.957799695006876e-06, + "loss": 0.505, + "step": 2419 + }, + { + "epoch": 0.07, + "grad_norm": 2.7033495049917358, + "learning_rate": 9.957738775959614e-06, + "loss": 0.4677, + "step": 2420 + }, + { + "epoch": 0.07, + "grad_norm": 1.628087078645812, + "learning_rate": 9.957677813160252e-06, + "loss": 0.4436, + "step": 2421 + }, + { + "epoch": 0.07, + "grad_norm": 1.5140256077014513, + "learning_rate": 9.957616806609326e-06, + "loss": 0.4471, + "step": 2422 + }, + { + "epoch": 0.07, + "grad_norm": 1.5629226949301842, + "learning_rate": 9.95755575630738e-06, + "loss": 0.4657, + "step": 2423 + }, + { + "epoch": 0.07, + "grad_norm": 1.5724454762744797, + "learning_rate": 9.957494662254947e-06, + "loss": 0.4732, + "step": 2424 + }, + { + "epoch": 0.07, + "grad_norm": 1.6115304909808277, + "learning_rate": 9.95743352445257e-06, + "loss": 0.4636, + "step": 2425 + }, + { + "epoch": 0.07, + "grad_norm": 1.7724809981666874, + "learning_rate": 9.957372342900786e-06, + "loss": 0.5155, + "step": 2426 + }, + { + "epoch": 0.07, + "grad_norm": 1.600006058787074, + "learning_rate": 9.957311117600137e-06, + "loss": 0.4884, + "step": 2427 + }, + { + "epoch": 0.07, + "grad_norm": 1.5834650173632256, + "learning_rate": 9.957249848551163e-06, + "loss": 0.4352, + "step": 2428 + }, + { + "epoch": 0.07, + "grad_norm": 1.6699308679952822, + "learning_rate": 9.957188535754403e-06, + "loss": 0.4752, + "step": 2429 + }, + { + "epoch": 0.07, + "grad_norm": 1.744748002592448, + "learning_rate": 9.9571271792104e-06, + "loss": 0.4758, + "step": 2430 + }, + { + "epoch": 0.07, + "grad_norm": 1.6418167922240536, + "learning_rate": 9.957065778919693e-06, + "loss": 0.4375, + "step": 2431 + }, + { + "epoch": 0.07, + "grad_norm": 1.7381539804653379, + "learning_rate": 9.957004334882826e-06, + "loss": 0.4659, + "step": 2432 + }, + { + "epoch": 0.07, + "grad_norm": 4.643513430837416, + "learning_rate": 9.956942847100343e-06, + "loss": 0.4401, + "step": 2433 + }, + { + "epoch": 0.07, + "grad_norm": 2.048619092209772, + "learning_rate": 9.956881315572781e-06, + "loss": 0.4351, + "step": 2434 + }, + { + "epoch": 0.07, + "grad_norm": 1.673965513375754, + "learning_rate": 9.95681974030069e-06, + "loss": 0.5035, + "step": 2435 + }, + { + "epoch": 0.07, + "grad_norm": 1.7257019386756642, + "learning_rate": 9.956758121284607e-06, + "loss": 0.4467, + "step": 2436 + }, + { + "epoch": 0.07, + "grad_norm": 2.202366662254736, + "learning_rate": 9.95669645852508e-06, + "loss": 0.4639, + "step": 2437 + }, + { + "epoch": 0.07, + "grad_norm": 1.2489927556534242, + "learning_rate": 9.956634752022651e-06, + "loss": 0.5688, + "step": 2438 + }, + { + "epoch": 0.07, + "grad_norm": 1.7583695918580455, + "learning_rate": 9.956573001777866e-06, + "loss": 0.4103, + "step": 2439 + }, + { + "epoch": 0.07, + "grad_norm": 2.3324300579101367, + "learning_rate": 9.956511207791269e-06, + "loss": 0.4143, + "step": 2440 + }, + { + "epoch": 0.07, + "grad_norm": 1.6273336116279968, + "learning_rate": 9.956449370063405e-06, + "loss": 0.4585, + "step": 2441 + }, + { + "epoch": 0.07, + "grad_norm": 1.7151784102146246, + "learning_rate": 9.956387488594821e-06, + "loss": 0.4483, + "step": 2442 + }, + { + "epoch": 0.07, + "grad_norm": 2.0517156637483045, + "learning_rate": 9.956325563386062e-06, + "loss": 0.4345, + "step": 2443 + }, + { + "epoch": 0.07, + "grad_norm": 1.678309876630722, + "learning_rate": 9.956263594437677e-06, + "loss": 0.4563, + "step": 2444 + }, + { + "epoch": 0.07, + "grad_norm": 1.6839169896580095, + "learning_rate": 9.956201581750208e-06, + "loss": 0.4656, + "step": 2445 + }, + { + "epoch": 0.07, + "grad_norm": 1.8116513596656638, + "learning_rate": 9.956139525324207e-06, + "loss": 0.4841, + "step": 2446 + }, + { + "epoch": 0.07, + "grad_norm": 1.7693003794882196, + "learning_rate": 9.956077425160217e-06, + "loss": 0.4625, + "step": 2447 + }, + { + "epoch": 0.07, + "grad_norm": 1.6250394340210161, + "learning_rate": 9.956015281258791e-06, + "loss": 0.4707, + "step": 2448 + }, + { + "epoch": 0.07, + "grad_norm": 1.630612824672926, + "learning_rate": 9.955953093620475e-06, + "loss": 0.4296, + "step": 2449 + }, + { + "epoch": 0.07, + "grad_norm": 1.733381430824003, + "learning_rate": 9.955890862245818e-06, + "loss": 0.4939, + "step": 2450 + }, + { + "epoch": 0.07, + "grad_norm": 1.5639167883834528, + "learning_rate": 9.955828587135369e-06, + "loss": 0.4318, + "step": 2451 + }, + { + "epoch": 0.07, + "grad_norm": 1.5953339823099097, + "learning_rate": 9.955766268289677e-06, + "loss": 0.4917, + "step": 2452 + }, + { + "epoch": 0.07, + "grad_norm": 1.1079494610896852, + "learning_rate": 9.955703905709293e-06, + "loss": 0.6007, + "step": 2453 + }, + { + "epoch": 0.07, + "grad_norm": 4.391244315700422, + "learning_rate": 9.955641499394768e-06, + "loss": 0.4544, + "step": 2454 + }, + { + "epoch": 0.07, + "grad_norm": 1.8348452057442293, + "learning_rate": 9.955579049346651e-06, + "loss": 0.4559, + "step": 2455 + }, + { + "epoch": 0.07, + "grad_norm": 1.663376154985515, + "learning_rate": 9.955516555565493e-06, + "loss": 0.4333, + "step": 2456 + }, + { + "epoch": 0.07, + "grad_norm": 1.5944707902981448, + "learning_rate": 9.955454018051848e-06, + "loss": 0.4561, + "step": 2457 + }, + { + "epoch": 0.07, + "grad_norm": 1.6325981451779452, + "learning_rate": 9.955391436806265e-06, + "loss": 0.4507, + "step": 2458 + }, + { + "epoch": 0.07, + "grad_norm": 1.824740950933046, + "learning_rate": 9.955328811829298e-06, + "loss": 0.4444, + "step": 2459 + }, + { + "epoch": 0.07, + "grad_norm": 1.5088245956629556, + "learning_rate": 9.9552661431215e-06, + "loss": 0.4745, + "step": 2460 + }, + { + "epoch": 0.07, + "grad_norm": 1.642269628348713, + "learning_rate": 9.955203430683425e-06, + "loss": 0.4264, + "step": 2461 + }, + { + "epoch": 0.07, + "grad_norm": 1.579100272285132, + "learning_rate": 9.955140674515623e-06, + "loss": 0.4492, + "step": 2462 + }, + { + "epoch": 0.07, + "grad_norm": 1.7888293360860188, + "learning_rate": 9.955077874618648e-06, + "loss": 0.4891, + "step": 2463 + }, + { + "epoch": 0.07, + "grad_norm": 1.1590591294508532, + "learning_rate": 9.955015030993057e-06, + "loss": 0.6108, + "step": 2464 + }, + { + "epoch": 0.07, + "grad_norm": 1.6962548190604843, + "learning_rate": 9.954952143639404e-06, + "loss": 0.4499, + "step": 2465 + }, + { + "epoch": 0.07, + "grad_norm": 1.793742195619831, + "learning_rate": 9.954889212558243e-06, + "loss": 0.4558, + "step": 2466 + }, + { + "epoch": 0.07, + "grad_norm": 1.7689316010938576, + "learning_rate": 9.95482623775013e-06, + "loss": 0.4859, + "step": 2467 + }, + { + "epoch": 0.07, + "grad_norm": 1.5855450227935632, + "learning_rate": 9.95476321921562e-06, + "loss": 0.4308, + "step": 2468 + }, + { + "epoch": 0.07, + "grad_norm": 1.9814683373338933, + "learning_rate": 9.954700156955268e-06, + "loss": 0.4119, + "step": 2469 + }, + { + "epoch": 0.07, + "grad_norm": 1.853223980545298, + "learning_rate": 9.954637050969634e-06, + "loss": 0.4723, + "step": 2470 + }, + { + "epoch": 0.07, + "grad_norm": 1.5870894541154215, + "learning_rate": 9.954573901259274e-06, + "loss": 0.4179, + "step": 2471 + }, + { + "epoch": 0.07, + "grad_norm": 1.5699154266073265, + "learning_rate": 9.954510707824743e-06, + "loss": 0.4422, + "step": 2472 + }, + { + "epoch": 0.07, + "grad_norm": 1.8351588635358345, + "learning_rate": 9.9544474706666e-06, + "loss": 0.4409, + "step": 2473 + }, + { + "epoch": 0.07, + "grad_norm": 1.6386312892587873, + "learning_rate": 9.954384189785403e-06, + "loss": 0.4832, + "step": 2474 + }, + { + "epoch": 0.07, + "grad_norm": 1.044736948442105, + "learning_rate": 9.954320865181711e-06, + "loss": 0.5997, + "step": 2475 + }, + { + "epoch": 0.07, + "grad_norm": 1.7046787353327064, + "learning_rate": 9.954257496856082e-06, + "loss": 0.4473, + "step": 2476 + }, + { + "epoch": 0.07, + "grad_norm": 1.7038011295831892, + "learning_rate": 9.954194084809076e-06, + "loss": 0.4852, + "step": 2477 + }, + { + "epoch": 0.07, + "grad_norm": 1.7505527306906, + "learning_rate": 9.954130629041252e-06, + "loss": 0.4849, + "step": 2478 + }, + { + "epoch": 0.07, + "grad_norm": 1.8147402939738637, + "learning_rate": 9.95406712955317e-06, + "loss": 0.4398, + "step": 2479 + }, + { + "epoch": 0.07, + "grad_norm": 1.7174917253263908, + "learning_rate": 9.954003586345393e-06, + "loss": 0.4319, + "step": 2480 + }, + { + "epoch": 0.07, + "grad_norm": 1.6632327874741142, + "learning_rate": 9.953939999418477e-06, + "loss": 0.4743, + "step": 2481 + }, + { + "epoch": 0.07, + "grad_norm": 1.7510593781037163, + "learning_rate": 9.953876368772986e-06, + "loss": 0.4919, + "step": 2482 + }, + { + "epoch": 0.07, + "grad_norm": 1.5977205302361044, + "learning_rate": 9.953812694409481e-06, + "loss": 0.4085, + "step": 2483 + }, + { + "epoch": 0.07, + "grad_norm": 1.7589234914495622, + "learning_rate": 9.953748976328524e-06, + "loss": 0.4536, + "step": 2484 + }, + { + "epoch": 0.07, + "grad_norm": 1.5465498311917727, + "learning_rate": 9.953685214530677e-06, + "loss": 0.4636, + "step": 2485 + }, + { + "epoch": 0.07, + "grad_norm": 1.6010319383845222, + "learning_rate": 9.953621409016504e-06, + "loss": 0.4612, + "step": 2486 + }, + { + "epoch": 0.07, + "grad_norm": 2.0471241803550604, + "learning_rate": 9.953557559786566e-06, + "loss": 0.4919, + "step": 2487 + }, + { + "epoch": 0.07, + "grad_norm": 1.5838663102790873, + "learning_rate": 9.95349366684143e-06, + "loss": 0.4187, + "step": 2488 + }, + { + "epoch": 0.07, + "grad_norm": 1.7151529547527709, + "learning_rate": 9.953429730181653e-06, + "loss": 0.429, + "step": 2489 + }, + { + "epoch": 0.07, + "grad_norm": 1.9055148201047591, + "learning_rate": 9.953365749807808e-06, + "loss": 0.4416, + "step": 2490 + }, + { + "epoch": 0.07, + "grad_norm": 1.6464843888961427, + "learning_rate": 9.953301725720453e-06, + "loss": 0.4593, + "step": 2491 + }, + { + "epoch": 0.07, + "grad_norm": 1.9048954223612828, + "learning_rate": 9.953237657920156e-06, + "loss": 0.4782, + "step": 2492 + }, + { + "epoch": 0.07, + "grad_norm": 1.724859009493488, + "learning_rate": 9.953173546407482e-06, + "loss": 0.4699, + "step": 2493 + }, + { + "epoch": 0.07, + "grad_norm": 1.8724400096343143, + "learning_rate": 9.953109391182996e-06, + "loss": 0.5309, + "step": 2494 + }, + { + "epoch": 0.07, + "grad_norm": 1.7502446432710037, + "learning_rate": 9.953045192247266e-06, + "loss": 0.446, + "step": 2495 + }, + { + "epoch": 0.07, + "grad_norm": 2.400800659866155, + "learning_rate": 9.952980949600854e-06, + "loss": 0.487, + "step": 2496 + }, + { + "epoch": 0.07, + "grad_norm": 1.5731535357481463, + "learning_rate": 9.952916663244334e-06, + "loss": 0.4173, + "step": 2497 + }, + { + "epoch": 0.07, + "grad_norm": 2.0471341267558505, + "learning_rate": 9.952852333178266e-06, + "loss": 0.4606, + "step": 2498 + }, + { + "epoch": 0.07, + "grad_norm": 1.788021344830808, + "learning_rate": 9.952787959403223e-06, + "loss": 0.4318, + "step": 2499 + }, + { + "epoch": 0.07, + "grad_norm": 1.7881169681463398, + "learning_rate": 9.952723541919774e-06, + "loss": 0.4948, + "step": 2500 + }, + { + "epoch": 0.07, + "grad_norm": 1.7750704914597248, + "learning_rate": 9.95265908072848e-06, + "loss": 0.457, + "step": 2501 + }, + { + "epoch": 0.07, + "grad_norm": 1.5836994007049379, + "learning_rate": 9.952594575829917e-06, + "loss": 0.4405, + "step": 2502 + }, + { + "epoch": 0.07, + "grad_norm": 1.6578190227110081, + "learning_rate": 9.952530027224652e-06, + "loss": 0.4348, + "step": 2503 + }, + { + "epoch": 0.07, + "grad_norm": 1.0637005197103735, + "learning_rate": 9.952465434913254e-06, + "loss": 0.5946, + "step": 2504 + }, + { + "epoch": 0.07, + "grad_norm": 1.802906382188784, + "learning_rate": 9.952400798896293e-06, + "loss": 0.4921, + "step": 2505 + }, + { + "epoch": 0.07, + "grad_norm": 1.0302294750984924, + "learning_rate": 9.952336119174342e-06, + "loss": 0.5146, + "step": 2506 + }, + { + "epoch": 0.07, + "grad_norm": 1.6953722035055583, + "learning_rate": 9.952271395747969e-06, + "loss": 0.4429, + "step": 2507 + }, + { + "epoch": 0.07, + "grad_norm": 1.163414079097598, + "learning_rate": 9.952206628617745e-06, + "loss": 0.5447, + "step": 2508 + }, + { + "epoch": 0.07, + "grad_norm": 1.0349968145492499, + "learning_rate": 9.952141817784244e-06, + "loss": 0.5833, + "step": 2509 + }, + { + "epoch": 0.07, + "grad_norm": 1.6819036745748686, + "learning_rate": 9.952076963248034e-06, + "loss": 0.4638, + "step": 2510 + }, + { + "epoch": 0.07, + "grad_norm": 1.8254319276884405, + "learning_rate": 9.952012065009692e-06, + "loss": 0.5217, + "step": 2511 + }, + { + "epoch": 0.07, + "grad_norm": 1.6903838091071948, + "learning_rate": 9.95194712306979e-06, + "loss": 0.4625, + "step": 2512 + }, + { + "epoch": 0.07, + "grad_norm": 1.7069226276346177, + "learning_rate": 9.951882137428896e-06, + "loss": 0.4516, + "step": 2513 + }, + { + "epoch": 0.07, + "grad_norm": 1.66069925432362, + "learning_rate": 9.95181710808759e-06, + "loss": 0.4351, + "step": 2514 + }, + { + "epoch": 0.07, + "grad_norm": 1.8283294765695792, + "learning_rate": 9.951752035046443e-06, + "loss": 0.5036, + "step": 2515 + }, + { + "epoch": 0.07, + "grad_norm": 1.5343745273283733, + "learning_rate": 9.951686918306026e-06, + "loss": 0.4203, + "step": 2516 + }, + { + "epoch": 0.07, + "grad_norm": 1.6444505830746097, + "learning_rate": 9.951621757866921e-06, + "loss": 0.449, + "step": 2517 + }, + { + "epoch": 0.07, + "grad_norm": 1.966895083963196, + "learning_rate": 9.951556553729698e-06, + "loss": 0.4573, + "step": 2518 + }, + { + "epoch": 0.07, + "grad_norm": 1.6283717610668622, + "learning_rate": 9.951491305894934e-06, + "loss": 0.4326, + "step": 2519 + }, + { + "epoch": 0.07, + "grad_norm": 1.7948369889312945, + "learning_rate": 9.951426014363202e-06, + "loss": 0.4676, + "step": 2520 + }, + { + "epoch": 0.07, + "grad_norm": 1.95597936931991, + "learning_rate": 9.951360679135083e-06, + "loss": 0.4615, + "step": 2521 + }, + { + "epoch": 0.07, + "grad_norm": 1.5840060965883964, + "learning_rate": 9.95129530021115e-06, + "loss": 0.4427, + "step": 2522 + }, + { + "epoch": 0.07, + "grad_norm": 1.5586497385998748, + "learning_rate": 9.951229877591982e-06, + "loss": 0.4606, + "step": 2523 + }, + { + "epoch": 0.07, + "grad_norm": 1.5570702535599938, + "learning_rate": 9.951164411278154e-06, + "loss": 0.4427, + "step": 2524 + }, + { + "epoch": 0.07, + "grad_norm": 1.7112898770971277, + "learning_rate": 9.951098901270244e-06, + "loss": 0.5306, + "step": 2525 + }, + { + "epoch": 0.07, + "grad_norm": 1.6588462392577132, + "learning_rate": 9.951033347568833e-06, + "loss": 0.4506, + "step": 2526 + }, + { + "epoch": 0.07, + "grad_norm": 1.1917601991936873, + "learning_rate": 9.9509677501745e-06, + "loss": 0.5598, + "step": 2527 + }, + { + "epoch": 0.07, + "grad_norm": 1.8100321033470133, + "learning_rate": 9.950902109087817e-06, + "loss": 0.4598, + "step": 2528 + }, + { + "epoch": 0.07, + "grad_norm": 1.6967026396828622, + "learning_rate": 9.95083642430937e-06, + "loss": 0.4438, + "step": 2529 + }, + { + "epoch": 0.07, + "grad_norm": 1.8272246108109829, + "learning_rate": 9.950770695839737e-06, + "loss": 0.4292, + "step": 2530 + }, + { + "epoch": 0.07, + "grad_norm": 1.5461509070705373, + "learning_rate": 9.950704923679499e-06, + "loss": 0.4605, + "step": 2531 + }, + { + "epoch": 0.07, + "grad_norm": 1.0896069575092688, + "learning_rate": 9.950639107829233e-06, + "loss": 0.6366, + "step": 2532 + }, + { + "epoch": 0.07, + "grad_norm": 1.719440594544005, + "learning_rate": 9.950573248289523e-06, + "loss": 0.505, + "step": 2533 + }, + { + "epoch": 0.07, + "grad_norm": 1.726985685142886, + "learning_rate": 9.95050734506095e-06, + "loss": 0.4748, + "step": 2534 + }, + { + "epoch": 0.07, + "grad_norm": 1.7397903187392743, + "learning_rate": 9.950441398144092e-06, + "loss": 0.4778, + "step": 2535 + }, + { + "epoch": 0.07, + "grad_norm": 1.6221665287360916, + "learning_rate": 9.950375407539535e-06, + "loss": 0.4428, + "step": 2536 + }, + { + "epoch": 0.07, + "grad_norm": 1.7329009872773606, + "learning_rate": 9.950309373247862e-06, + "loss": 0.4457, + "step": 2537 + }, + { + "epoch": 0.07, + "grad_norm": 1.8648517305325258, + "learning_rate": 9.950243295269652e-06, + "loss": 0.4699, + "step": 2538 + }, + { + "epoch": 0.07, + "grad_norm": 1.9307311751075698, + "learning_rate": 9.95017717360549e-06, + "loss": 0.471, + "step": 2539 + }, + { + "epoch": 0.07, + "grad_norm": 1.747271566308291, + "learning_rate": 9.950111008255959e-06, + "loss": 0.4795, + "step": 2540 + }, + { + "epoch": 0.07, + "grad_norm": 1.5786847561737753, + "learning_rate": 9.950044799221646e-06, + "loss": 0.436, + "step": 2541 + }, + { + "epoch": 0.07, + "grad_norm": 1.6972972336071055, + "learning_rate": 9.94997854650313e-06, + "loss": 0.4272, + "step": 2542 + }, + { + "epoch": 0.07, + "grad_norm": 1.711992787723215, + "learning_rate": 9.949912250101e-06, + "loss": 0.4779, + "step": 2543 + }, + { + "epoch": 0.07, + "grad_norm": 1.5392362770023078, + "learning_rate": 9.949845910015838e-06, + "loss": 0.4586, + "step": 2544 + }, + { + "epoch": 0.07, + "grad_norm": 1.637883626001432, + "learning_rate": 9.949779526248233e-06, + "loss": 0.4515, + "step": 2545 + }, + { + "epoch": 0.07, + "grad_norm": 1.6623784161632176, + "learning_rate": 9.949713098798766e-06, + "loss": 0.4535, + "step": 2546 + }, + { + "epoch": 0.07, + "grad_norm": 1.8733757852075559, + "learning_rate": 9.949646627668029e-06, + "loss": 0.4826, + "step": 2547 + }, + { + "epoch": 0.07, + "grad_norm": 1.8939116107051543, + "learning_rate": 9.949580112856604e-06, + "loss": 0.532, + "step": 2548 + }, + { + "epoch": 0.07, + "grad_norm": 1.6994041687562758, + "learning_rate": 9.94951355436508e-06, + "loss": 0.5014, + "step": 2549 + }, + { + "epoch": 0.07, + "grad_norm": 2.340629695694108, + "learning_rate": 9.949446952194043e-06, + "loss": 0.4449, + "step": 2550 + }, + { + "epoch": 0.07, + "grad_norm": 1.6490405329207078, + "learning_rate": 9.949380306344084e-06, + "loss": 0.4698, + "step": 2551 + }, + { + "epoch": 0.07, + "grad_norm": 1.9103373512228121, + "learning_rate": 9.949313616815786e-06, + "loss": 0.4643, + "step": 2552 + }, + { + "epoch": 0.07, + "grad_norm": 1.6809531485576492, + "learning_rate": 9.949246883609743e-06, + "loss": 0.4716, + "step": 2553 + }, + { + "epoch": 0.07, + "grad_norm": 2.0206906933713964, + "learning_rate": 9.949180106726539e-06, + "loss": 0.4591, + "step": 2554 + }, + { + "epoch": 0.07, + "grad_norm": 1.7689720091745096, + "learning_rate": 9.949113286166768e-06, + "loss": 0.4454, + "step": 2555 + }, + { + "epoch": 0.07, + "grad_norm": 2.4331789632580874, + "learning_rate": 9.949046421931016e-06, + "loss": 0.4243, + "step": 2556 + }, + { + "epoch": 0.07, + "grad_norm": 1.9831769445335021, + "learning_rate": 9.948979514019874e-06, + "loss": 0.4841, + "step": 2557 + }, + { + "epoch": 0.07, + "grad_norm": 1.7848146819420077, + "learning_rate": 9.948912562433934e-06, + "loss": 0.4623, + "step": 2558 + }, + { + "epoch": 0.07, + "grad_norm": 1.836096056857154, + "learning_rate": 9.948845567173785e-06, + "loss": 0.491, + "step": 2559 + }, + { + "epoch": 0.07, + "grad_norm": 1.6501143102476057, + "learning_rate": 9.948778528240019e-06, + "loss": 0.4515, + "step": 2560 + }, + { + "epoch": 0.07, + "grad_norm": 1.7121946577310019, + "learning_rate": 9.948711445633227e-06, + "loss": 0.5583, + "step": 2561 + }, + { + "epoch": 0.07, + "grad_norm": 1.6868464846118414, + "learning_rate": 9.948644319354003e-06, + "loss": 0.4094, + "step": 2562 + }, + { + "epoch": 0.07, + "grad_norm": 1.5987387894987861, + "learning_rate": 9.948577149402938e-06, + "loss": 0.4945, + "step": 2563 + }, + { + "epoch": 0.07, + "grad_norm": 1.897207423506197, + "learning_rate": 9.948509935780624e-06, + "loss": 0.4914, + "step": 2564 + }, + { + "epoch": 0.07, + "grad_norm": 1.9412812883300326, + "learning_rate": 9.948442678487656e-06, + "loss": 0.4432, + "step": 2565 + }, + { + "epoch": 0.07, + "grad_norm": 1.5136899031525635, + "learning_rate": 9.948375377524624e-06, + "loss": 0.4123, + "step": 2566 + }, + { + "epoch": 0.07, + "grad_norm": 2.0630168977642604, + "learning_rate": 9.948308032892127e-06, + "loss": 0.4123, + "step": 2567 + }, + { + "epoch": 0.07, + "grad_norm": 1.872672268619345, + "learning_rate": 9.948240644590755e-06, + "loss": 0.4591, + "step": 2568 + }, + { + "epoch": 0.07, + "grad_norm": 1.1022808109140712, + "learning_rate": 9.948173212621106e-06, + "loss": 0.6193, + "step": 2569 + }, + { + "epoch": 0.07, + "grad_norm": 1.5638541168439597, + "learning_rate": 9.948105736983771e-06, + "loss": 0.4261, + "step": 2570 + }, + { + "epoch": 0.07, + "grad_norm": 1.6323775539049028, + "learning_rate": 9.94803821767935e-06, + "loss": 0.4599, + "step": 2571 + }, + { + "epoch": 0.07, + "grad_norm": 2.285504806902205, + "learning_rate": 9.947970654708436e-06, + "loss": 0.5131, + "step": 2572 + }, + { + "epoch": 0.07, + "grad_norm": 1.6076286661263, + "learning_rate": 9.947903048071627e-06, + "loss": 0.4244, + "step": 2573 + }, + { + "epoch": 0.07, + "grad_norm": 1.7283236139849314, + "learning_rate": 9.947835397769519e-06, + "loss": 0.4889, + "step": 2574 + }, + { + "epoch": 0.07, + "grad_norm": 1.7295790146263958, + "learning_rate": 9.947767703802708e-06, + "loss": 0.4523, + "step": 2575 + }, + { + "epoch": 0.07, + "grad_norm": 1.8758704034738016, + "learning_rate": 9.94769996617179e-06, + "loss": 0.4771, + "step": 2576 + }, + { + "epoch": 0.07, + "grad_norm": 1.5519826969809116, + "learning_rate": 9.947632184877369e-06, + "loss": 0.4091, + "step": 2577 + }, + { + "epoch": 0.07, + "grad_norm": 1.707260287335837, + "learning_rate": 9.947564359920036e-06, + "loss": 0.4095, + "step": 2578 + }, + { + "epoch": 0.07, + "grad_norm": 1.8190432207739473, + "learning_rate": 9.947496491300395e-06, + "loss": 0.4621, + "step": 2579 + }, + { + "epoch": 0.07, + "grad_norm": 1.5165732085817305, + "learning_rate": 9.94742857901904e-06, + "loss": 0.4506, + "step": 2580 + }, + { + "epoch": 0.07, + "grad_norm": 2.287722298086667, + "learning_rate": 9.947360623076573e-06, + "loss": 0.4261, + "step": 2581 + }, + { + "epoch": 0.07, + "grad_norm": 2.093081580379896, + "learning_rate": 9.947292623473597e-06, + "loss": 0.42, + "step": 2582 + }, + { + "epoch": 0.07, + "grad_norm": 1.7725667460703354, + "learning_rate": 9.947224580210705e-06, + "loss": 0.4838, + "step": 2583 + }, + { + "epoch": 0.07, + "grad_norm": 1.6956547605522665, + "learning_rate": 9.947156493288504e-06, + "loss": 0.4922, + "step": 2584 + }, + { + "epoch": 0.07, + "grad_norm": 2.2943311410038514, + "learning_rate": 9.947088362707591e-06, + "loss": 0.4837, + "step": 2585 + }, + { + "epoch": 0.08, + "grad_norm": 1.7347901717466998, + "learning_rate": 9.947020188468567e-06, + "loss": 0.4218, + "step": 2586 + }, + { + "epoch": 0.08, + "grad_norm": 1.7319868160367258, + "learning_rate": 9.946951970572036e-06, + "loss": 0.4643, + "step": 2587 + }, + { + "epoch": 0.08, + "grad_norm": 1.743746834304066, + "learning_rate": 9.9468837090186e-06, + "loss": 0.4427, + "step": 2588 + }, + { + "epoch": 0.08, + "grad_norm": 1.775563014743329, + "learning_rate": 9.94681540380886e-06, + "loss": 0.4322, + "step": 2589 + }, + { + "epoch": 0.08, + "grad_norm": 1.7591718977942872, + "learning_rate": 9.946747054943418e-06, + "loss": 0.4437, + "step": 2590 + }, + { + "epoch": 0.08, + "grad_norm": 1.7220672398856511, + "learning_rate": 9.94667866242288e-06, + "loss": 0.489, + "step": 2591 + }, + { + "epoch": 0.08, + "grad_norm": 1.5983265206097201, + "learning_rate": 9.946610226247846e-06, + "loss": 0.4421, + "step": 2592 + }, + { + "epoch": 0.08, + "grad_norm": 1.6668938318002726, + "learning_rate": 9.946541746418922e-06, + "loss": 0.4681, + "step": 2593 + }, + { + "epoch": 0.08, + "grad_norm": 1.7381064439963245, + "learning_rate": 9.946473222936713e-06, + "loss": 0.4103, + "step": 2594 + }, + { + "epoch": 0.08, + "grad_norm": 1.5983311013773756, + "learning_rate": 9.946404655801823e-06, + "loss": 0.4423, + "step": 2595 + }, + { + "epoch": 0.08, + "grad_norm": 1.175883105167644, + "learning_rate": 9.946336045014858e-06, + "loss": 0.6172, + "step": 2596 + }, + { + "epoch": 0.08, + "grad_norm": 1.9116716862134693, + "learning_rate": 9.94626739057642e-06, + "loss": 0.4406, + "step": 2597 + }, + { + "epoch": 0.08, + "grad_norm": 1.7285736940862586, + "learning_rate": 9.94619869248712e-06, + "loss": 0.4587, + "step": 2598 + }, + { + "epoch": 0.08, + "grad_norm": 1.7021709507902871, + "learning_rate": 9.94612995074756e-06, + "loss": 0.4588, + "step": 2599 + }, + { + "epoch": 0.08, + "grad_norm": 1.6656384272574059, + "learning_rate": 9.946061165358349e-06, + "loss": 0.4333, + "step": 2600 + }, + { + "epoch": 0.08, + "grad_norm": 1.8424861404112034, + "learning_rate": 9.945992336320094e-06, + "loss": 0.4583, + "step": 2601 + }, + { + "epoch": 0.08, + "grad_norm": 1.7072577511070708, + "learning_rate": 9.9459234636334e-06, + "loss": 0.469, + "step": 2602 + }, + { + "epoch": 0.08, + "grad_norm": 2.0546857739230067, + "learning_rate": 9.945854547298878e-06, + "loss": 0.526, + "step": 2603 + }, + { + "epoch": 0.08, + "grad_norm": 1.8329565063742193, + "learning_rate": 9.945785587317134e-06, + "loss": 0.4596, + "step": 2604 + }, + { + "epoch": 0.08, + "grad_norm": 1.6727824063184014, + "learning_rate": 9.945716583688778e-06, + "loss": 0.4471, + "step": 2605 + }, + { + "epoch": 0.08, + "grad_norm": 1.6703050852016506, + "learning_rate": 9.945647536414417e-06, + "loss": 0.442, + "step": 2606 + }, + { + "epoch": 0.08, + "grad_norm": 1.7161691412321334, + "learning_rate": 9.945578445494663e-06, + "loss": 0.4834, + "step": 2607 + }, + { + "epoch": 0.08, + "grad_norm": 2.544612202934222, + "learning_rate": 9.945509310930124e-06, + "loss": 0.4754, + "step": 2608 + }, + { + "epoch": 0.08, + "grad_norm": 1.6009887227825517, + "learning_rate": 9.945440132721412e-06, + "loss": 0.467, + "step": 2609 + }, + { + "epoch": 0.08, + "grad_norm": 1.5806029439088773, + "learning_rate": 9.945370910869134e-06, + "loss": 0.4204, + "step": 2610 + }, + { + "epoch": 0.08, + "grad_norm": 2.4814572803381014, + "learning_rate": 9.945301645373902e-06, + "loss": 0.4593, + "step": 2611 + }, + { + "epoch": 0.08, + "grad_norm": 2.018458020020326, + "learning_rate": 9.94523233623633e-06, + "loss": 0.4839, + "step": 2612 + }, + { + "epoch": 0.08, + "grad_norm": 1.7416355386620932, + "learning_rate": 9.945162983457026e-06, + "loss": 0.4509, + "step": 2613 + }, + { + "epoch": 0.08, + "grad_norm": 2.082128593624636, + "learning_rate": 9.945093587036606e-06, + "loss": 0.4259, + "step": 2614 + }, + { + "epoch": 0.08, + "grad_norm": 1.7527695128737169, + "learning_rate": 9.94502414697568e-06, + "loss": 0.4519, + "step": 2615 + }, + { + "epoch": 0.08, + "grad_norm": 1.7935714660982733, + "learning_rate": 9.94495466327486e-06, + "loss": 0.4425, + "step": 2616 + }, + { + "epoch": 0.08, + "grad_norm": 2.035478790892685, + "learning_rate": 9.944885135934759e-06, + "loss": 0.4311, + "step": 2617 + }, + { + "epoch": 0.08, + "grad_norm": 1.6682441311306322, + "learning_rate": 9.944815564955995e-06, + "loss": 0.4404, + "step": 2618 + }, + { + "epoch": 0.08, + "grad_norm": 1.7549575881713595, + "learning_rate": 9.944745950339176e-06, + "loss": 0.4282, + "step": 2619 + }, + { + "epoch": 0.08, + "grad_norm": 1.6249704863746401, + "learning_rate": 9.94467629208492e-06, + "loss": 0.4622, + "step": 2620 + }, + { + "epoch": 0.08, + "grad_norm": 1.7133885134009523, + "learning_rate": 9.944606590193841e-06, + "loss": 0.4344, + "step": 2621 + }, + { + "epoch": 0.08, + "grad_norm": 1.9434344368435896, + "learning_rate": 9.944536844666554e-06, + "loss": 0.4444, + "step": 2622 + }, + { + "epoch": 0.08, + "grad_norm": 1.765439217226697, + "learning_rate": 9.944467055503674e-06, + "loss": 0.4269, + "step": 2623 + }, + { + "epoch": 0.08, + "grad_norm": 1.8455401613187437, + "learning_rate": 9.944397222705818e-06, + "loss": 0.4293, + "step": 2624 + }, + { + "epoch": 0.08, + "grad_norm": 1.629838064718498, + "learning_rate": 9.9443273462736e-06, + "loss": 0.4129, + "step": 2625 + }, + { + "epoch": 0.08, + "grad_norm": 1.8252999931344382, + "learning_rate": 9.944257426207643e-06, + "loss": 0.4252, + "step": 2626 + }, + { + "epoch": 0.08, + "grad_norm": 1.777357790422405, + "learning_rate": 9.944187462508554e-06, + "loss": 0.4131, + "step": 2627 + }, + { + "epoch": 0.08, + "grad_norm": 1.725845625587478, + "learning_rate": 9.944117455176958e-06, + "loss": 0.448, + "step": 2628 + }, + { + "epoch": 0.08, + "grad_norm": 1.764966769267837, + "learning_rate": 9.944047404213472e-06, + "loss": 0.4787, + "step": 2629 + }, + { + "epoch": 0.08, + "grad_norm": 1.730425235935806, + "learning_rate": 9.943977309618712e-06, + "loss": 0.452, + "step": 2630 + }, + { + "epoch": 0.08, + "grad_norm": 5.399781792949419, + "learning_rate": 9.943907171393296e-06, + "loss": 0.431, + "step": 2631 + }, + { + "epoch": 0.08, + "grad_norm": 1.9746057278834792, + "learning_rate": 9.943836989537845e-06, + "loss": 0.5174, + "step": 2632 + }, + { + "epoch": 0.08, + "grad_norm": 1.6728808210701474, + "learning_rate": 9.943766764052978e-06, + "loss": 0.4457, + "step": 2633 + }, + { + "epoch": 0.08, + "grad_norm": 2.985982817312912, + "learning_rate": 9.943696494939315e-06, + "loss": 0.4428, + "step": 2634 + }, + { + "epoch": 0.08, + "grad_norm": 1.5634776323202983, + "learning_rate": 9.943626182197475e-06, + "loss": 0.4408, + "step": 2635 + }, + { + "epoch": 0.08, + "grad_norm": 1.710277712631329, + "learning_rate": 9.943555825828078e-06, + "loss": 0.4492, + "step": 2636 + }, + { + "epoch": 0.08, + "grad_norm": 1.6303493317448865, + "learning_rate": 9.943485425831746e-06, + "loss": 0.4303, + "step": 2637 + }, + { + "epoch": 0.08, + "grad_norm": 1.6474166334315878, + "learning_rate": 9.943414982209103e-06, + "loss": 0.4422, + "step": 2638 + }, + { + "epoch": 0.08, + "grad_norm": 1.6994843908293689, + "learning_rate": 9.943344494960766e-06, + "loss": 0.463, + "step": 2639 + }, + { + "epoch": 0.08, + "grad_norm": 1.6307199918230992, + "learning_rate": 9.943273964087359e-06, + "loss": 0.4545, + "step": 2640 + }, + { + "epoch": 0.08, + "grad_norm": 1.6795052480132022, + "learning_rate": 9.943203389589506e-06, + "loss": 0.4937, + "step": 2641 + }, + { + "epoch": 0.08, + "grad_norm": 1.6974165405046835, + "learning_rate": 9.943132771467826e-06, + "loss": 0.4738, + "step": 2642 + }, + { + "epoch": 0.08, + "grad_norm": 1.7915816357296115, + "learning_rate": 9.943062109722945e-06, + "loss": 0.5028, + "step": 2643 + }, + { + "epoch": 0.08, + "grad_norm": 2.3974930297350037, + "learning_rate": 9.942991404355487e-06, + "loss": 0.4425, + "step": 2644 + }, + { + "epoch": 0.08, + "grad_norm": 1.6875490042637513, + "learning_rate": 9.942920655366075e-06, + "loss": 0.4687, + "step": 2645 + }, + { + "epoch": 0.08, + "grad_norm": 1.7006393640892945, + "learning_rate": 9.942849862755332e-06, + "loss": 0.4174, + "step": 2646 + }, + { + "epoch": 0.08, + "grad_norm": 1.105735519296871, + "learning_rate": 9.942779026523886e-06, + "loss": 0.6093, + "step": 2647 + }, + { + "epoch": 0.08, + "grad_norm": 1.6712438829010132, + "learning_rate": 9.94270814667236e-06, + "loss": 0.4655, + "step": 2648 + }, + { + "epoch": 0.08, + "grad_norm": 1.6092131691656344, + "learning_rate": 9.942637223201377e-06, + "loss": 0.4806, + "step": 2649 + }, + { + "epoch": 0.08, + "grad_norm": 1.6917770889427786, + "learning_rate": 9.94256625611157e-06, + "loss": 0.4841, + "step": 2650 + }, + { + "epoch": 0.08, + "grad_norm": 1.6384799097685734, + "learning_rate": 9.942495245403558e-06, + "loss": 0.4159, + "step": 2651 + }, + { + "epoch": 0.08, + "grad_norm": 1.8639412403174622, + "learning_rate": 9.94242419107797e-06, + "loss": 0.4402, + "step": 2652 + }, + { + "epoch": 0.08, + "grad_norm": 1.535816370461862, + "learning_rate": 9.942353093135435e-06, + "loss": 0.445, + "step": 2653 + }, + { + "epoch": 0.08, + "grad_norm": 1.6995122663559181, + "learning_rate": 9.94228195157658e-06, + "loss": 0.445, + "step": 2654 + }, + { + "epoch": 0.08, + "grad_norm": 1.5443535124927974, + "learning_rate": 9.942210766402031e-06, + "loss": 0.4594, + "step": 2655 + }, + { + "epoch": 0.08, + "grad_norm": 1.6144581025604017, + "learning_rate": 9.942139537612416e-06, + "loss": 0.4546, + "step": 2656 + }, + { + "epoch": 0.08, + "grad_norm": 2.193726513271717, + "learning_rate": 9.942068265208367e-06, + "loss": 0.4539, + "step": 2657 + }, + { + "epoch": 0.08, + "grad_norm": 1.600665661375506, + "learning_rate": 9.94199694919051e-06, + "loss": 0.4827, + "step": 2658 + }, + { + "epoch": 0.08, + "grad_norm": 2.478176146175294, + "learning_rate": 9.941925589559475e-06, + "loss": 0.5017, + "step": 2659 + }, + { + "epoch": 0.08, + "grad_norm": 1.553342536023208, + "learning_rate": 9.941854186315891e-06, + "loss": 0.4452, + "step": 2660 + }, + { + "epoch": 0.08, + "grad_norm": 1.6855335018995283, + "learning_rate": 9.94178273946039e-06, + "loss": 0.4537, + "step": 2661 + }, + { + "epoch": 0.08, + "grad_norm": 1.5453003814054485, + "learning_rate": 9.941711248993602e-06, + "loss": 0.4426, + "step": 2662 + }, + { + "epoch": 0.08, + "grad_norm": 1.7180612606974925, + "learning_rate": 9.941639714916156e-06, + "loss": 0.4534, + "step": 2663 + }, + { + "epoch": 0.08, + "grad_norm": 1.677831733550694, + "learning_rate": 9.941568137228686e-06, + "loss": 0.4487, + "step": 2664 + }, + { + "epoch": 0.08, + "grad_norm": 1.7420188520745528, + "learning_rate": 9.94149651593182e-06, + "loss": 0.5097, + "step": 2665 + }, + { + "epoch": 0.08, + "grad_norm": 1.5853400352601834, + "learning_rate": 9.941424851026194e-06, + "loss": 0.4763, + "step": 2666 + }, + { + "epoch": 0.08, + "grad_norm": 1.5893061453681103, + "learning_rate": 9.94135314251244e-06, + "loss": 0.4383, + "step": 2667 + }, + { + "epoch": 0.08, + "grad_norm": 1.574127368265593, + "learning_rate": 9.941281390391189e-06, + "loss": 0.4511, + "step": 2668 + }, + { + "epoch": 0.08, + "grad_norm": 1.7885180822958506, + "learning_rate": 9.941209594663074e-06, + "loss": 0.3986, + "step": 2669 + }, + { + "epoch": 0.08, + "grad_norm": 1.708147614067529, + "learning_rate": 9.941137755328729e-06, + "loss": 0.4784, + "step": 2670 + }, + { + "epoch": 0.08, + "grad_norm": 2.129658829817133, + "learning_rate": 9.941065872388792e-06, + "loss": 0.5176, + "step": 2671 + }, + { + "epoch": 0.08, + "grad_norm": 1.6174222250511128, + "learning_rate": 9.94099394584389e-06, + "loss": 0.4098, + "step": 2672 + }, + { + "epoch": 0.08, + "grad_norm": 1.8669059489553381, + "learning_rate": 9.940921975694662e-06, + "loss": 0.4595, + "step": 2673 + }, + { + "epoch": 0.08, + "grad_norm": 1.8057253013031695, + "learning_rate": 9.940849961941746e-06, + "loss": 0.5159, + "step": 2674 + }, + { + "epoch": 0.08, + "grad_norm": 1.5820986621428295, + "learning_rate": 9.94077790458577e-06, + "loss": 0.4654, + "step": 2675 + }, + { + "epoch": 0.08, + "grad_norm": 1.6358586828059014, + "learning_rate": 9.940705803627377e-06, + "loss": 0.4542, + "step": 2676 + }, + { + "epoch": 0.08, + "grad_norm": 3.248206864172426, + "learning_rate": 9.9406336590672e-06, + "loss": 0.479, + "step": 2677 + }, + { + "epoch": 0.08, + "grad_norm": 1.7009520802940785, + "learning_rate": 9.940561470905876e-06, + "loss": 0.4347, + "step": 2678 + }, + { + "epoch": 0.08, + "grad_norm": 1.8087753513416462, + "learning_rate": 9.940489239144043e-06, + "loss": 0.5042, + "step": 2679 + }, + { + "epoch": 0.08, + "grad_norm": 1.684890365860958, + "learning_rate": 9.940416963782335e-06, + "loss": 0.4458, + "step": 2680 + }, + { + "epoch": 0.08, + "grad_norm": 1.6137568192646825, + "learning_rate": 9.940344644821394e-06, + "loss": 0.5032, + "step": 2681 + }, + { + "epoch": 0.08, + "grad_norm": 1.542191987511335, + "learning_rate": 9.940272282261858e-06, + "loss": 0.446, + "step": 2682 + }, + { + "epoch": 0.08, + "grad_norm": 1.1296243835251727, + "learning_rate": 9.940199876104362e-06, + "loss": 0.5898, + "step": 2683 + }, + { + "epoch": 0.08, + "grad_norm": 1.6937925801540594, + "learning_rate": 9.94012742634955e-06, + "loss": 0.4869, + "step": 2684 + }, + { + "epoch": 0.08, + "grad_norm": 1.6604149520953022, + "learning_rate": 9.940054932998056e-06, + "loss": 0.4495, + "step": 2685 + }, + { + "epoch": 0.08, + "grad_norm": 1.7445198501402928, + "learning_rate": 9.939982396050524e-06, + "loss": 0.4429, + "step": 2686 + }, + { + "epoch": 0.08, + "grad_norm": 1.7106285624579136, + "learning_rate": 9.939909815507593e-06, + "loss": 0.445, + "step": 2687 + }, + { + "epoch": 0.08, + "grad_norm": 1.6140494421356932, + "learning_rate": 9.9398371913699e-06, + "loss": 0.4239, + "step": 2688 + }, + { + "epoch": 0.08, + "grad_norm": 1.5504456338638402, + "learning_rate": 9.939764523638093e-06, + "loss": 0.4878, + "step": 2689 + }, + { + "epoch": 0.08, + "grad_norm": 1.7373487195775248, + "learning_rate": 9.939691812312807e-06, + "loss": 0.4241, + "step": 2690 + }, + { + "epoch": 0.08, + "grad_norm": 2.577579130399938, + "learning_rate": 9.939619057394687e-06, + "loss": 0.4441, + "step": 2691 + }, + { + "epoch": 0.08, + "grad_norm": 1.6450727124285893, + "learning_rate": 9.939546258884375e-06, + "loss": 0.4084, + "step": 2692 + }, + { + "epoch": 0.08, + "grad_norm": 2.2702344334739597, + "learning_rate": 9.939473416782509e-06, + "loss": 0.4797, + "step": 2693 + }, + { + "epoch": 0.08, + "grad_norm": 1.8884940970977637, + "learning_rate": 9.939400531089738e-06, + "loss": 0.4961, + "step": 2694 + }, + { + "epoch": 0.08, + "grad_norm": 1.6038273240865561, + "learning_rate": 9.939327601806701e-06, + "loss": 0.491, + "step": 2695 + }, + { + "epoch": 0.08, + "grad_norm": 1.447016560837788, + "learning_rate": 9.939254628934044e-06, + "loss": 0.424, + "step": 2696 + }, + { + "epoch": 0.08, + "grad_norm": 1.6572122693943823, + "learning_rate": 9.93918161247241e-06, + "loss": 0.4562, + "step": 2697 + }, + { + "epoch": 0.08, + "grad_norm": 1.8800773526878682, + "learning_rate": 9.939108552422443e-06, + "loss": 0.4428, + "step": 2698 + }, + { + "epoch": 0.08, + "grad_norm": 1.8127154593353938, + "learning_rate": 9.939035448784788e-06, + "loss": 0.4913, + "step": 2699 + }, + { + "epoch": 0.08, + "grad_norm": 1.5870184035852144, + "learning_rate": 9.938962301560092e-06, + "loss": 0.4672, + "step": 2700 + }, + { + "epoch": 0.08, + "grad_norm": 1.5794529322080386, + "learning_rate": 9.938889110748998e-06, + "loss": 0.4153, + "step": 2701 + }, + { + "epoch": 0.08, + "grad_norm": 1.6087102935743205, + "learning_rate": 9.938815876352152e-06, + "loss": 0.4159, + "step": 2702 + }, + { + "epoch": 0.08, + "grad_norm": 1.729455035161181, + "learning_rate": 9.938742598370203e-06, + "loss": 0.3957, + "step": 2703 + }, + { + "epoch": 0.08, + "grad_norm": 1.609409074957372, + "learning_rate": 9.938669276803795e-06, + "loss": 0.4519, + "step": 2704 + }, + { + "epoch": 0.08, + "grad_norm": 1.8834291450590364, + "learning_rate": 9.938595911653575e-06, + "loss": 0.4602, + "step": 2705 + }, + { + "epoch": 0.08, + "grad_norm": 1.728708147836861, + "learning_rate": 9.938522502920192e-06, + "loss": 0.4536, + "step": 2706 + }, + { + "epoch": 0.08, + "grad_norm": 1.5777902707773075, + "learning_rate": 9.938449050604293e-06, + "loss": 0.4241, + "step": 2707 + }, + { + "epoch": 0.08, + "grad_norm": 1.769972228880749, + "learning_rate": 9.938375554706525e-06, + "loss": 0.4759, + "step": 2708 + }, + { + "epoch": 0.08, + "grad_norm": 1.6549240756091428, + "learning_rate": 9.93830201522754e-06, + "loss": 0.4391, + "step": 2709 + }, + { + "epoch": 0.08, + "grad_norm": 1.623965563733894, + "learning_rate": 9.938228432167984e-06, + "loss": 0.4391, + "step": 2710 + }, + { + "epoch": 0.08, + "grad_norm": 1.319759691177556, + "learning_rate": 9.938154805528509e-06, + "loss": 0.6401, + "step": 2711 + }, + { + "epoch": 0.08, + "grad_norm": 2.451979238091033, + "learning_rate": 9.938081135309761e-06, + "loss": 0.4702, + "step": 2712 + }, + { + "epoch": 0.08, + "grad_norm": 1.0409016011611105, + "learning_rate": 9.938007421512392e-06, + "loss": 0.5982, + "step": 2713 + }, + { + "epoch": 0.08, + "grad_norm": 1.6422964394014403, + "learning_rate": 9.937933664137054e-06, + "loss": 0.4728, + "step": 2714 + }, + { + "epoch": 0.08, + "grad_norm": 1.8528716868833943, + "learning_rate": 9.937859863184396e-06, + "loss": 0.4403, + "step": 2715 + }, + { + "epoch": 0.08, + "grad_norm": 1.749307016199454, + "learning_rate": 9.937786018655069e-06, + "loss": 0.4737, + "step": 2716 + }, + { + "epoch": 0.08, + "grad_norm": 1.9506077175321088, + "learning_rate": 9.937712130549726e-06, + "loss": 0.4864, + "step": 2717 + }, + { + "epoch": 0.08, + "grad_norm": 1.6236789807666527, + "learning_rate": 9.937638198869021e-06, + "loss": 0.4478, + "step": 2718 + }, + { + "epoch": 0.08, + "grad_norm": 1.9107668772413622, + "learning_rate": 9.937564223613602e-06, + "loss": 0.4556, + "step": 2719 + }, + { + "epoch": 0.08, + "grad_norm": 1.695332416199944, + "learning_rate": 9.937490204784124e-06, + "loss": 0.4122, + "step": 2720 + }, + { + "epoch": 0.08, + "grad_norm": 1.6456870166048958, + "learning_rate": 9.93741614238124e-06, + "loss": 0.4425, + "step": 2721 + }, + { + "epoch": 0.08, + "grad_norm": 1.708851849033533, + "learning_rate": 9.937342036405605e-06, + "loss": 0.4973, + "step": 2722 + }, + { + "epoch": 0.08, + "grad_norm": 1.556192156451232, + "learning_rate": 9.93726788685787e-06, + "loss": 0.4309, + "step": 2723 + }, + { + "epoch": 0.08, + "grad_norm": 2.102422811930572, + "learning_rate": 9.937193693738694e-06, + "loss": 0.4538, + "step": 2724 + }, + { + "epoch": 0.08, + "grad_norm": 1.6069099162647504, + "learning_rate": 9.937119457048728e-06, + "loss": 0.4991, + "step": 2725 + }, + { + "epoch": 0.08, + "grad_norm": 1.6981635263757666, + "learning_rate": 9.937045176788625e-06, + "loss": 0.4516, + "step": 2726 + }, + { + "epoch": 0.08, + "grad_norm": 1.6380395147531122, + "learning_rate": 9.936970852959047e-06, + "loss": 0.4478, + "step": 2727 + }, + { + "epoch": 0.08, + "grad_norm": 1.7616291407220992, + "learning_rate": 9.936896485560645e-06, + "loss": 0.4731, + "step": 2728 + }, + { + "epoch": 0.08, + "grad_norm": 1.6726078378937042, + "learning_rate": 9.936822074594076e-06, + "loss": 0.4683, + "step": 2729 + }, + { + "epoch": 0.08, + "grad_norm": 1.5491588624779975, + "learning_rate": 9.93674762006e-06, + "loss": 0.4238, + "step": 2730 + }, + { + "epoch": 0.08, + "grad_norm": 1.8016752183333713, + "learning_rate": 9.936673121959069e-06, + "loss": 0.6231, + "step": 2731 + }, + { + "epoch": 0.08, + "grad_norm": 2.859452526207898, + "learning_rate": 9.936598580291944e-06, + "loss": 0.4786, + "step": 2732 + }, + { + "epoch": 0.08, + "grad_norm": 1.7617613559698098, + "learning_rate": 9.93652399505928e-06, + "loss": 0.4239, + "step": 2733 + }, + { + "epoch": 0.08, + "grad_norm": 1.3141821075997655, + "learning_rate": 9.936449366261738e-06, + "loss": 0.618, + "step": 2734 + }, + { + "epoch": 0.08, + "grad_norm": 1.6907649250891923, + "learning_rate": 9.936374693899977e-06, + "loss": 0.4834, + "step": 2735 + }, + { + "epoch": 0.08, + "grad_norm": 1.508994185829556, + "learning_rate": 9.936299977974653e-06, + "loss": 0.4451, + "step": 2736 + }, + { + "epoch": 0.08, + "grad_norm": 1.5617548327729354, + "learning_rate": 9.936225218486428e-06, + "loss": 0.4788, + "step": 2737 + }, + { + "epoch": 0.08, + "grad_norm": 1.5869600843572895, + "learning_rate": 9.93615041543596e-06, + "loss": 0.4608, + "step": 2738 + }, + { + "epoch": 0.08, + "grad_norm": 1.510583581375072, + "learning_rate": 9.936075568823911e-06, + "loss": 0.4566, + "step": 2739 + }, + { + "epoch": 0.08, + "grad_norm": 1.5460919906626074, + "learning_rate": 9.936000678650938e-06, + "loss": 0.449, + "step": 2740 + }, + { + "epoch": 0.08, + "grad_norm": 1.6355462823943863, + "learning_rate": 9.935925744917705e-06, + "loss": 0.4056, + "step": 2741 + }, + { + "epoch": 0.08, + "grad_norm": 1.2522611917936384, + "learning_rate": 9.935850767624875e-06, + "loss": 0.6843, + "step": 2742 + }, + { + "epoch": 0.08, + "grad_norm": 1.6389208978751881, + "learning_rate": 9.935775746773104e-06, + "loss": 0.5045, + "step": 2743 + }, + { + "epoch": 0.08, + "grad_norm": 1.7145057356027422, + "learning_rate": 9.935700682363058e-06, + "loss": 0.4553, + "step": 2744 + }, + { + "epoch": 0.08, + "grad_norm": 1.6330102731243175, + "learning_rate": 9.9356255743954e-06, + "loss": 0.434, + "step": 2745 + }, + { + "epoch": 0.08, + "grad_norm": 1.531028818973225, + "learning_rate": 9.935550422870792e-06, + "loss": 0.4429, + "step": 2746 + }, + { + "epoch": 0.08, + "grad_norm": 1.737752573881679, + "learning_rate": 9.935475227789896e-06, + "loss": 0.4574, + "step": 2747 + }, + { + "epoch": 0.08, + "grad_norm": 1.693783238813808, + "learning_rate": 9.935399989153375e-06, + "loss": 0.4233, + "step": 2748 + }, + { + "epoch": 0.08, + "grad_norm": 1.5918577311873106, + "learning_rate": 9.935324706961896e-06, + "loss": 0.4162, + "step": 2749 + }, + { + "epoch": 0.08, + "grad_norm": 1.5270967694797917, + "learning_rate": 9.935249381216121e-06, + "loss": 0.4237, + "step": 2750 + }, + { + "epoch": 0.08, + "grad_norm": 1.6516209031172975, + "learning_rate": 9.935174011916715e-06, + "loss": 0.4489, + "step": 2751 + }, + { + "epoch": 0.08, + "grad_norm": 2.357520789327886, + "learning_rate": 9.935098599064345e-06, + "loss": 0.4941, + "step": 2752 + }, + { + "epoch": 0.08, + "grad_norm": 1.6601423932099177, + "learning_rate": 9.935023142659674e-06, + "loss": 0.4659, + "step": 2753 + }, + { + "epoch": 0.08, + "grad_norm": 1.54642838813399, + "learning_rate": 9.93494764270337e-06, + "loss": 0.4282, + "step": 2754 + }, + { + "epoch": 0.08, + "grad_norm": 1.5898365596436543, + "learning_rate": 9.934872099196098e-06, + "loss": 0.483, + "step": 2755 + }, + { + "epoch": 0.08, + "grad_norm": 1.6518263550966341, + "learning_rate": 9.934796512138523e-06, + "loss": 0.4701, + "step": 2756 + }, + { + "epoch": 0.08, + "grad_norm": 1.4991612726896169, + "learning_rate": 9.934720881531316e-06, + "loss": 0.4238, + "step": 2757 + }, + { + "epoch": 0.08, + "grad_norm": 1.9139292972630706, + "learning_rate": 9.934645207375142e-06, + "loss": 0.4279, + "step": 2758 + }, + { + "epoch": 0.08, + "grad_norm": 1.5890966585036879, + "learning_rate": 9.93456948967067e-06, + "loss": 0.4816, + "step": 2759 + }, + { + "epoch": 0.08, + "grad_norm": 1.8486707692520203, + "learning_rate": 9.934493728418567e-06, + "loss": 0.4775, + "step": 2760 + }, + { + "epoch": 0.08, + "grad_norm": 1.6043250338381125, + "learning_rate": 9.934417923619502e-06, + "loss": 0.4373, + "step": 2761 + }, + { + "epoch": 0.08, + "grad_norm": 1.6557517208591133, + "learning_rate": 9.934342075274143e-06, + "loss": 0.4577, + "step": 2762 + }, + { + "epoch": 0.08, + "grad_norm": 1.5868123614362701, + "learning_rate": 9.934266183383163e-06, + "loss": 0.4164, + "step": 2763 + }, + { + "epoch": 0.08, + "grad_norm": 1.6181599869828798, + "learning_rate": 9.934190247947228e-06, + "loss": 0.4619, + "step": 2764 + }, + { + "epoch": 0.08, + "grad_norm": 1.511312859118179, + "learning_rate": 9.934114268967008e-06, + "loss": 0.3892, + "step": 2765 + }, + { + "epoch": 0.08, + "grad_norm": 1.5870599339819262, + "learning_rate": 9.934038246443176e-06, + "loss": 0.4636, + "step": 2766 + }, + { + "epoch": 0.08, + "grad_norm": 1.8286611390497205, + "learning_rate": 9.933962180376403e-06, + "loss": 0.4369, + "step": 2767 + }, + { + "epoch": 0.08, + "grad_norm": 1.6108822932435591, + "learning_rate": 9.933886070767356e-06, + "loss": 0.467, + "step": 2768 + }, + { + "epoch": 0.08, + "grad_norm": 1.5210552746340409, + "learning_rate": 9.933809917616712e-06, + "loss": 0.4225, + "step": 2769 + }, + { + "epoch": 0.08, + "grad_norm": 3.7868960673874366, + "learning_rate": 9.93373372092514e-06, + "loss": 0.4151, + "step": 2770 + }, + { + "epoch": 0.08, + "grad_norm": 1.1058212592012129, + "learning_rate": 9.933657480693314e-06, + "loss": 0.5589, + "step": 2771 + }, + { + "epoch": 0.08, + "grad_norm": 1.759236343685289, + "learning_rate": 9.933581196921905e-06, + "loss": 0.4533, + "step": 2772 + }, + { + "epoch": 0.08, + "grad_norm": 1.8273187652145084, + "learning_rate": 9.933504869611588e-06, + "loss": 0.5011, + "step": 2773 + }, + { + "epoch": 0.08, + "grad_norm": 1.5477490868577273, + "learning_rate": 9.933428498763036e-06, + "loss": 0.4934, + "step": 2774 + }, + { + "epoch": 0.08, + "grad_norm": 1.5363600164388815, + "learning_rate": 9.933352084376922e-06, + "loss": 0.4978, + "step": 2775 + }, + { + "epoch": 0.08, + "grad_norm": 1.658417804676486, + "learning_rate": 9.93327562645392e-06, + "loss": 0.4486, + "step": 2776 + }, + { + "epoch": 0.08, + "grad_norm": 1.5168878564752115, + "learning_rate": 9.933199124994708e-06, + "loss": 0.4446, + "step": 2777 + }, + { + "epoch": 0.08, + "grad_norm": 1.7073795908771758, + "learning_rate": 9.933122579999958e-06, + "loss": 0.4828, + "step": 2778 + }, + { + "epoch": 0.08, + "grad_norm": 1.9949802768447535, + "learning_rate": 9.933045991470346e-06, + "loss": 0.4619, + "step": 2779 + }, + { + "epoch": 0.08, + "grad_norm": 1.5529977339003225, + "learning_rate": 9.932969359406549e-06, + "loss": 0.4037, + "step": 2780 + }, + { + "epoch": 0.08, + "grad_norm": 1.6639904599636872, + "learning_rate": 9.932892683809243e-06, + "loss": 0.453, + "step": 2781 + }, + { + "epoch": 0.08, + "grad_norm": 1.7113909518112254, + "learning_rate": 9.932815964679103e-06, + "loss": 0.4482, + "step": 2782 + }, + { + "epoch": 0.08, + "grad_norm": 1.6820948700572431, + "learning_rate": 9.93273920201681e-06, + "loss": 0.435, + "step": 2783 + }, + { + "epoch": 0.08, + "grad_norm": 3.1471201413006176, + "learning_rate": 9.932662395823036e-06, + "loss": 0.4634, + "step": 2784 + }, + { + "epoch": 0.08, + "grad_norm": 1.7234939839082888, + "learning_rate": 9.932585546098463e-06, + "loss": 0.4051, + "step": 2785 + }, + { + "epoch": 0.08, + "grad_norm": 1.5508861124486393, + "learning_rate": 9.932508652843768e-06, + "loss": 0.4662, + "step": 2786 + }, + { + "epoch": 0.08, + "grad_norm": 1.5659127375956887, + "learning_rate": 9.932431716059629e-06, + "loss": 0.4738, + "step": 2787 + }, + { + "epoch": 0.08, + "grad_norm": 1.4955691672512847, + "learning_rate": 9.932354735746725e-06, + "loss": 0.4478, + "step": 2788 + }, + { + "epoch": 0.08, + "grad_norm": 1.7056991673680768, + "learning_rate": 9.932277711905737e-06, + "loss": 0.5361, + "step": 2789 + }, + { + "epoch": 0.08, + "grad_norm": 1.62897466359658, + "learning_rate": 9.932200644537342e-06, + "loss": 0.5012, + "step": 2790 + }, + { + "epoch": 0.08, + "grad_norm": 1.6208341550420002, + "learning_rate": 9.932123533642223e-06, + "loss": 0.4932, + "step": 2791 + }, + { + "epoch": 0.08, + "grad_norm": 1.6066665504188087, + "learning_rate": 9.932046379221058e-06, + "loss": 0.4992, + "step": 2792 + }, + { + "epoch": 0.08, + "grad_norm": 1.592893572199292, + "learning_rate": 9.93196918127453e-06, + "loss": 0.489, + "step": 2793 + }, + { + "epoch": 0.08, + "grad_norm": 1.5328185680654707, + "learning_rate": 9.931891939803317e-06, + "loss": 0.4111, + "step": 2794 + }, + { + "epoch": 0.08, + "grad_norm": 1.6411555264029383, + "learning_rate": 9.931814654808105e-06, + "loss": 0.4463, + "step": 2795 + }, + { + "epoch": 0.08, + "grad_norm": 1.5333062865821816, + "learning_rate": 9.931737326289575e-06, + "loss": 0.4586, + "step": 2796 + }, + { + "epoch": 0.08, + "grad_norm": 1.5778870472504893, + "learning_rate": 9.931659954248407e-06, + "loss": 0.4465, + "step": 2797 + }, + { + "epoch": 0.08, + "grad_norm": 1.660828433273159, + "learning_rate": 9.931582538685284e-06, + "loss": 0.4284, + "step": 2798 + }, + { + "epoch": 0.08, + "grad_norm": 1.5316572552596353, + "learning_rate": 9.931505079600892e-06, + "loss": 0.4242, + "step": 2799 + }, + { + "epoch": 0.08, + "grad_norm": 1.5181602652265314, + "learning_rate": 9.931427576995913e-06, + "loss": 0.4176, + "step": 2800 + }, + { + "epoch": 0.08, + "grad_norm": 1.683021578356378, + "learning_rate": 9.93135003087103e-06, + "loss": 0.4108, + "step": 2801 + }, + { + "epoch": 0.08, + "grad_norm": 4.020436677896403, + "learning_rate": 9.931272441226929e-06, + "loss": 0.4533, + "step": 2802 + }, + { + "epoch": 0.08, + "grad_norm": 1.6010026883269026, + "learning_rate": 9.931194808064293e-06, + "loss": 0.4423, + "step": 2803 + }, + { + "epoch": 0.08, + "grad_norm": 1.62892202416877, + "learning_rate": 9.93111713138381e-06, + "loss": 0.4588, + "step": 2804 + }, + { + "epoch": 0.08, + "grad_norm": 1.6509413597138116, + "learning_rate": 9.931039411186161e-06, + "loss": 0.5555, + "step": 2805 + }, + { + "epoch": 0.08, + "grad_norm": 1.6239889871791309, + "learning_rate": 9.930961647472038e-06, + "loss": 0.4932, + "step": 2806 + }, + { + "epoch": 0.08, + "grad_norm": 1.5549622144953175, + "learning_rate": 9.93088384024212e-06, + "loss": 0.434, + "step": 2807 + }, + { + "epoch": 0.08, + "grad_norm": 1.5328632290541433, + "learning_rate": 9.9308059894971e-06, + "loss": 0.4543, + "step": 2808 + }, + { + "epoch": 0.08, + "grad_norm": 1.5358335356126565, + "learning_rate": 9.93072809523766e-06, + "loss": 0.4407, + "step": 2809 + }, + { + "epoch": 0.08, + "grad_norm": 1.528675829760565, + "learning_rate": 9.930650157464492e-06, + "loss": 0.4468, + "step": 2810 + }, + { + "epoch": 0.08, + "grad_norm": 1.5808304631289256, + "learning_rate": 9.930572176178282e-06, + "loss": 0.4085, + "step": 2811 + }, + { + "epoch": 0.08, + "grad_norm": 1.7469338407431345, + "learning_rate": 9.930494151379715e-06, + "loss": 0.4831, + "step": 2812 + }, + { + "epoch": 0.08, + "grad_norm": 1.538445494765707, + "learning_rate": 9.930416083069484e-06, + "loss": 0.4411, + "step": 2813 + }, + { + "epoch": 0.08, + "grad_norm": 1.4955307552044754, + "learning_rate": 9.930337971248279e-06, + "loss": 0.4209, + "step": 2814 + }, + { + "epoch": 0.08, + "grad_norm": 1.1181641613043034, + "learning_rate": 9.930259815916784e-06, + "loss": 0.5915, + "step": 2815 + }, + { + "epoch": 0.08, + "grad_norm": 1.666385838461944, + "learning_rate": 9.93018161707569e-06, + "loss": 0.497, + "step": 2816 + }, + { + "epoch": 0.08, + "grad_norm": 1.5778094456021903, + "learning_rate": 9.930103374725692e-06, + "loss": 0.4491, + "step": 2817 + }, + { + "epoch": 0.08, + "grad_norm": 1.9145654568945398, + "learning_rate": 9.930025088867475e-06, + "loss": 0.4699, + "step": 2818 + }, + { + "epoch": 0.08, + "grad_norm": 1.7093541410904447, + "learning_rate": 9.929946759501732e-06, + "loss": 0.4799, + "step": 2819 + }, + { + "epoch": 0.08, + "grad_norm": 1.7065913579548366, + "learning_rate": 9.929868386629156e-06, + "loss": 0.4948, + "step": 2820 + }, + { + "epoch": 0.08, + "grad_norm": 1.5937949528122444, + "learning_rate": 9.929789970250435e-06, + "loss": 0.4699, + "step": 2821 + }, + { + "epoch": 0.08, + "grad_norm": 1.6542801351603305, + "learning_rate": 9.929711510366263e-06, + "loss": 0.4627, + "step": 2822 + }, + { + "epoch": 0.08, + "grad_norm": 1.4166205655485442, + "learning_rate": 9.929633006977331e-06, + "loss": 0.4549, + "step": 2823 + }, + { + "epoch": 0.08, + "grad_norm": 1.6543537311562027, + "learning_rate": 9.929554460084334e-06, + "loss": 0.5213, + "step": 2824 + }, + { + "epoch": 0.08, + "grad_norm": 1.6466875363825522, + "learning_rate": 9.929475869687965e-06, + "loss": 0.4811, + "step": 2825 + }, + { + "epoch": 0.08, + "grad_norm": 1.7202031029259823, + "learning_rate": 9.929397235788918e-06, + "loss": 0.4538, + "step": 2826 + }, + { + "epoch": 0.08, + "grad_norm": 1.7461689230209008, + "learning_rate": 9.929318558387882e-06, + "loss": 0.4179, + "step": 2827 + }, + { + "epoch": 0.08, + "grad_norm": 1.6287953661055736, + "learning_rate": 9.929239837485558e-06, + "loss": 0.4371, + "step": 2828 + }, + { + "epoch": 0.08, + "grad_norm": 1.5205960518089274, + "learning_rate": 9.929161073082636e-06, + "loss": 0.4262, + "step": 2829 + }, + { + "epoch": 0.08, + "grad_norm": 1.1908613217992516, + "learning_rate": 9.929082265179814e-06, + "loss": 0.5981, + "step": 2830 + }, + { + "epoch": 0.08, + "grad_norm": 1.9709599717351607, + "learning_rate": 9.929003413777788e-06, + "loss": 0.459, + "step": 2831 + }, + { + "epoch": 0.08, + "grad_norm": 1.5551964306762776, + "learning_rate": 9.928924518877248e-06, + "loss": 0.443, + "step": 2832 + }, + { + "epoch": 0.08, + "grad_norm": 1.551555354438762, + "learning_rate": 9.928845580478896e-06, + "loss": 0.4492, + "step": 2833 + }, + { + "epoch": 0.08, + "grad_norm": 2.496568479430043, + "learning_rate": 9.928766598583428e-06, + "loss": 0.4947, + "step": 2834 + }, + { + "epoch": 0.08, + "grad_norm": 1.611896875247582, + "learning_rate": 9.92868757319154e-06, + "loss": 0.4397, + "step": 2835 + }, + { + "epoch": 0.08, + "grad_norm": 1.584885175804674, + "learning_rate": 9.928608504303929e-06, + "loss": 0.458, + "step": 2836 + }, + { + "epoch": 0.08, + "grad_norm": 1.4093945849144576, + "learning_rate": 9.928529391921295e-06, + "loss": 0.4582, + "step": 2837 + }, + { + "epoch": 0.08, + "grad_norm": 1.5747552041446202, + "learning_rate": 9.928450236044333e-06, + "loss": 0.4595, + "step": 2838 + }, + { + "epoch": 0.08, + "grad_norm": 1.616329357612732, + "learning_rate": 9.928371036673744e-06, + "loss": 0.4264, + "step": 2839 + }, + { + "epoch": 0.08, + "grad_norm": 1.7215984216139755, + "learning_rate": 9.928291793810223e-06, + "loss": 0.4621, + "step": 2840 + }, + { + "epoch": 0.08, + "grad_norm": 1.78034159602867, + "learning_rate": 9.928212507454474e-06, + "loss": 0.4144, + "step": 2841 + }, + { + "epoch": 0.08, + "grad_norm": 2.13862898193471, + "learning_rate": 9.928133177607196e-06, + "loss": 0.4573, + "step": 2842 + }, + { + "epoch": 0.08, + "grad_norm": 1.6836045714940737, + "learning_rate": 9.928053804269087e-06, + "loss": 0.4868, + "step": 2843 + }, + { + "epoch": 0.08, + "grad_norm": 1.75288081359617, + "learning_rate": 9.927974387440848e-06, + "loss": 0.4475, + "step": 2844 + }, + { + "epoch": 0.08, + "grad_norm": 1.4550963657305451, + "learning_rate": 9.92789492712318e-06, + "loss": 0.4461, + "step": 2845 + }, + { + "epoch": 0.08, + "grad_norm": 1.6144527648783147, + "learning_rate": 9.927815423316787e-06, + "loss": 0.4766, + "step": 2846 + }, + { + "epoch": 0.08, + "grad_norm": 1.664610360979763, + "learning_rate": 9.927735876022366e-06, + "loss": 0.4056, + "step": 2847 + }, + { + "epoch": 0.08, + "grad_norm": 1.6661891832818112, + "learning_rate": 9.927656285240623e-06, + "loss": 0.4539, + "step": 2848 + }, + { + "epoch": 0.08, + "grad_norm": 1.6216803692220594, + "learning_rate": 9.927576650972258e-06, + "loss": 0.4448, + "step": 2849 + }, + { + "epoch": 0.08, + "grad_norm": 1.6319190637613448, + "learning_rate": 9.927496973217973e-06, + "loss": 0.5093, + "step": 2850 + }, + { + "epoch": 0.08, + "grad_norm": 1.7486368462679929, + "learning_rate": 9.927417251978474e-06, + "loss": 0.4449, + "step": 2851 + }, + { + "epoch": 0.08, + "grad_norm": 2.0219295159695627, + "learning_rate": 9.927337487254463e-06, + "loss": 0.4747, + "step": 2852 + }, + { + "epoch": 0.08, + "grad_norm": 1.6071096883305158, + "learning_rate": 9.927257679046644e-06, + "loss": 0.4331, + "step": 2853 + }, + { + "epoch": 0.08, + "grad_norm": 1.4963944604785404, + "learning_rate": 9.92717782735572e-06, + "loss": 0.4594, + "step": 2854 + }, + { + "epoch": 0.08, + "grad_norm": 1.6829346816527158, + "learning_rate": 9.927097932182398e-06, + "loss": 0.4422, + "step": 2855 + }, + { + "epoch": 0.08, + "grad_norm": 1.6988027872324358, + "learning_rate": 9.927017993527383e-06, + "loss": 0.4306, + "step": 2856 + }, + { + "epoch": 0.08, + "grad_norm": 1.7006107537365047, + "learning_rate": 9.926938011391379e-06, + "loss": 0.4294, + "step": 2857 + }, + { + "epoch": 0.08, + "grad_norm": 1.6563068232323963, + "learning_rate": 9.926857985775091e-06, + "loss": 0.5034, + "step": 2858 + }, + { + "epoch": 0.08, + "grad_norm": 1.6797992551329368, + "learning_rate": 9.926777916679228e-06, + "loss": 0.4211, + "step": 2859 + }, + { + "epoch": 0.08, + "grad_norm": 1.1934049043194657, + "learning_rate": 9.926697804104495e-06, + "loss": 0.6375, + "step": 2860 + }, + { + "epoch": 0.08, + "grad_norm": 1.9981863962891022, + "learning_rate": 9.9266176480516e-06, + "loss": 0.4362, + "step": 2861 + }, + { + "epoch": 0.08, + "grad_norm": 1.7846595466593744, + "learning_rate": 9.926537448521249e-06, + "loss": 0.4568, + "step": 2862 + }, + { + "epoch": 0.08, + "grad_norm": 1.5137235923574865, + "learning_rate": 9.92645720551415e-06, + "loss": 0.4188, + "step": 2863 + }, + { + "epoch": 0.08, + "grad_norm": 1.8890154768452316, + "learning_rate": 9.926376919031012e-06, + "loss": 0.4229, + "step": 2864 + }, + { + "epoch": 0.08, + "grad_norm": 1.8070234266865837, + "learning_rate": 9.926296589072542e-06, + "loss": 0.4349, + "step": 2865 + }, + { + "epoch": 0.08, + "grad_norm": 1.7883247484289058, + "learning_rate": 9.92621621563945e-06, + "loss": 0.4927, + "step": 2866 + }, + { + "epoch": 0.08, + "grad_norm": 1.5911479239852455, + "learning_rate": 9.926135798732446e-06, + "loss": 0.4649, + "step": 2867 + }, + { + "epoch": 0.08, + "grad_norm": 1.5836232845889602, + "learning_rate": 9.926055338352239e-06, + "loss": 0.4531, + "step": 2868 + }, + { + "epoch": 0.08, + "grad_norm": 1.5271817022298657, + "learning_rate": 9.925974834499538e-06, + "loss": 0.4344, + "step": 2869 + }, + { + "epoch": 0.08, + "grad_norm": 1.7541093727200734, + "learning_rate": 9.925894287175057e-06, + "loss": 0.4808, + "step": 2870 + }, + { + "epoch": 0.08, + "grad_norm": 1.7128096461995352, + "learning_rate": 9.925813696379501e-06, + "loss": 0.4566, + "step": 2871 + }, + { + "epoch": 0.08, + "grad_norm": 3.3292015040271896, + "learning_rate": 9.925733062113588e-06, + "loss": 0.4517, + "step": 2872 + }, + { + "epoch": 0.08, + "grad_norm": 1.7274457222738646, + "learning_rate": 9.925652384378023e-06, + "loss": 0.4722, + "step": 2873 + }, + { + "epoch": 0.08, + "grad_norm": 1.8460377566629302, + "learning_rate": 9.925571663173522e-06, + "loss": 0.4978, + "step": 2874 + }, + { + "epoch": 0.08, + "grad_norm": 1.6652102625618403, + "learning_rate": 9.925490898500796e-06, + "loss": 0.5134, + "step": 2875 + }, + { + "epoch": 0.08, + "grad_norm": 1.8518385258386594, + "learning_rate": 9.925410090360559e-06, + "loss": 0.5144, + "step": 2876 + }, + { + "epoch": 0.08, + "grad_norm": 1.8821884736874435, + "learning_rate": 9.925329238753523e-06, + "loss": 0.4521, + "step": 2877 + }, + { + "epoch": 0.08, + "grad_norm": 1.623035195809778, + "learning_rate": 9.925248343680402e-06, + "loss": 0.4362, + "step": 2878 + }, + { + "epoch": 0.08, + "grad_norm": 1.861513412282714, + "learning_rate": 9.92516740514191e-06, + "loss": 0.4235, + "step": 2879 + }, + { + "epoch": 0.08, + "grad_norm": 1.5857342182216703, + "learning_rate": 9.92508642313876e-06, + "loss": 0.4444, + "step": 2880 + }, + { + "epoch": 0.08, + "grad_norm": 1.9738260762006274, + "learning_rate": 9.925005397671668e-06, + "loss": 0.468, + "step": 2881 + }, + { + "epoch": 0.08, + "grad_norm": 1.854808871961835, + "learning_rate": 9.92492432874135e-06, + "loss": 0.4733, + "step": 2882 + }, + { + "epoch": 0.08, + "grad_norm": 1.8756948065612729, + "learning_rate": 9.924843216348518e-06, + "loss": 0.4561, + "step": 2883 + }, + { + "epoch": 0.08, + "grad_norm": 1.5584382065079805, + "learning_rate": 9.924762060493893e-06, + "loss": 0.4586, + "step": 2884 + }, + { + "epoch": 0.08, + "grad_norm": 1.5231581713580062, + "learning_rate": 9.924680861178186e-06, + "loss": 0.4687, + "step": 2885 + }, + { + "epoch": 0.08, + "grad_norm": 1.5968988014781191, + "learning_rate": 9.924599618402116e-06, + "loss": 0.4511, + "step": 2886 + }, + { + "epoch": 0.08, + "grad_norm": 1.9367774837279044, + "learning_rate": 9.9245183321664e-06, + "loss": 0.503, + "step": 2887 + }, + { + "epoch": 0.08, + "grad_norm": 1.8565567078958218, + "learning_rate": 9.924437002471755e-06, + "loss": 0.469, + "step": 2888 + }, + { + "epoch": 0.08, + "grad_norm": 1.824020262484592, + "learning_rate": 9.924355629318898e-06, + "loss": 0.4489, + "step": 2889 + }, + { + "epoch": 0.08, + "grad_norm": 1.6163363691251935, + "learning_rate": 9.924274212708548e-06, + "loss": 0.4656, + "step": 2890 + }, + { + "epoch": 0.08, + "grad_norm": 1.9286382579423733, + "learning_rate": 9.924192752641426e-06, + "loss": 0.4627, + "step": 2891 + }, + { + "epoch": 0.08, + "grad_norm": 2.12327732918412, + "learning_rate": 9.924111249118247e-06, + "loss": 0.4466, + "step": 2892 + }, + { + "epoch": 0.08, + "grad_norm": 1.690478680898853, + "learning_rate": 9.92402970213973e-06, + "loss": 0.4687, + "step": 2893 + }, + { + "epoch": 0.08, + "grad_norm": 1.5963454408246371, + "learning_rate": 9.923948111706598e-06, + "loss": 0.4326, + "step": 2894 + }, + { + "epoch": 0.08, + "grad_norm": 1.6128398775576953, + "learning_rate": 9.923866477819568e-06, + "loss": 0.4563, + "step": 2895 + }, + { + "epoch": 0.08, + "grad_norm": 1.7248842635890778, + "learning_rate": 9.923784800479364e-06, + "loss": 0.4362, + "step": 2896 + }, + { + "epoch": 0.08, + "grad_norm": 1.6946956160194198, + "learning_rate": 9.923703079686703e-06, + "loss": 0.4465, + "step": 2897 + }, + { + "epoch": 0.08, + "grad_norm": 1.8539515294108058, + "learning_rate": 9.923621315442307e-06, + "loss": 0.4491, + "step": 2898 + }, + { + "epoch": 0.08, + "grad_norm": 1.6689991787718461, + "learning_rate": 9.923539507746898e-06, + "loss": 0.4275, + "step": 2899 + }, + { + "epoch": 0.08, + "grad_norm": 1.3041244972459252, + "learning_rate": 9.923457656601199e-06, + "loss": 0.63, + "step": 2900 + }, + { + "epoch": 0.08, + "grad_norm": 1.7525915916798038, + "learning_rate": 9.92337576200593e-06, + "loss": 0.5068, + "step": 2901 + }, + { + "epoch": 0.08, + "grad_norm": 2.6904419575642815, + "learning_rate": 9.923293823961819e-06, + "loss": 0.4498, + "step": 2902 + }, + { + "epoch": 0.08, + "grad_norm": 1.6232130868362402, + "learning_rate": 9.923211842469583e-06, + "loss": 0.4527, + "step": 2903 + }, + { + "epoch": 0.08, + "grad_norm": 1.804197680015541, + "learning_rate": 9.923129817529949e-06, + "loss": 0.4203, + "step": 2904 + }, + { + "epoch": 0.08, + "grad_norm": 1.5663408967387924, + "learning_rate": 9.923047749143638e-06, + "loss": 0.4894, + "step": 2905 + }, + { + "epoch": 0.08, + "grad_norm": 1.6129463971825388, + "learning_rate": 9.922965637311376e-06, + "loss": 0.4444, + "step": 2906 + }, + { + "epoch": 0.08, + "grad_norm": 1.6264261150031345, + "learning_rate": 9.92288348203389e-06, + "loss": 0.433, + "step": 2907 + }, + { + "epoch": 0.08, + "grad_norm": 1.4962454816636503, + "learning_rate": 9.922801283311901e-06, + "loss": 0.4622, + "step": 2908 + }, + { + "epoch": 0.08, + "grad_norm": 1.65104506092401, + "learning_rate": 9.922719041146136e-06, + "loss": 0.4615, + "step": 2909 + }, + { + "epoch": 0.08, + "grad_norm": 1.8408766437455084, + "learning_rate": 9.922636755537321e-06, + "loss": 0.4607, + "step": 2910 + }, + { + "epoch": 0.08, + "grad_norm": 1.7660952040781972, + "learning_rate": 9.922554426486183e-06, + "loss": 0.4802, + "step": 2911 + }, + { + "epoch": 0.08, + "grad_norm": 1.8549524568247842, + "learning_rate": 9.922472053993446e-06, + "loss": 0.4331, + "step": 2912 + }, + { + "epoch": 0.08, + "grad_norm": 1.6702168496411995, + "learning_rate": 9.92238963805984e-06, + "loss": 0.4551, + "step": 2913 + }, + { + "epoch": 0.08, + "grad_norm": 1.6660561671180836, + "learning_rate": 9.92230717868609e-06, + "loss": 0.4334, + "step": 2914 + }, + { + "epoch": 0.08, + "grad_norm": 1.738265916077561, + "learning_rate": 9.922224675872925e-06, + "loss": 0.4493, + "step": 2915 + }, + { + "epoch": 0.08, + "grad_norm": 1.5898078362613537, + "learning_rate": 9.922142129621072e-06, + "loss": 0.4081, + "step": 2916 + }, + { + "epoch": 0.08, + "grad_norm": 2.7126333516580154, + "learning_rate": 9.92205953993126e-06, + "loss": 0.416, + "step": 2917 + }, + { + "epoch": 0.08, + "grad_norm": 1.6324466611020398, + "learning_rate": 9.921976906804219e-06, + "loss": 0.4416, + "step": 2918 + }, + { + "epoch": 0.08, + "grad_norm": 1.676566166795685, + "learning_rate": 9.921894230240676e-06, + "loss": 0.4444, + "step": 2919 + }, + { + "epoch": 0.08, + "grad_norm": 1.5285302719333873, + "learning_rate": 9.921811510241362e-06, + "loss": 0.433, + "step": 2920 + }, + { + "epoch": 0.08, + "grad_norm": 1.6138120332879082, + "learning_rate": 9.921728746807008e-06, + "loss": 0.4725, + "step": 2921 + }, + { + "epoch": 0.08, + "grad_norm": 1.5665562913501436, + "learning_rate": 9.921645939938343e-06, + "loss": 0.4359, + "step": 2922 + }, + { + "epoch": 0.08, + "grad_norm": 1.633065965025926, + "learning_rate": 9.921563089636097e-06, + "loss": 0.4695, + "step": 2923 + }, + { + "epoch": 0.08, + "grad_norm": 1.554315401800247, + "learning_rate": 9.921480195901003e-06, + "loss": 0.4338, + "step": 2924 + }, + { + "epoch": 0.08, + "grad_norm": 1.8504887890140362, + "learning_rate": 9.921397258733791e-06, + "loss": 0.4747, + "step": 2925 + }, + { + "epoch": 0.08, + "grad_norm": 1.824650991006405, + "learning_rate": 9.921314278135194e-06, + "loss": 0.4423, + "step": 2926 + }, + { + "epoch": 0.08, + "grad_norm": 1.6610077174607623, + "learning_rate": 9.921231254105943e-06, + "loss": 0.481, + "step": 2927 + }, + { + "epoch": 0.08, + "grad_norm": 1.6274589160700779, + "learning_rate": 9.921148186646771e-06, + "loss": 0.4449, + "step": 2928 + }, + { + "epoch": 0.08, + "grad_norm": 2.129239122185402, + "learning_rate": 9.921065075758413e-06, + "loss": 0.4121, + "step": 2929 + }, + { + "epoch": 0.08, + "grad_norm": 1.5492457345655366, + "learning_rate": 9.920981921441602e-06, + "loss": 0.5289, + "step": 2930 + }, + { + "epoch": 0.09, + "grad_norm": 1.4906362705745848, + "learning_rate": 9.92089872369707e-06, + "loss": 0.475, + "step": 2931 + }, + { + "epoch": 0.09, + "grad_norm": 1.4643359533049778, + "learning_rate": 9.920815482525552e-06, + "loss": 0.388, + "step": 2932 + }, + { + "epoch": 0.09, + "grad_norm": 1.5260356342111159, + "learning_rate": 9.920732197927782e-06, + "loss": 0.4102, + "step": 2933 + }, + { + "epoch": 0.09, + "grad_norm": 1.8259037464444892, + "learning_rate": 9.920648869904498e-06, + "loss": 0.4607, + "step": 2934 + }, + { + "epoch": 0.09, + "grad_norm": 1.6900053591849915, + "learning_rate": 9.920565498456432e-06, + "loss": 0.4676, + "step": 2935 + }, + { + "epoch": 0.09, + "grad_norm": 1.693305158860666, + "learning_rate": 9.92048208358432e-06, + "loss": 0.4698, + "step": 2936 + }, + { + "epoch": 0.09, + "grad_norm": 1.6538751727452765, + "learning_rate": 9.9203986252889e-06, + "loss": 0.4287, + "step": 2937 + }, + { + "epoch": 0.09, + "grad_norm": 1.583869427944705, + "learning_rate": 9.920315123570907e-06, + "loss": 0.4441, + "step": 2938 + }, + { + "epoch": 0.09, + "grad_norm": 1.892916283586095, + "learning_rate": 9.920231578431078e-06, + "loss": 0.4684, + "step": 2939 + }, + { + "epoch": 0.09, + "grad_norm": 1.595058320469888, + "learning_rate": 9.920147989870153e-06, + "loss": 0.4469, + "step": 2940 + }, + { + "epoch": 0.09, + "grad_norm": 1.569032799057301, + "learning_rate": 9.920064357888866e-06, + "loss": 0.4826, + "step": 2941 + }, + { + "epoch": 0.09, + "grad_norm": 1.5201748553457053, + "learning_rate": 9.919980682487956e-06, + "loss": 0.4635, + "step": 2942 + }, + { + "epoch": 0.09, + "grad_norm": 1.5749500038574447, + "learning_rate": 9.919896963668163e-06, + "loss": 0.4819, + "step": 2943 + }, + { + "epoch": 0.09, + "grad_norm": 1.5372567754208633, + "learning_rate": 9.919813201430224e-06, + "loss": 0.4244, + "step": 2944 + }, + { + "epoch": 0.09, + "grad_norm": 1.6869478749714464, + "learning_rate": 9.91972939577488e-06, + "loss": 0.4534, + "step": 2945 + }, + { + "epoch": 0.09, + "grad_norm": 1.5078894989481486, + "learning_rate": 9.919645546702868e-06, + "loss": 0.4302, + "step": 2946 + }, + { + "epoch": 0.09, + "grad_norm": 1.6866387883772003, + "learning_rate": 9.919561654214929e-06, + "loss": 0.4753, + "step": 2947 + }, + { + "epoch": 0.09, + "grad_norm": 1.558897852112669, + "learning_rate": 9.919477718311806e-06, + "loss": 0.424, + "step": 2948 + }, + { + "epoch": 0.09, + "grad_norm": 1.6946306960977464, + "learning_rate": 9.919393738994235e-06, + "loss": 0.4003, + "step": 2949 + }, + { + "epoch": 0.09, + "grad_norm": 1.5718037398015214, + "learning_rate": 9.91930971626296e-06, + "loss": 0.422, + "step": 2950 + }, + { + "epoch": 0.09, + "grad_norm": 1.5919485438318628, + "learning_rate": 9.919225650118724e-06, + "loss": 0.4331, + "step": 2951 + }, + { + "epoch": 0.09, + "grad_norm": 1.5891674853522348, + "learning_rate": 9.919141540562266e-06, + "loss": 0.3996, + "step": 2952 + }, + { + "epoch": 0.09, + "grad_norm": 1.6597307189766022, + "learning_rate": 9.91905738759433e-06, + "loss": 0.4267, + "step": 2953 + }, + { + "epoch": 0.09, + "grad_norm": 1.7749674215669589, + "learning_rate": 9.918973191215657e-06, + "loss": 0.4914, + "step": 2954 + }, + { + "epoch": 0.09, + "grad_norm": 1.5199788906348266, + "learning_rate": 9.918888951426993e-06, + "loss": 0.4951, + "step": 2955 + }, + { + "epoch": 0.09, + "grad_norm": 1.87665334038132, + "learning_rate": 9.918804668229078e-06, + "loss": 0.4652, + "step": 2956 + }, + { + "epoch": 0.09, + "grad_norm": 1.7317888991496722, + "learning_rate": 9.918720341622657e-06, + "loss": 0.472, + "step": 2957 + }, + { + "epoch": 0.09, + "grad_norm": 1.8043570151179562, + "learning_rate": 9.918635971608474e-06, + "loss": 0.4365, + "step": 2958 + }, + { + "epoch": 0.09, + "grad_norm": 1.9980342789372658, + "learning_rate": 9.918551558187275e-06, + "loss": 0.4703, + "step": 2959 + }, + { + "epoch": 0.09, + "grad_norm": 1.5858715357092024, + "learning_rate": 9.918467101359803e-06, + "loss": 0.4399, + "step": 2960 + }, + { + "epoch": 0.09, + "grad_norm": 1.479886537917235, + "learning_rate": 9.918382601126805e-06, + "loss": 0.4437, + "step": 2961 + }, + { + "epoch": 0.09, + "grad_norm": 1.7017736893675428, + "learning_rate": 9.918298057489026e-06, + "loss": 0.4588, + "step": 2962 + }, + { + "epoch": 0.09, + "grad_norm": 1.5331702278165489, + "learning_rate": 9.918213470447211e-06, + "loss": 0.4246, + "step": 2963 + }, + { + "epoch": 0.09, + "grad_norm": 1.735854045714866, + "learning_rate": 9.918128840002109e-06, + "loss": 0.4655, + "step": 2964 + }, + { + "epoch": 0.09, + "grad_norm": 1.6765865990212054, + "learning_rate": 9.918044166154465e-06, + "loss": 0.4561, + "step": 2965 + }, + { + "epoch": 0.09, + "grad_norm": 1.6932628625277744, + "learning_rate": 9.917959448905028e-06, + "loss": 0.528, + "step": 2966 + }, + { + "epoch": 0.09, + "grad_norm": 1.717651265029366, + "learning_rate": 9.917874688254542e-06, + "loss": 0.4535, + "step": 2967 + }, + { + "epoch": 0.09, + "grad_norm": 1.6387292279515269, + "learning_rate": 9.917789884203758e-06, + "loss": 0.4583, + "step": 2968 + }, + { + "epoch": 0.09, + "grad_norm": 1.6725821523720192, + "learning_rate": 9.917705036753424e-06, + "loss": 0.4745, + "step": 2969 + }, + { + "epoch": 0.09, + "grad_norm": 1.7604223005248505, + "learning_rate": 9.917620145904288e-06, + "loss": 0.4582, + "step": 2970 + }, + { + "epoch": 0.09, + "grad_norm": 1.9494111999501655, + "learning_rate": 9.9175352116571e-06, + "loss": 0.5091, + "step": 2971 + }, + { + "epoch": 0.09, + "grad_norm": 1.446168486439141, + "learning_rate": 9.917450234012609e-06, + "loss": 0.3915, + "step": 2972 + }, + { + "epoch": 0.09, + "grad_norm": 1.5508527864026311, + "learning_rate": 9.917365212971566e-06, + "loss": 0.4609, + "step": 2973 + }, + { + "epoch": 0.09, + "grad_norm": 1.5830733928456135, + "learning_rate": 9.91728014853472e-06, + "loss": 0.4377, + "step": 2974 + }, + { + "epoch": 0.09, + "grad_norm": 1.5790902334204788, + "learning_rate": 9.917195040702822e-06, + "loss": 0.4244, + "step": 2975 + }, + { + "epoch": 0.09, + "grad_norm": 1.850561827796478, + "learning_rate": 9.917109889476623e-06, + "loss": 0.454, + "step": 2976 + }, + { + "epoch": 0.09, + "grad_norm": 1.5929874318895458, + "learning_rate": 9.917024694856876e-06, + "loss": 0.4361, + "step": 2977 + }, + { + "epoch": 0.09, + "grad_norm": 1.5996386232462267, + "learning_rate": 9.91693945684433e-06, + "loss": 0.4403, + "step": 2978 + }, + { + "epoch": 0.09, + "grad_norm": 2.817684136683929, + "learning_rate": 9.916854175439738e-06, + "loss": 0.4427, + "step": 2979 + }, + { + "epoch": 0.09, + "grad_norm": 1.5002768322288405, + "learning_rate": 9.916768850643854e-06, + "loss": 0.445, + "step": 2980 + }, + { + "epoch": 0.09, + "grad_norm": 1.5712530635685527, + "learning_rate": 9.916683482457432e-06, + "loss": 0.454, + "step": 2981 + }, + { + "epoch": 0.09, + "grad_norm": 1.7011882617827305, + "learning_rate": 9.916598070881222e-06, + "loss": 0.4609, + "step": 2982 + }, + { + "epoch": 0.09, + "grad_norm": 1.5843871115798855, + "learning_rate": 9.916512615915981e-06, + "loss": 0.445, + "step": 2983 + }, + { + "epoch": 0.09, + "grad_norm": 1.5669316329144827, + "learning_rate": 9.91642711756246e-06, + "loss": 0.4264, + "step": 2984 + }, + { + "epoch": 0.09, + "grad_norm": 1.0281857667616485, + "learning_rate": 9.916341575821416e-06, + "loss": 0.5714, + "step": 2985 + }, + { + "epoch": 0.09, + "grad_norm": 1.8031429112016315, + "learning_rate": 9.916255990693603e-06, + "loss": 0.434, + "step": 2986 + }, + { + "epoch": 0.09, + "grad_norm": 1.7606156549005707, + "learning_rate": 9.916170362179775e-06, + "loss": 0.4436, + "step": 2987 + }, + { + "epoch": 0.09, + "grad_norm": 1.6380313613523718, + "learning_rate": 9.916084690280692e-06, + "loss": 0.4739, + "step": 2988 + }, + { + "epoch": 0.09, + "grad_norm": 1.5853594156486954, + "learning_rate": 9.915998974997105e-06, + "loss": 0.4301, + "step": 2989 + }, + { + "epoch": 0.09, + "grad_norm": 1.5859780361041016, + "learning_rate": 9.915913216329774e-06, + "loss": 0.4212, + "step": 2990 + }, + { + "epoch": 0.09, + "grad_norm": 2.063133073861592, + "learning_rate": 9.915827414279452e-06, + "loss": 0.4883, + "step": 2991 + }, + { + "epoch": 0.09, + "grad_norm": 1.608211160588078, + "learning_rate": 9.9157415688469e-06, + "loss": 0.4413, + "step": 2992 + }, + { + "epoch": 0.09, + "grad_norm": 1.6471836204317905, + "learning_rate": 9.915655680032874e-06, + "loss": 0.4515, + "step": 2993 + }, + { + "epoch": 0.09, + "grad_norm": 1.724720188203945, + "learning_rate": 9.915569747838132e-06, + "loss": 0.4794, + "step": 2994 + }, + { + "epoch": 0.09, + "grad_norm": 1.6907793379608982, + "learning_rate": 9.915483772263432e-06, + "loss": 0.4293, + "step": 2995 + }, + { + "epoch": 0.09, + "grad_norm": 1.4809535901628703, + "learning_rate": 9.915397753309533e-06, + "loss": 0.4505, + "step": 2996 + }, + { + "epoch": 0.09, + "grad_norm": 1.9762496966965872, + "learning_rate": 9.915311690977194e-06, + "loss": 0.4291, + "step": 2997 + }, + { + "epoch": 0.09, + "grad_norm": 1.544206472881016, + "learning_rate": 9.915225585267176e-06, + "loss": 0.4368, + "step": 2998 + }, + { + "epoch": 0.09, + "grad_norm": 1.6170496285416474, + "learning_rate": 9.915139436180237e-06, + "loss": 0.4481, + "step": 2999 + }, + { + "epoch": 0.09, + "grad_norm": 1.6236986750459732, + "learning_rate": 9.915053243717138e-06, + "loss": 0.453, + "step": 3000 + }, + { + "epoch": 0.09, + "grad_norm": 1.5630934755290995, + "learning_rate": 9.914967007878642e-06, + "loss": 0.51, + "step": 3001 + }, + { + "epoch": 0.09, + "grad_norm": 1.7594894224740933, + "learning_rate": 9.914880728665505e-06, + "loss": 0.4655, + "step": 3002 + }, + { + "epoch": 0.09, + "grad_norm": 1.519371656185153, + "learning_rate": 9.914794406078492e-06, + "loss": 0.4377, + "step": 3003 + }, + { + "epoch": 0.09, + "grad_norm": 1.5372317861947071, + "learning_rate": 9.914708040118362e-06, + "loss": 0.4224, + "step": 3004 + }, + { + "epoch": 0.09, + "grad_norm": 2.051618881343814, + "learning_rate": 9.91462163078588e-06, + "loss": 0.4872, + "step": 3005 + }, + { + "epoch": 0.09, + "grad_norm": 1.637450704455996, + "learning_rate": 9.914535178081808e-06, + "loss": 0.5006, + "step": 3006 + }, + { + "epoch": 0.09, + "grad_norm": 1.6570400121790405, + "learning_rate": 9.91444868200691e-06, + "loss": 0.5283, + "step": 3007 + }, + { + "epoch": 0.09, + "grad_norm": 1.9491284438020409, + "learning_rate": 9.914362142561946e-06, + "loss": 0.4497, + "step": 3008 + }, + { + "epoch": 0.09, + "grad_norm": 1.738163700246122, + "learning_rate": 9.914275559747682e-06, + "loss": 0.4649, + "step": 3009 + }, + { + "epoch": 0.09, + "grad_norm": 1.999312905635789, + "learning_rate": 9.91418893356488e-06, + "loss": 0.4344, + "step": 3010 + }, + { + "epoch": 0.09, + "grad_norm": 1.6665223212500349, + "learning_rate": 9.91410226401431e-06, + "loss": 0.4187, + "step": 3011 + }, + { + "epoch": 0.09, + "grad_norm": 1.620350608987971, + "learning_rate": 9.91401555109673e-06, + "loss": 0.4719, + "step": 3012 + }, + { + "epoch": 0.09, + "grad_norm": 1.602279471563705, + "learning_rate": 9.913928794812909e-06, + "loss": 0.4865, + "step": 3013 + }, + { + "epoch": 0.09, + "grad_norm": 2.719212301257217, + "learning_rate": 9.913841995163613e-06, + "loss": 0.4453, + "step": 3014 + }, + { + "epoch": 0.09, + "grad_norm": 1.6110837201576107, + "learning_rate": 9.913755152149604e-06, + "loss": 0.4073, + "step": 3015 + }, + { + "epoch": 0.09, + "grad_norm": 1.693969796964214, + "learning_rate": 9.913668265771655e-06, + "loss": 0.423, + "step": 3016 + }, + { + "epoch": 0.09, + "grad_norm": 1.5948173337139768, + "learning_rate": 9.913581336030527e-06, + "loss": 0.451, + "step": 3017 + }, + { + "epoch": 0.09, + "grad_norm": 1.7015869922522784, + "learning_rate": 9.913494362926988e-06, + "loss": 0.4353, + "step": 3018 + }, + { + "epoch": 0.09, + "grad_norm": 1.6547356304060241, + "learning_rate": 9.913407346461808e-06, + "loss": 0.4806, + "step": 3019 + }, + { + "epoch": 0.09, + "grad_norm": 1.2899229487175392, + "learning_rate": 9.913320286635755e-06, + "loss": 0.6574, + "step": 3020 + }, + { + "epoch": 0.09, + "grad_norm": 1.6385566888537402, + "learning_rate": 9.913233183449595e-06, + "loss": 0.4237, + "step": 3021 + }, + { + "epoch": 0.09, + "grad_norm": 1.8191404459697216, + "learning_rate": 9.913146036904096e-06, + "loss": 0.4511, + "step": 3022 + }, + { + "epoch": 0.09, + "grad_norm": 1.9243232496280769, + "learning_rate": 9.91305884700003e-06, + "loss": 0.4441, + "step": 3023 + }, + { + "epoch": 0.09, + "grad_norm": 1.654972156793858, + "learning_rate": 9.912971613738166e-06, + "loss": 0.4573, + "step": 3024 + }, + { + "epoch": 0.09, + "grad_norm": 1.5110730333238882, + "learning_rate": 9.912884337119273e-06, + "loss": 0.4855, + "step": 3025 + }, + { + "epoch": 0.09, + "grad_norm": 1.7022757639760573, + "learning_rate": 9.91279701714412e-06, + "loss": 0.4201, + "step": 3026 + }, + { + "epoch": 0.09, + "grad_norm": 1.5297543963107114, + "learning_rate": 9.912709653813479e-06, + "loss": 0.4778, + "step": 3027 + }, + { + "epoch": 0.09, + "grad_norm": 1.8230457495073888, + "learning_rate": 9.912622247128121e-06, + "loss": 0.4566, + "step": 3028 + }, + { + "epoch": 0.09, + "grad_norm": 1.1008608294243496, + "learning_rate": 9.912534797088817e-06, + "loss": 0.6342, + "step": 3029 + }, + { + "epoch": 0.09, + "grad_norm": 2.234671055475007, + "learning_rate": 9.91244730369634e-06, + "loss": 0.4438, + "step": 3030 + }, + { + "epoch": 0.09, + "grad_norm": 1.6317518517904064, + "learning_rate": 9.91235976695146e-06, + "loss": 0.4482, + "step": 3031 + }, + { + "epoch": 0.09, + "grad_norm": 1.6920730556015853, + "learning_rate": 9.91227218685495e-06, + "loss": 0.427, + "step": 3032 + }, + { + "epoch": 0.09, + "grad_norm": 1.7232078230398646, + "learning_rate": 9.912184563407586e-06, + "loss": 0.4359, + "step": 3033 + }, + { + "epoch": 0.09, + "grad_norm": 1.736725870029985, + "learning_rate": 9.912096896610135e-06, + "loss": 0.474, + "step": 3034 + }, + { + "epoch": 0.09, + "grad_norm": 2.0187629008433827, + "learning_rate": 9.912009186463377e-06, + "loss": 0.421, + "step": 3035 + }, + { + "epoch": 0.09, + "grad_norm": 1.8796415403114783, + "learning_rate": 9.911921432968084e-06, + "loss": 0.4571, + "step": 3036 + }, + { + "epoch": 0.09, + "grad_norm": 1.8423111597943154, + "learning_rate": 9.911833636125027e-06, + "loss": 0.4306, + "step": 3037 + }, + { + "epoch": 0.09, + "grad_norm": 1.829170838593278, + "learning_rate": 9.911745795934987e-06, + "loss": 0.4627, + "step": 3038 + }, + { + "epoch": 0.09, + "grad_norm": 1.8426882308386772, + "learning_rate": 9.911657912398733e-06, + "loss": 0.4404, + "step": 3039 + }, + { + "epoch": 0.09, + "grad_norm": 1.4877226649512736, + "learning_rate": 9.911569985517044e-06, + "loss": 0.4026, + "step": 3040 + }, + { + "epoch": 0.09, + "grad_norm": 1.669781959816963, + "learning_rate": 9.911482015290697e-06, + "loss": 0.4354, + "step": 3041 + }, + { + "epoch": 0.09, + "grad_norm": 1.556523892131949, + "learning_rate": 9.911394001720466e-06, + "loss": 0.4727, + "step": 3042 + }, + { + "epoch": 0.09, + "grad_norm": 1.517729603292076, + "learning_rate": 9.911305944807127e-06, + "loss": 0.422, + "step": 3043 + }, + { + "epoch": 0.09, + "grad_norm": 1.4861829732178498, + "learning_rate": 9.91121784455146e-06, + "loss": 0.4384, + "step": 3044 + }, + { + "epoch": 0.09, + "grad_norm": 2.0885434215251135, + "learning_rate": 9.911129700954242e-06, + "loss": 0.4359, + "step": 3045 + }, + { + "epoch": 0.09, + "grad_norm": 1.6635588060174562, + "learning_rate": 9.911041514016248e-06, + "loss": 0.5066, + "step": 3046 + }, + { + "epoch": 0.09, + "grad_norm": 1.4496355188093566, + "learning_rate": 9.91095328373826e-06, + "loss": 0.4553, + "step": 3047 + }, + { + "epoch": 0.09, + "grad_norm": 1.5178208526970853, + "learning_rate": 9.910865010121054e-06, + "loss": 0.4744, + "step": 3048 + }, + { + "epoch": 0.09, + "grad_norm": 1.6364277222498937, + "learning_rate": 9.910776693165409e-06, + "loss": 0.4425, + "step": 3049 + }, + { + "epoch": 0.09, + "grad_norm": 2.046935148347621, + "learning_rate": 9.910688332872105e-06, + "loss": 0.4305, + "step": 3050 + }, + { + "epoch": 0.09, + "grad_norm": 1.7577874703215934, + "learning_rate": 9.910599929241923e-06, + "loss": 0.4619, + "step": 3051 + }, + { + "epoch": 0.09, + "grad_norm": 1.6559441850550893, + "learning_rate": 9.910511482275642e-06, + "loss": 0.4658, + "step": 3052 + }, + { + "epoch": 0.09, + "grad_norm": 1.634413801960893, + "learning_rate": 9.910422991974043e-06, + "loss": 0.4555, + "step": 3053 + }, + { + "epoch": 0.09, + "grad_norm": 2.733301052999422, + "learning_rate": 9.910334458337905e-06, + "loss": 0.4373, + "step": 3054 + }, + { + "epoch": 0.09, + "grad_norm": 1.5747921594963108, + "learning_rate": 9.910245881368012e-06, + "loss": 0.4271, + "step": 3055 + }, + { + "epoch": 0.09, + "grad_norm": 1.8438457871688925, + "learning_rate": 9.910157261065146e-06, + "loss": 0.4517, + "step": 3056 + }, + { + "epoch": 0.09, + "grad_norm": 1.7196698953201643, + "learning_rate": 9.910068597430084e-06, + "loss": 0.4388, + "step": 3057 + }, + { + "epoch": 0.09, + "grad_norm": 1.4847734019424406, + "learning_rate": 9.909979890463617e-06, + "loss": 0.4832, + "step": 3058 + }, + { + "epoch": 0.09, + "grad_norm": 1.0429116946269616, + "learning_rate": 9.90989114016652e-06, + "loss": 0.6455, + "step": 3059 + }, + { + "epoch": 0.09, + "grad_norm": 1.6932450539322128, + "learning_rate": 9.909802346539578e-06, + "loss": 0.5052, + "step": 3060 + }, + { + "epoch": 0.09, + "grad_norm": 1.6970905429207732, + "learning_rate": 9.909713509583578e-06, + "loss": 0.477, + "step": 3061 + }, + { + "epoch": 0.09, + "grad_norm": 1.7368239186136385, + "learning_rate": 9.909624629299303e-06, + "loss": 0.4388, + "step": 3062 + }, + { + "epoch": 0.09, + "grad_norm": 1.7307388384253164, + "learning_rate": 9.909535705687534e-06, + "loss": 0.4539, + "step": 3063 + }, + { + "epoch": 0.09, + "grad_norm": 1.7722538780424835, + "learning_rate": 9.909446738749058e-06, + "loss": 0.4384, + "step": 3064 + }, + { + "epoch": 0.09, + "grad_norm": 1.6456233240376923, + "learning_rate": 9.909357728484662e-06, + "loss": 0.4279, + "step": 3065 + }, + { + "epoch": 0.09, + "grad_norm": 1.7865595010865396, + "learning_rate": 9.909268674895126e-06, + "loss": 0.4122, + "step": 3066 + }, + { + "epoch": 0.09, + "grad_norm": 1.7491001471145542, + "learning_rate": 9.909179577981243e-06, + "loss": 0.4181, + "step": 3067 + }, + { + "epoch": 0.09, + "grad_norm": 1.770887780156519, + "learning_rate": 9.909090437743795e-06, + "loss": 0.3986, + "step": 3068 + }, + { + "epoch": 0.09, + "grad_norm": 1.0235700267954693, + "learning_rate": 9.909001254183568e-06, + "loss": 0.6031, + "step": 3069 + }, + { + "epoch": 0.09, + "grad_norm": 1.7285088266286937, + "learning_rate": 9.908912027301351e-06, + "loss": 0.42, + "step": 3070 + }, + { + "epoch": 0.09, + "grad_norm": 1.9370646565025582, + "learning_rate": 9.90882275709793e-06, + "loss": 0.4297, + "step": 3071 + }, + { + "epoch": 0.09, + "grad_norm": 1.643217155562944, + "learning_rate": 9.908733443574096e-06, + "loss": 0.4787, + "step": 3072 + }, + { + "epoch": 0.09, + "grad_norm": 1.7522356408521595, + "learning_rate": 9.908644086730635e-06, + "loss": 0.454, + "step": 3073 + }, + { + "epoch": 0.09, + "grad_norm": 1.7333047743736076, + "learning_rate": 9.908554686568335e-06, + "loss": 0.4504, + "step": 3074 + }, + { + "epoch": 0.09, + "grad_norm": 1.606317028747602, + "learning_rate": 9.908465243087984e-06, + "loss": 0.4623, + "step": 3075 + }, + { + "epoch": 0.09, + "grad_norm": 1.5719433991124239, + "learning_rate": 9.908375756290373e-06, + "loss": 0.4599, + "step": 3076 + }, + { + "epoch": 0.09, + "grad_norm": 1.9570873623454665, + "learning_rate": 9.908286226176292e-06, + "loss": 0.4363, + "step": 3077 + }, + { + "epoch": 0.09, + "grad_norm": 1.7541285217995326, + "learning_rate": 9.90819665274653e-06, + "loss": 0.4399, + "step": 3078 + }, + { + "epoch": 0.09, + "grad_norm": 1.0132119845051188, + "learning_rate": 9.90810703600188e-06, + "loss": 0.6111, + "step": 3079 + }, + { + "epoch": 0.09, + "grad_norm": 1.7323505032174653, + "learning_rate": 9.90801737594313e-06, + "loss": 0.4305, + "step": 3080 + }, + { + "epoch": 0.09, + "grad_norm": 1.5483684144320986, + "learning_rate": 9.907927672571071e-06, + "loss": 0.4236, + "step": 3081 + }, + { + "epoch": 0.09, + "grad_norm": 1.4452164947411608, + "learning_rate": 9.907837925886498e-06, + "loss": 0.4369, + "step": 3082 + }, + { + "epoch": 0.09, + "grad_norm": 1.8620158162952363, + "learning_rate": 9.907748135890198e-06, + "loss": 0.4503, + "step": 3083 + }, + { + "epoch": 0.09, + "grad_norm": 1.6284000851153684, + "learning_rate": 9.90765830258297e-06, + "loss": 0.4282, + "step": 3084 + }, + { + "epoch": 0.09, + "grad_norm": 1.6340224545193769, + "learning_rate": 9.907568425965599e-06, + "loss": 0.5137, + "step": 3085 + }, + { + "epoch": 0.09, + "grad_norm": 2.022668952934321, + "learning_rate": 9.907478506038885e-06, + "loss": 0.475, + "step": 3086 + }, + { + "epoch": 0.09, + "grad_norm": 1.6555608762557286, + "learning_rate": 9.907388542803617e-06, + "loss": 0.4887, + "step": 3087 + }, + { + "epoch": 0.09, + "grad_norm": 2.8583088564679433, + "learning_rate": 9.907298536260593e-06, + "loss": 0.4203, + "step": 3088 + }, + { + "epoch": 0.09, + "grad_norm": 1.7537332347971855, + "learning_rate": 9.907208486410602e-06, + "loss": 0.4204, + "step": 3089 + }, + { + "epoch": 0.09, + "grad_norm": 1.5755180328390714, + "learning_rate": 9.907118393254444e-06, + "loss": 0.474, + "step": 3090 + }, + { + "epoch": 0.09, + "grad_norm": 1.8093741519642124, + "learning_rate": 9.90702825679291e-06, + "loss": 0.4473, + "step": 3091 + }, + { + "epoch": 0.09, + "grad_norm": 1.576951443232889, + "learning_rate": 9.9069380770268e-06, + "loss": 0.4124, + "step": 3092 + }, + { + "epoch": 0.09, + "grad_norm": 3.1774331264706412, + "learning_rate": 9.906847853956905e-06, + "loss": 0.4411, + "step": 3093 + }, + { + "epoch": 0.09, + "grad_norm": 2.2841786897229848, + "learning_rate": 9.906757587584023e-06, + "loss": 0.3924, + "step": 3094 + }, + { + "epoch": 0.09, + "grad_norm": 1.6967235685512025, + "learning_rate": 9.906667277908951e-06, + "loss": 0.4451, + "step": 3095 + }, + { + "epoch": 0.09, + "grad_norm": 1.9864204930541207, + "learning_rate": 9.906576924932486e-06, + "loss": 0.4564, + "step": 3096 + }, + { + "epoch": 0.09, + "grad_norm": 1.8119112974666303, + "learning_rate": 9.906486528655425e-06, + "loss": 0.47, + "step": 3097 + }, + { + "epoch": 0.09, + "grad_norm": 1.9432759825172292, + "learning_rate": 9.906396089078567e-06, + "loss": 0.4816, + "step": 3098 + }, + { + "epoch": 0.09, + "grad_norm": 1.1574576030427226, + "learning_rate": 9.906305606202709e-06, + "loss": 0.6001, + "step": 3099 + }, + { + "epoch": 0.09, + "grad_norm": 2.3941757922448894, + "learning_rate": 9.906215080028648e-06, + "loss": 0.4533, + "step": 3100 + }, + { + "epoch": 0.09, + "grad_norm": 2.242239660622625, + "learning_rate": 9.906124510557185e-06, + "loss": 0.435, + "step": 3101 + }, + { + "epoch": 0.09, + "grad_norm": 2.306794916040203, + "learning_rate": 9.90603389778912e-06, + "loss": 0.4359, + "step": 3102 + }, + { + "epoch": 0.09, + "grad_norm": 1.6279971733518595, + "learning_rate": 9.905943241725252e-06, + "loss": 0.4202, + "step": 3103 + }, + { + "epoch": 0.09, + "grad_norm": 4.284969431949516, + "learning_rate": 9.905852542366377e-06, + "loss": 0.4503, + "step": 3104 + }, + { + "epoch": 0.09, + "grad_norm": 1.7370979500407229, + "learning_rate": 9.905761799713302e-06, + "loss": 0.4503, + "step": 3105 + }, + { + "epoch": 0.09, + "grad_norm": 1.6398674019279975, + "learning_rate": 9.905671013766823e-06, + "loss": 0.4914, + "step": 3106 + }, + { + "epoch": 0.09, + "grad_norm": 1.8187561340883016, + "learning_rate": 9.905580184527743e-06, + "loss": 0.4618, + "step": 3107 + }, + { + "epoch": 0.09, + "grad_norm": 1.5615650696962604, + "learning_rate": 9.905489311996865e-06, + "loss": 0.447, + "step": 3108 + }, + { + "epoch": 0.09, + "grad_norm": 1.6643776278874283, + "learning_rate": 9.905398396174988e-06, + "loss": 0.4573, + "step": 3109 + }, + { + "epoch": 0.09, + "grad_norm": 1.6377936487438014, + "learning_rate": 9.905307437062916e-06, + "loss": 0.4361, + "step": 3110 + }, + { + "epoch": 0.09, + "grad_norm": 1.5598086310232797, + "learning_rate": 9.905216434661452e-06, + "loss": 0.402, + "step": 3111 + }, + { + "epoch": 0.09, + "grad_norm": 1.5658127184832584, + "learning_rate": 9.905125388971398e-06, + "loss": 0.4108, + "step": 3112 + }, + { + "epoch": 0.09, + "grad_norm": 1.5341363144511235, + "learning_rate": 9.90503429999356e-06, + "loss": 0.4053, + "step": 3113 + }, + { + "epoch": 0.09, + "grad_norm": 2.0046883632066557, + "learning_rate": 9.904943167728737e-06, + "loss": 0.4628, + "step": 3114 + }, + { + "epoch": 0.09, + "grad_norm": 1.7069822177679572, + "learning_rate": 9.904851992177738e-06, + "loss": 0.4351, + "step": 3115 + }, + { + "epoch": 0.09, + "grad_norm": 1.1269013597775501, + "learning_rate": 9.904760773341365e-06, + "loss": 0.6349, + "step": 3116 + }, + { + "epoch": 0.09, + "grad_norm": 1.642156620036515, + "learning_rate": 9.904669511220422e-06, + "loss": 0.4159, + "step": 3117 + }, + { + "epoch": 0.09, + "grad_norm": 1.5953334954238974, + "learning_rate": 9.904578205815718e-06, + "loss": 0.4709, + "step": 3118 + }, + { + "epoch": 0.09, + "grad_norm": 1.6468232713631674, + "learning_rate": 9.904486857128057e-06, + "loss": 0.455, + "step": 3119 + }, + { + "epoch": 0.09, + "grad_norm": 1.8206723612070699, + "learning_rate": 9.904395465158246e-06, + "loss": 0.4788, + "step": 3120 + }, + { + "epoch": 0.09, + "grad_norm": 2.316190022704612, + "learning_rate": 9.904304029907089e-06, + "loss": 0.4471, + "step": 3121 + }, + { + "epoch": 0.09, + "grad_norm": 1.6085342378441823, + "learning_rate": 9.904212551375394e-06, + "loss": 0.4335, + "step": 3122 + }, + { + "epoch": 0.09, + "grad_norm": 1.5243796424889169, + "learning_rate": 9.90412102956397e-06, + "loss": 0.4699, + "step": 3123 + }, + { + "epoch": 0.09, + "grad_norm": 1.5095162606240362, + "learning_rate": 9.904029464473623e-06, + "loss": 0.419, + "step": 3124 + }, + { + "epoch": 0.09, + "grad_norm": 1.6419553817690629, + "learning_rate": 9.903937856105163e-06, + "loss": 0.4454, + "step": 3125 + }, + { + "epoch": 0.09, + "grad_norm": 1.5855707249885829, + "learning_rate": 9.903846204459395e-06, + "loss": 0.4578, + "step": 3126 + }, + { + "epoch": 0.09, + "grad_norm": 1.5332208511870193, + "learning_rate": 9.90375450953713e-06, + "loss": 0.4336, + "step": 3127 + }, + { + "epoch": 0.09, + "grad_norm": 1.537285404574041, + "learning_rate": 9.90366277133918e-06, + "loss": 0.4586, + "step": 3128 + }, + { + "epoch": 0.09, + "grad_norm": 1.1718146015966593, + "learning_rate": 9.903570989866349e-06, + "loss": 0.5938, + "step": 3129 + }, + { + "epoch": 0.09, + "grad_norm": 1.6031810894714325, + "learning_rate": 9.90347916511945e-06, + "loss": 0.4316, + "step": 3130 + }, + { + "epoch": 0.09, + "grad_norm": 1.6857751959461253, + "learning_rate": 9.903387297099293e-06, + "loss": 0.4363, + "step": 3131 + }, + { + "epoch": 0.09, + "grad_norm": 1.6261544684683251, + "learning_rate": 9.903295385806689e-06, + "loss": 0.4698, + "step": 3132 + }, + { + "epoch": 0.09, + "grad_norm": 1.5754466202074697, + "learning_rate": 9.90320343124245e-06, + "loss": 0.4906, + "step": 3133 + }, + { + "epoch": 0.09, + "grad_norm": 1.507599457931767, + "learning_rate": 9.903111433407384e-06, + "loss": 0.4612, + "step": 3134 + }, + { + "epoch": 0.09, + "grad_norm": 1.5985297604093271, + "learning_rate": 9.903019392302307e-06, + "loss": 0.4262, + "step": 3135 + }, + { + "epoch": 0.09, + "grad_norm": 1.6073262682575964, + "learning_rate": 9.902927307928027e-06, + "loss": 0.4856, + "step": 3136 + }, + { + "epoch": 0.09, + "grad_norm": 1.5298540714177447, + "learning_rate": 9.902835180285362e-06, + "loss": 0.4577, + "step": 3137 + }, + { + "epoch": 0.09, + "grad_norm": 1.5398787324463648, + "learning_rate": 9.90274300937512e-06, + "loss": 0.4319, + "step": 3138 + }, + { + "epoch": 0.09, + "grad_norm": 1.5187827243578054, + "learning_rate": 9.902650795198118e-06, + "loss": 0.4068, + "step": 3139 + }, + { + "epoch": 0.09, + "grad_norm": 1.712666554486226, + "learning_rate": 9.902558537755168e-06, + "loss": 0.4191, + "step": 3140 + }, + { + "epoch": 0.09, + "grad_norm": 1.640016777240831, + "learning_rate": 9.902466237047084e-06, + "loss": 0.4343, + "step": 3141 + }, + { + "epoch": 0.09, + "grad_norm": 1.5322550085560724, + "learning_rate": 9.902373893074682e-06, + "loss": 0.4545, + "step": 3142 + }, + { + "epoch": 0.09, + "grad_norm": 1.7369789680782803, + "learning_rate": 9.902281505838774e-06, + "loss": 0.4817, + "step": 3143 + }, + { + "epoch": 0.09, + "grad_norm": 1.8167792728348715, + "learning_rate": 9.902189075340179e-06, + "loss": 0.4423, + "step": 3144 + }, + { + "epoch": 0.09, + "grad_norm": 2.3616932731634046, + "learning_rate": 9.902096601579711e-06, + "loss": 0.4556, + "step": 3145 + }, + { + "epoch": 0.09, + "grad_norm": 1.8919011408065656, + "learning_rate": 9.902004084558185e-06, + "loss": 0.4368, + "step": 3146 + }, + { + "epoch": 0.09, + "grad_norm": 3.0437866289237214, + "learning_rate": 9.901911524276419e-06, + "loss": 0.4721, + "step": 3147 + }, + { + "epoch": 0.09, + "grad_norm": 1.1168393388851128, + "learning_rate": 9.90181892073523e-06, + "loss": 0.5967, + "step": 3148 + }, + { + "epoch": 0.09, + "grad_norm": 1.8071580643581033, + "learning_rate": 9.901726273935436e-06, + "loss": 0.4325, + "step": 3149 + }, + { + "epoch": 0.09, + "grad_norm": 1.702619279393977, + "learning_rate": 9.901633583877853e-06, + "loss": 0.3967, + "step": 3150 + }, + { + "epoch": 0.09, + "grad_norm": 1.9632747369348598, + "learning_rate": 9.901540850563295e-06, + "loss": 0.4991, + "step": 3151 + }, + { + "epoch": 0.09, + "grad_norm": 1.5945361606276318, + "learning_rate": 9.90144807399259e-06, + "loss": 0.4655, + "step": 3152 + }, + { + "epoch": 0.09, + "grad_norm": 1.8716025799849993, + "learning_rate": 9.901355254166547e-06, + "loss": 0.4337, + "step": 3153 + }, + { + "epoch": 0.09, + "grad_norm": 1.7838277305581827, + "learning_rate": 9.901262391085993e-06, + "loss": 0.4796, + "step": 3154 + }, + { + "epoch": 0.09, + "grad_norm": 1.5558575701450603, + "learning_rate": 9.901169484751743e-06, + "loss": 0.4372, + "step": 3155 + }, + { + "epoch": 0.09, + "grad_norm": 1.45126189637709, + "learning_rate": 9.901076535164618e-06, + "loss": 0.4576, + "step": 3156 + }, + { + "epoch": 0.09, + "grad_norm": 1.5429197035555933, + "learning_rate": 9.900983542325436e-06, + "loss": 0.4828, + "step": 3157 + }, + { + "epoch": 0.09, + "grad_norm": 1.728997574937446, + "learning_rate": 9.90089050623502e-06, + "loss": 0.4575, + "step": 3158 + }, + { + "epoch": 0.09, + "grad_norm": 1.637411443966966, + "learning_rate": 9.900797426894196e-06, + "loss": 0.4353, + "step": 3159 + }, + { + "epoch": 0.09, + "grad_norm": 1.6145749289296196, + "learning_rate": 9.900704304303775e-06, + "loss": 0.4369, + "step": 3160 + }, + { + "epoch": 0.09, + "grad_norm": 1.6994974839682522, + "learning_rate": 9.900611138464586e-06, + "loss": 0.4717, + "step": 3161 + }, + { + "epoch": 0.09, + "grad_norm": 1.8319444367096989, + "learning_rate": 9.90051792937745e-06, + "loss": 0.4453, + "step": 3162 + }, + { + "epoch": 0.09, + "grad_norm": 1.884589917469665, + "learning_rate": 9.900424677043187e-06, + "loss": 0.4279, + "step": 3163 + }, + { + "epoch": 0.09, + "grad_norm": 1.6311741817728216, + "learning_rate": 9.900331381462625e-06, + "loss": 0.4737, + "step": 3164 + }, + { + "epoch": 0.09, + "grad_norm": 1.566598613393686, + "learning_rate": 9.90023804263658e-06, + "loss": 0.4583, + "step": 3165 + }, + { + "epoch": 0.09, + "grad_norm": 1.8395694644424816, + "learning_rate": 9.900144660565883e-06, + "loss": 0.4618, + "step": 3166 + }, + { + "epoch": 0.09, + "grad_norm": 1.52193859882321, + "learning_rate": 9.900051235251354e-06, + "loss": 0.4329, + "step": 3167 + }, + { + "epoch": 0.09, + "grad_norm": 1.7380447776777335, + "learning_rate": 9.89995776669382e-06, + "loss": 0.4252, + "step": 3168 + }, + { + "epoch": 0.09, + "grad_norm": 0.9851204144039187, + "learning_rate": 9.899864254894103e-06, + "loss": 0.5904, + "step": 3169 + }, + { + "epoch": 0.09, + "grad_norm": 1.7054102466786911, + "learning_rate": 9.899770699853028e-06, + "loss": 0.4498, + "step": 3170 + }, + { + "epoch": 0.09, + "grad_norm": 1.8157478042995339, + "learning_rate": 9.899677101571425e-06, + "loss": 0.4481, + "step": 3171 + }, + { + "epoch": 0.09, + "grad_norm": 1.6356719805273574, + "learning_rate": 9.899583460050115e-06, + "loss": 0.4423, + "step": 3172 + }, + { + "epoch": 0.09, + "grad_norm": 1.5682074080071409, + "learning_rate": 9.899489775289928e-06, + "loss": 0.4337, + "step": 3173 + }, + { + "epoch": 0.09, + "grad_norm": 1.5753171420668564, + "learning_rate": 9.899396047291689e-06, + "loss": 0.425, + "step": 3174 + }, + { + "epoch": 0.09, + "grad_norm": 1.783985544594431, + "learning_rate": 9.899302276056226e-06, + "loss": 0.4476, + "step": 3175 + }, + { + "epoch": 0.09, + "grad_norm": 1.7877235237003701, + "learning_rate": 9.899208461584366e-06, + "loss": 0.4961, + "step": 3176 + }, + { + "epoch": 0.09, + "grad_norm": 2.353958714355899, + "learning_rate": 9.899114603876935e-06, + "loss": 0.4721, + "step": 3177 + }, + { + "epoch": 0.09, + "grad_norm": 1.7105989755020519, + "learning_rate": 9.899020702934766e-06, + "loss": 0.4366, + "step": 3178 + }, + { + "epoch": 0.09, + "grad_norm": 1.6170990051214582, + "learning_rate": 9.898926758758684e-06, + "loss": 0.4822, + "step": 3179 + }, + { + "epoch": 0.09, + "grad_norm": 1.5658029642712479, + "learning_rate": 9.898832771349518e-06, + "loss": 0.4056, + "step": 3180 + }, + { + "epoch": 0.09, + "grad_norm": 2.058802638541792, + "learning_rate": 9.898738740708101e-06, + "loss": 0.4609, + "step": 3181 + }, + { + "epoch": 0.09, + "grad_norm": 1.6644722674477488, + "learning_rate": 9.898644666835257e-06, + "loss": 0.4435, + "step": 3182 + }, + { + "epoch": 0.09, + "grad_norm": 1.938365679057396, + "learning_rate": 9.898550549731822e-06, + "loss": 0.4023, + "step": 3183 + }, + { + "epoch": 0.09, + "grad_norm": 1.7016607148145344, + "learning_rate": 9.898456389398623e-06, + "loss": 0.4506, + "step": 3184 + }, + { + "epoch": 0.09, + "grad_norm": 1.4691434624625526, + "learning_rate": 9.898362185836492e-06, + "loss": 0.4233, + "step": 3185 + }, + { + "epoch": 0.09, + "grad_norm": 1.689574300852249, + "learning_rate": 9.89826793904626e-06, + "loss": 0.4618, + "step": 3186 + }, + { + "epoch": 0.09, + "grad_norm": 1.9085724015776497, + "learning_rate": 9.89817364902876e-06, + "loss": 0.4283, + "step": 3187 + }, + { + "epoch": 0.09, + "grad_norm": 1.957667149670345, + "learning_rate": 9.898079315784823e-06, + "loss": 0.4557, + "step": 3188 + }, + { + "epoch": 0.09, + "grad_norm": 1.6889807581006762, + "learning_rate": 9.897984939315281e-06, + "loss": 0.42, + "step": 3189 + }, + { + "epoch": 0.09, + "grad_norm": 1.5548836811944853, + "learning_rate": 9.897890519620969e-06, + "loss": 0.398, + "step": 3190 + }, + { + "epoch": 0.09, + "grad_norm": 1.7713464033708288, + "learning_rate": 9.897796056702718e-06, + "loss": 0.4245, + "step": 3191 + }, + { + "epoch": 0.09, + "grad_norm": 1.625779416755699, + "learning_rate": 9.897701550561363e-06, + "loss": 0.4476, + "step": 3192 + }, + { + "epoch": 0.09, + "grad_norm": 1.6868369916014119, + "learning_rate": 9.897607001197736e-06, + "loss": 0.4245, + "step": 3193 + }, + { + "epoch": 0.09, + "grad_norm": 1.9164747157373168, + "learning_rate": 9.897512408612672e-06, + "loss": 0.4339, + "step": 3194 + }, + { + "epoch": 0.09, + "grad_norm": 1.477597451484058, + "learning_rate": 9.897417772807009e-06, + "loss": 0.4054, + "step": 3195 + }, + { + "epoch": 0.09, + "grad_norm": 1.5755447627827757, + "learning_rate": 9.897323093781581e-06, + "loss": 0.4012, + "step": 3196 + }, + { + "epoch": 0.09, + "grad_norm": 1.7083859500350722, + "learning_rate": 9.89722837153722e-06, + "loss": 0.4654, + "step": 3197 + }, + { + "epoch": 0.09, + "grad_norm": 1.8863348186228057, + "learning_rate": 9.897133606074765e-06, + "loss": 0.4033, + "step": 3198 + }, + { + "epoch": 0.09, + "grad_norm": 1.9952351996208282, + "learning_rate": 9.89703879739505e-06, + "loss": 0.4293, + "step": 3199 + }, + { + "epoch": 0.09, + "grad_norm": 1.5760573256033643, + "learning_rate": 9.896943945498916e-06, + "loss": 0.452, + "step": 3200 + }, + { + "epoch": 0.09, + "grad_norm": 1.6375657608544572, + "learning_rate": 9.896849050387195e-06, + "loss": 0.4797, + "step": 3201 + }, + { + "epoch": 0.09, + "grad_norm": 1.5984708844660505, + "learning_rate": 9.896754112060728e-06, + "loss": 0.4745, + "step": 3202 + }, + { + "epoch": 0.09, + "grad_norm": 2.232283966741944, + "learning_rate": 9.896659130520353e-06, + "loss": 0.4044, + "step": 3203 + }, + { + "epoch": 0.09, + "grad_norm": 1.619047969806501, + "learning_rate": 9.896564105766906e-06, + "loss": 0.4686, + "step": 3204 + }, + { + "epoch": 0.09, + "grad_norm": 1.8128252396230737, + "learning_rate": 9.896469037801225e-06, + "loss": 0.5093, + "step": 3205 + }, + { + "epoch": 0.09, + "grad_norm": 1.654154686978195, + "learning_rate": 9.896373926624153e-06, + "loss": 0.4371, + "step": 3206 + }, + { + "epoch": 0.09, + "grad_norm": 1.685211353225526, + "learning_rate": 9.896278772236523e-06, + "loss": 0.434, + "step": 3207 + }, + { + "epoch": 0.09, + "grad_norm": 1.705373407924566, + "learning_rate": 9.896183574639181e-06, + "loss": 0.4482, + "step": 3208 + }, + { + "epoch": 0.09, + "grad_norm": 1.4910701752849607, + "learning_rate": 9.896088333832965e-06, + "loss": 0.4199, + "step": 3209 + }, + { + "epoch": 0.09, + "grad_norm": 1.5782350302140782, + "learning_rate": 9.895993049818714e-06, + "loss": 0.4452, + "step": 3210 + }, + { + "epoch": 0.09, + "grad_norm": 1.7636552676045698, + "learning_rate": 9.89589772259727e-06, + "loss": 0.4025, + "step": 3211 + }, + { + "epoch": 0.09, + "grad_norm": 1.8018596909071445, + "learning_rate": 9.895802352169474e-06, + "loss": 0.4235, + "step": 3212 + }, + { + "epoch": 0.09, + "grad_norm": 1.6556665427395267, + "learning_rate": 9.895706938536169e-06, + "loss": 0.4327, + "step": 3213 + }, + { + "epoch": 0.09, + "grad_norm": 1.6486725400366504, + "learning_rate": 9.895611481698197e-06, + "loss": 0.4216, + "step": 3214 + }, + { + "epoch": 0.09, + "grad_norm": 1.9136995830122023, + "learning_rate": 9.895515981656398e-06, + "loss": 0.4673, + "step": 3215 + }, + { + "epoch": 0.09, + "grad_norm": 1.609074091535915, + "learning_rate": 9.895420438411616e-06, + "loss": 0.4464, + "step": 3216 + }, + { + "epoch": 0.09, + "grad_norm": 1.6924919380718173, + "learning_rate": 9.895324851964694e-06, + "loss": 0.4153, + "step": 3217 + }, + { + "epoch": 0.09, + "grad_norm": 1.7139323002907527, + "learning_rate": 9.895229222316476e-06, + "loss": 0.4693, + "step": 3218 + }, + { + "epoch": 0.09, + "grad_norm": 1.770390593059356, + "learning_rate": 9.895133549467806e-06, + "loss": 0.4384, + "step": 3219 + }, + { + "epoch": 0.09, + "grad_norm": 1.6794068828086974, + "learning_rate": 9.895037833419529e-06, + "loss": 0.4419, + "step": 3220 + }, + { + "epoch": 0.09, + "grad_norm": 1.7036527918615163, + "learning_rate": 9.894942074172487e-06, + "loss": 0.4725, + "step": 3221 + }, + { + "epoch": 0.09, + "grad_norm": 1.9881436488622852, + "learning_rate": 9.894846271727529e-06, + "loss": 0.5047, + "step": 3222 + }, + { + "epoch": 0.09, + "grad_norm": 1.6074300061102291, + "learning_rate": 9.894750426085496e-06, + "loss": 0.439, + "step": 3223 + }, + { + "epoch": 0.09, + "grad_norm": 1.6119680876949765, + "learning_rate": 9.894654537247238e-06, + "loss": 0.4536, + "step": 3224 + }, + { + "epoch": 0.09, + "grad_norm": 2.636063601892772, + "learning_rate": 9.8945586052136e-06, + "loss": 0.4364, + "step": 3225 + }, + { + "epoch": 0.09, + "grad_norm": 1.751996286101846, + "learning_rate": 9.894462629985426e-06, + "loss": 0.4161, + "step": 3226 + }, + { + "epoch": 0.09, + "grad_norm": 1.5876845306411018, + "learning_rate": 9.894366611563566e-06, + "loss": 0.4683, + "step": 3227 + }, + { + "epoch": 0.09, + "grad_norm": 1.6740461645296147, + "learning_rate": 9.894270549948866e-06, + "loss": 0.455, + "step": 3228 + }, + { + "epoch": 0.09, + "grad_norm": 1.671205477009877, + "learning_rate": 9.894174445142175e-06, + "loss": 0.4052, + "step": 3229 + }, + { + "epoch": 0.09, + "grad_norm": 1.6996779109679694, + "learning_rate": 9.894078297144339e-06, + "loss": 0.4426, + "step": 3230 + }, + { + "epoch": 0.09, + "grad_norm": 1.8661046196641577, + "learning_rate": 9.893982105956208e-06, + "loss": 0.4602, + "step": 3231 + }, + { + "epoch": 0.09, + "grad_norm": 1.536750910268633, + "learning_rate": 9.893885871578631e-06, + "loss": 0.4351, + "step": 3232 + }, + { + "epoch": 0.09, + "grad_norm": 1.8031653720323202, + "learning_rate": 9.893789594012458e-06, + "loss": 0.417, + "step": 3233 + }, + { + "epoch": 0.09, + "grad_norm": 1.690521512890301, + "learning_rate": 9.893693273258535e-06, + "loss": 0.4739, + "step": 3234 + }, + { + "epoch": 0.09, + "grad_norm": 1.739392367685147, + "learning_rate": 9.893596909317716e-06, + "loss": 0.4749, + "step": 3235 + }, + { + "epoch": 0.09, + "grad_norm": 1.4762184456753906, + "learning_rate": 9.89350050219085e-06, + "loss": 0.4192, + "step": 3236 + }, + { + "epoch": 0.09, + "grad_norm": 1.760510292724376, + "learning_rate": 9.893404051878788e-06, + "loss": 0.4927, + "step": 3237 + }, + { + "epoch": 0.09, + "grad_norm": 1.7952236547317877, + "learning_rate": 9.89330755838238e-06, + "loss": 0.4127, + "step": 3238 + }, + { + "epoch": 0.09, + "grad_norm": 2.2678784139398322, + "learning_rate": 9.89321102170248e-06, + "loss": 0.4689, + "step": 3239 + }, + { + "epoch": 0.09, + "grad_norm": 2.5968637966934605, + "learning_rate": 9.893114441839938e-06, + "loss": 0.4508, + "step": 3240 + }, + { + "epoch": 0.09, + "grad_norm": 1.8300732064067589, + "learning_rate": 9.893017818795606e-06, + "loss": 0.4267, + "step": 3241 + }, + { + "epoch": 0.09, + "grad_norm": 1.7171619733924135, + "learning_rate": 9.892921152570338e-06, + "loss": 0.4932, + "step": 3242 + }, + { + "epoch": 0.09, + "grad_norm": 1.6157976052763303, + "learning_rate": 9.892824443164987e-06, + "loss": 0.4409, + "step": 3243 + }, + { + "epoch": 0.09, + "grad_norm": 2.022732737628366, + "learning_rate": 9.892727690580405e-06, + "loss": 0.4518, + "step": 3244 + }, + { + "epoch": 0.09, + "grad_norm": 1.1639969276200948, + "learning_rate": 9.892630894817448e-06, + "loss": 0.5999, + "step": 3245 + }, + { + "epoch": 0.09, + "grad_norm": 2.016729056342312, + "learning_rate": 9.892534055876968e-06, + "loss": 0.4325, + "step": 3246 + }, + { + "epoch": 0.09, + "grad_norm": 1.8197794331725003, + "learning_rate": 9.892437173759822e-06, + "loss": 0.4213, + "step": 3247 + }, + { + "epoch": 0.09, + "grad_norm": 1.9033047659529472, + "learning_rate": 9.892340248466862e-06, + "loss": 0.4168, + "step": 3248 + }, + { + "epoch": 0.09, + "grad_norm": 1.981514359428364, + "learning_rate": 9.892243279998946e-06, + "loss": 0.4605, + "step": 3249 + }, + { + "epoch": 0.09, + "grad_norm": 1.804497032047691, + "learning_rate": 9.892146268356928e-06, + "loss": 0.4317, + "step": 3250 + }, + { + "epoch": 0.09, + "grad_norm": 1.6757079019154844, + "learning_rate": 9.892049213541665e-06, + "loss": 0.4425, + "step": 3251 + }, + { + "epoch": 0.09, + "grad_norm": 1.9409719720239553, + "learning_rate": 9.891952115554015e-06, + "loss": 0.4534, + "step": 3252 + }, + { + "epoch": 0.09, + "grad_norm": 1.8564520989074738, + "learning_rate": 9.891854974394832e-06, + "loss": 0.4062, + "step": 3253 + }, + { + "epoch": 0.09, + "grad_norm": 2.12362094885505, + "learning_rate": 9.891757790064974e-06, + "loss": 0.4477, + "step": 3254 + }, + { + "epoch": 0.09, + "grad_norm": 1.8666784436657902, + "learning_rate": 9.8916605625653e-06, + "loss": 0.4449, + "step": 3255 + }, + { + "epoch": 0.09, + "grad_norm": 1.8633059869626645, + "learning_rate": 9.891563291896667e-06, + "loss": 0.4302, + "step": 3256 + }, + { + "epoch": 0.09, + "grad_norm": 1.886987985745018, + "learning_rate": 9.891465978059936e-06, + "loss": 0.4307, + "step": 3257 + }, + { + "epoch": 0.09, + "grad_norm": 1.7524306984253593, + "learning_rate": 9.89136862105596e-06, + "loss": 0.4563, + "step": 3258 + }, + { + "epoch": 0.09, + "grad_norm": 1.6390859870354297, + "learning_rate": 9.891271220885603e-06, + "loss": 0.4331, + "step": 3259 + }, + { + "epoch": 0.09, + "grad_norm": 1.7947320085491756, + "learning_rate": 9.891173777549722e-06, + "loss": 0.4505, + "step": 3260 + }, + { + "epoch": 0.09, + "grad_norm": 1.790170288318856, + "learning_rate": 9.891076291049181e-06, + "loss": 0.4441, + "step": 3261 + }, + { + "epoch": 0.09, + "grad_norm": 1.6526038552102544, + "learning_rate": 9.890978761384835e-06, + "loss": 0.4581, + "step": 3262 + }, + { + "epoch": 0.09, + "grad_norm": 1.7766062926363664, + "learning_rate": 9.890881188557549e-06, + "loss": 0.4588, + "step": 3263 + }, + { + "epoch": 0.09, + "grad_norm": 2.3531825401132753, + "learning_rate": 9.89078357256818e-06, + "loss": 0.4224, + "step": 3264 + }, + { + "epoch": 0.09, + "grad_norm": 2.8041744045929504, + "learning_rate": 9.890685913417595e-06, + "loss": 0.3918, + "step": 3265 + }, + { + "epoch": 0.09, + "grad_norm": 1.6266880695233017, + "learning_rate": 9.89058821110665e-06, + "loss": 0.4449, + "step": 3266 + }, + { + "epoch": 0.09, + "grad_norm": 1.6247021298908837, + "learning_rate": 9.89049046563621e-06, + "loss": 0.3953, + "step": 3267 + }, + { + "epoch": 0.09, + "grad_norm": 1.7155497724701556, + "learning_rate": 9.890392677007139e-06, + "loss": 0.4303, + "step": 3268 + }, + { + "epoch": 0.09, + "grad_norm": 1.750579953176582, + "learning_rate": 9.890294845220296e-06, + "loss": 0.4105, + "step": 3269 + }, + { + "epoch": 0.09, + "grad_norm": 1.601157952042574, + "learning_rate": 9.890196970276549e-06, + "loss": 0.4187, + "step": 3270 + }, + { + "epoch": 0.09, + "grad_norm": 1.8915205532961632, + "learning_rate": 9.890099052176758e-06, + "loss": 0.4293, + "step": 3271 + }, + { + "epoch": 0.09, + "grad_norm": 1.8297578572405926, + "learning_rate": 9.890001090921788e-06, + "loss": 0.4156, + "step": 3272 + }, + { + "epoch": 0.09, + "grad_norm": 1.8609555400038724, + "learning_rate": 9.889903086512506e-06, + "loss": 0.4107, + "step": 3273 + }, + { + "epoch": 0.09, + "grad_norm": 2.147586630622629, + "learning_rate": 9.889805038949775e-06, + "loss": 0.4648, + "step": 3274 + }, + { + "epoch": 0.09, + "grad_norm": 1.0845839820585705, + "learning_rate": 9.88970694823446e-06, + "loss": 0.6019, + "step": 3275 + }, + { + "epoch": 0.1, + "grad_norm": 1.6488012916071029, + "learning_rate": 9.889608814367424e-06, + "loss": 0.434, + "step": 3276 + }, + { + "epoch": 0.1, + "grad_norm": 2.227230579636249, + "learning_rate": 9.88951063734954e-06, + "loss": 0.4604, + "step": 3277 + }, + { + "epoch": 0.1, + "grad_norm": 1.7580088458106384, + "learning_rate": 9.889412417181668e-06, + "loss": 0.4415, + "step": 3278 + }, + { + "epoch": 0.1, + "grad_norm": 1.468630190700361, + "learning_rate": 9.889314153864679e-06, + "loss": 0.401, + "step": 3279 + }, + { + "epoch": 0.1, + "grad_norm": 1.6442933020021993, + "learning_rate": 9.889215847399436e-06, + "loss": 0.4311, + "step": 3280 + }, + { + "epoch": 0.1, + "grad_norm": 1.8438987394655366, + "learning_rate": 9.88911749778681e-06, + "loss": 0.4186, + "step": 3281 + }, + { + "epoch": 0.1, + "grad_norm": 1.7302040835652392, + "learning_rate": 9.889019105027669e-06, + "loss": 0.4228, + "step": 3282 + }, + { + "epoch": 0.1, + "grad_norm": 1.863909650348357, + "learning_rate": 9.88892066912288e-06, + "loss": 0.4573, + "step": 3283 + }, + { + "epoch": 0.1, + "grad_norm": 2.239505961585501, + "learning_rate": 9.88882219007331e-06, + "loss": 0.4156, + "step": 3284 + }, + { + "epoch": 0.1, + "grad_norm": 1.6956870083496716, + "learning_rate": 9.888723667879831e-06, + "loss": 0.4195, + "step": 3285 + }, + { + "epoch": 0.1, + "grad_norm": 1.5344421700094695, + "learning_rate": 9.888625102543312e-06, + "loss": 0.4263, + "step": 3286 + }, + { + "epoch": 0.1, + "grad_norm": 1.6116967975205627, + "learning_rate": 9.888526494064621e-06, + "loss": 0.4565, + "step": 3287 + }, + { + "epoch": 0.1, + "grad_norm": 1.6402149376020336, + "learning_rate": 9.88842784244463e-06, + "loss": 0.4128, + "step": 3288 + }, + { + "epoch": 0.1, + "grad_norm": 1.161288630883141, + "learning_rate": 9.88832914768421e-06, + "loss": 0.5726, + "step": 3289 + }, + { + "epoch": 0.1, + "grad_norm": 1.7060715837124216, + "learning_rate": 9.88823040978423e-06, + "loss": 0.4197, + "step": 3290 + }, + { + "epoch": 0.1, + "grad_norm": 1.6030121688708132, + "learning_rate": 9.888131628745563e-06, + "loss": 0.4463, + "step": 3291 + }, + { + "epoch": 0.1, + "grad_norm": 1.698779249662717, + "learning_rate": 9.888032804569078e-06, + "loss": 0.4871, + "step": 3292 + }, + { + "epoch": 0.1, + "grad_norm": 1.9513723872047224, + "learning_rate": 9.887933937255652e-06, + "loss": 0.4404, + "step": 3293 + }, + { + "epoch": 0.1, + "grad_norm": 1.605210073606089, + "learning_rate": 9.887835026806155e-06, + "loss": 0.4443, + "step": 3294 + }, + { + "epoch": 0.1, + "grad_norm": 3.4057681544808993, + "learning_rate": 9.887736073221459e-06, + "loss": 0.4223, + "step": 3295 + }, + { + "epoch": 0.1, + "grad_norm": 1.8181925258800717, + "learning_rate": 9.887637076502436e-06, + "loss": 0.4899, + "step": 3296 + }, + { + "epoch": 0.1, + "grad_norm": 1.5814833709415963, + "learning_rate": 9.887538036649962e-06, + "loss": 0.4424, + "step": 3297 + }, + { + "epoch": 0.1, + "grad_norm": 1.9329246009521555, + "learning_rate": 9.887438953664912e-06, + "loss": 0.4528, + "step": 3298 + }, + { + "epoch": 0.1, + "grad_norm": 1.623101103511935, + "learning_rate": 9.88733982754816e-06, + "loss": 0.4057, + "step": 3299 + }, + { + "epoch": 0.1, + "grad_norm": 1.6769355225671005, + "learning_rate": 9.887240658300578e-06, + "loss": 0.3969, + "step": 3300 + }, + { + "epoch": 0.1, + "grad_norm": 1.6308495170975528, + "learning_rate": 9.887141445923042e-06, + "loss": 0.4617, + "step": 3301 + }, + { + "epoch": 0.1, + "grad_norm": 1.7635710360544095, + "learning_rate": 9.887042190416432e-06, + "loss": 0.4595, + "step": 3302 + }, + { + "epoch": 0.1, + "grad_norm": 1.930896605251076, + "learning_rate": 9.886942891781617e-06, + "loss": 0.4835, + "step": 3303 + }, + { + "epoch": 0.1, + "grad_norm": 1.7642073860040297, + "learning_rate": 9.886843550019478e-06, + "loss": 0.4713, + "step": 3304 + }, + { + "epoch": 0.1, + "grad_norm": 1.6356527351384698, + "learning_rate": 9.886744165130891e-06, + "loss": 0.4048, + "step": 3305 + }, + { + "epoch": 0.1, + "grad_norm": 1.8750012543143015, + "learning_rate": 9.886644737116732e-06, + "loss": 0.4975, + "step": 3306 + }, + { + "epoch": 0.1, + "grad_norm": 1.6170562819669279, + "learning_rate": 9.886545265977879e-06, + "loss": 0.4421, + "step": 3307 + }, + { + "epoch": 0.1, + "grad_norm": 1.7072837273284511, + "learning_rate": 9.88644575171521e-06, + "loss": 0.4624, + "step": 3308 + }, + { + "epoch": 0.1, + "grad_norm": 1.7385521735505394, + "learning_rate": 9.886346194329604e-06, + "loss": 0.417, + "step": 3309 + }, + { + "epoch": 0.1, + "grad_norm": 2.491225613597644, + "learning_rate": 9.886246593821937e-06, + "loss": 0.4352, + "step": 3310 + }, + { + "epoch": 0.1, + "grad_norm": 1.6965559763093232, + "learning_rate": 9.886146950193089e-06, + "loss": 0.4941, + "step": 3311 + }, + { + "epoch": 0.1, + "grad_norm": 1.7883143288556405, + "learning_rate": 9.886047263443943e-06, + "loss": 0.4646, + "step": 3312 + }, + { + "epoch": 0.1, + "grad_norm": 1.5528301531156556, + "learning_rate": 9.885947533575373e-06, + "loss": 0.4439, + "step": 3313 + }, + { + "epoch": 0.1, + "grad_norm": 1.7786948689377011, + "learning_rate": 9.885847760588263e-06, + "loss": 0.4233, + "step": 3314 + }, + { + "epoch": 0.1, + "grad_norm": 1.7009584584270034, + "learning_rate": 9.885747944483493e-06, + "loss": 0.4396, + "step": 3315 + }, + { + "epoch": 0.1, + "grad_norm": 1.7409292519878596, + "learning_rate": 9.88564808526194e-06, + "loss": 0.3878, + "step": 3316 + }, + { + "epoch": 0.1, + "grad_norm": 2.028570702369063, + "learning_rate": 9.885548182924492e-06, + "loss": 0.457, + "step": 3317 + }, + { + "epoch": 0.1, + "grad_norm": 1.6930243130976155, + "learning_rate": 9.885448237472026e-06, + "loss": 0.4152, + "step": 3318 + }, + { + "epoch": 0.1, + "grad_norm": 1.5982543367133901, + "learning_rate": 9.885348248905424e-06, + "loss": 0.4302, + "step": 3319 + }, + { + "epoch": 0.1, + "grad_norm": 1.564341045476591, + "learning_rate": 9.88524821722557e-06, + "loss": 0.4172, + "step": 3320 + }, + { + "epoch": 0.1, + "grad_norm": 1.7750525047381671, + "learning_rate": 9.885148142433348e-06, + "loss": 0.4283, + "step": 3321 + }, + { + "epoch": 0.1, + "grad_norm": 1.561112869667659, + "learning_rate": 9.885048024529638e-06, + "loss": 0.4595, + "step": 3322 + }, + { + "epoch": 0.1, + "grad_norm": 1.8600786681904387, + "learning_rate": 9.884947863515324e-06, + "loss": 0.4173, + "step": 3323 + }, + { + "epoch": 0.1, + "grad_norm": 1.731374128472419, + "learning_rate": 9.884847659391293e-06, + "loss": 0.4508, + "step": 3324 + }, + { + "epoch": 0.1, + "grad_norm": 1.8094785395695205, + "learning_rate": 9.884747412158427e-06, + "loss": 0.432, + "step": 3325 + }, + { + "epoch": 0.1, + "grad_norm": 1.5200743662687155, + "learning_rate": 9.884647121817609e-06, + "loss": 0.4314, + "step": 3326 + }, + { + "epoch": 0.1, + "grad_norm": 2.571597924906129, + "learning_rate": 9.884546788369726e-06, + "loss": 0.402, + "step": 3327 + }, + { + "epoch": 0.1, + "grad_norm": 1.9326430525222418, + "learning_rate": 9.884446411815664e-06, + "loss": 0.417, + "step": 3328 + }, + { + "epoch": 0.1, + "grad_norm": 1.9516583275982833, + "learning_rate": 9.88434599215631e-06, + "loss": 0.4639, + "step": 3329 + }, + { + "epoch": 0.1, + "grad_norm": 1.6548414735093029, + "learning_rate": 9.884245529392547e-06, + "loss": 0.4429, + "step": 3330 + }, + { + "epoch": 0.1, + "grad_norm": 1.7069332624816242, + "learning_rate": 9.884145023525263e-06, + "loss": 0.417, + "step": 3331 + }, + { + "epoch": 0.1, + "grad_norm": 1.6533592098842136, + "learning_rate": 9.884044474555346e-06, + "loss": 0.4008, + "step": 3332 + }, + { + "epoch": 0.1, + "grad_norm": 1.7230448413452042, + "learning_rate": 9.883943882483681e-06, + "loss": 0.4285, + "step": 3333 + }, + { + "epoch": 0.1, + "grad_norm": 1.7551988588808038, + "learning_rate": 9.883843247311158e-06, + "loss": 0.4711, + "step": 3334 + }, + { + "epoch": 0.1, + "grad_norm": 1.737018831405366, + "learning_rate": 9.883742569038663e-06, + "loss": 0.444, + "step": 3335 + }, + { + "epoch": 0.1, + "grad_norm": 1.759335383144232, + "learning_rate": 9.883641847667087e-06, + "loss": 0.4297, + "step": 3336 + }, + { + "epoch": 0.1, + "grad_norm": 1.47286194530331, + "learning_rate": 9.883541083197316e-06, + "loss": 0.4194, + "step": 3337 + }, + { + "epoch": 0.1, + "grad_norm": 1.6327166167208722, + "learning_rate": 9.883440275630242e-06, + "loss": 0.4165, + "step": 3338 + }, + { + "epoch": 0.1, + "grad_norm": 1.7420551386369028, + "learning_rate": 9.883339424966754e-06, + "loss": 0.4237, + "step": 3339 + }, + { + "epoch": 0.1, + "grad_norm": 2.0074657814148904, + "learning_rate": 9.883238531207739e-06, + "loss": 0.4464, + "step": 3340 + }, + { + "epoch": 0.1, + "grad_norm": 2.170744996217807, + "learning_rate": 9.883137594354091e-06, + "loss": 0.4586, + "step": 3341 + }, + { + "epoch": 0.1, + "grad_norm": 2.335637354872468, + "learning_rate": 9.8830366144067e-06, + "loss": 0.4806, + "step": 3342 + }, + { + "epoch": 0.1, + "grad_norm": 1.7108070932656345, + "learning_rate": 9.882935591366456e-06, + "loss": 0.494, + "step": 3343 + }, + { + "epoch": 0.1, + "grad_norm": 1.812039818309852, + "learning_rate": 9.882834525234252e-06, + "loss": 0.4212, + "step": 3344 + }, + { + "epoch": 0.1, + "grad_norm": 2.7624844209466546, + "learning_rate": 9.882733416010979e-06, + "loss": 0.4607, + "step": 3345 + }, + { + "epoch": 0.1, + "grad_norm": 1.5486452404606568, + "learning_rate": 9.88263226369753e-06, + "loss": 0.4144, + "step": 3346 + }, + { + "epoch": 0.1, + "grad_norm": 1.7704156703115013, + "learning_rate": 9.882531068294797e-06, + "loss": 0.441, + "step": 3347 + }, + { + "epoch": 0.1, + "grad_norm": 1.6048804776616805, + "learning_rate": 9.882429829803672e-06, + "loss": 0.4454, + "step": 3348 + }, + { + "epoch": 0.1, + "grad_norm": 1.5926839753272015, + "learning_rate": 9.88232854822505e-06, + "loss": 0.4039, + "step": 3349 + }, + { + "epoch": 0.1, + "grad_norm": 1.72671797153442, + "learning_rate": 9.882227223559824e-06, + "loss": 0.5077, + "step": 3350 + }, + { + "epoch": 0.1, + "grad_norm": 1.5657554209559217, + "learning_rate": 9.88212585580889e-06, + "loss": 0.3891, + "step": 3351 + }, + { + "epoch": 0.1, + "grad_norm": 1.4998301853744544, + "learning_rate": 9.88202444497314e-06, + "loss": 0.3987, + "step": 3352 + }, + { + "epoch": 0.1, + "grad_norm": 1.5615143433082068, + "learning_rate": 9.88192299105347e-06, + "loss": 0.4068, + "step": 3353 + }, + { + "epoch": 0.1, + "grad_norm": 1.5433073415825667, + "learning_rate": 9.881821494050776e-06, + "loss": 0.4487, + "step": 3354 + }, + { + "epoch": 0.1, + "grad_norm": 1.6002394296593356, + "learning_rate": 9.881719953965953e-06, + "loss": 0.4535, + "step": 3355 + }, + { + "epoch": 0.1, + "grad_norm": 1.5443455446582708, + "learning_rate": 9.881618370799897e-06, + "loss": 0.4803, + "step": 3356 + }, + { + "epoch": 0.1, + "grad_norm": 1.8635685516737421, + "learning_rate": 9.881516744553505e-06, + "loss": 0.4958, + "step": 3357 + }, + { + "epoch": 0.1, + "grad_norm": 1.7532743048205386, + "learning_rate": 9.881415075227674e-06, + "loss": 0.4616, + "step": 3358 + }, + { + "epoch": 0.1, + "grad_norm": 1.6028067379614552, + "learning_rate": 9.8813133628233e-06, + "loss": 0.4376, + "step": 3359 + }, + { + "epoch": 0.1, + "grad_norm": 1.7307591229670092, + "learning_rate": 9.881211607341283e-06, + "loss": 0.4188, + "step": 3360 + }, + { + "epoch": 0.1, + "grad_norm": 1.1784265027881005, + "learning_rate": 9.881109808782517e-06, + "loss": 0.5603, + "step": 3361 + }, + { + "epoch": 0.1, + "grad_norm": 1.1757558350139483, + "learning_rate": 9.881007967147904e-06, + "loss": 0.6158, + "step": 3362 + }, + { + "epoch": 0.1, + "grad_norm": 1.7233422554844313, + "learning_rate": 9.880906082438342e-06, + "loss": 0.4805, + "step": 3363 + }, + { + "epoch": 0.1, + "grad_norm": 2.1549611495566072, + "learning_rate": 9.88080415465473e-06, + "loss": 0.477, + "step": 3364 + }, + { + "epoch": 0.1, + "grad_norm": 2.470009530843728, + "learning_rate": 9.880702183797966e-06, + "loss": 0.4181, + "step": 3365 + }, + { + "epoch": 0.1, + "grad_norm": 1.808642726251371, + "learning_rate": 9.88060016986895e-06, + "loss": 0.5036, + "step": 3366 + }, + { + "epoch": 0.1, + "grad_norm": 2.310398760709161, + "learning_rate": 9.880498112868586e-06, + "loss": 0.5309, + "step": 3367 + }, + { + "epoch": 0.1, + "grad_norm": 1.6059262925121873, + "learning_rate": 9.88039601279777e-06, + "loss": 0.4667, + "step": 3368 + }, + { + "epoch": 0.1, + "grad_norm": 1.766192065636433, + "learning_rate": 9.880293869657407e-06, + "loss": 0.4132, + "step": 3369 + }, + { + "epoch": 0.1, + "grad_norm": 2.7781247133291025, + "learning_rate": 9.880191683448394e-06, + "loss": 0.4237, + "step": 3370 + }, + { + "epoch": 0.1, + "grad_norm": 1.7941690288387364, + "learning_rate": 9.880089454171638e-06, + "loss": 0.4456, + "step": 3371 + }, + { + "epoch": 0.1, + "grad_norm": 2.0165110846859315, + "learning_rate": 9.879987181828038e-06, + "loss": 0.4462, + "step": 3372 + }, + { + "epoch": 0.1, + "grad_norm": 1.7063321676370171, + "learning_rate": 9.879884866418494e-06, + "loss": 0.443, + "step": 3373 + }, + { + "epoch": 0.1, + "grad_norm": 1.684478333765024, + "learning_rate": 9.879782507943916e-06, + "loss": 0.4723, + "step": 3374 + }, + { + "epoch": 0.1, + "grad_norm": 1.842804102619836, + "learning_rate": 9.8796801064052e-06, + "loss": 0.4079, + "step": 3375 + }, + { + "epoch": 0.1, + "grad_norm": 2.177170629828762, + "learning_rate": 9.879577661803255e-06, + "loss": 0.4648, + "step": 3376 + }, + { + "epoch": 0.1, + "grad_norm": 1.6775492008772097, + "learning_rate": 9.879475174138982e-06, + "loss": 0.4386, + "step": 3377 + }, + { + "epoch": 0.1, + "grad_norm": 1.636630477974574, + "learning_rate": 9.879372643413287e-06, + "loss": 0.4681, + "step": 3378 + }, + { + "epoch": 0.1, + "grad_norm": 1.7580961835653361, + "learning_rate": 9.879270069627074e-06, + "loss": 0.4269, + "step": 3379 + }, + { + "epoch": 0.1, + "grad_norm": 1.8489998149557685, + "learning_rate": 9.879167452781247e-06, + "loss": 0.4149, + "step": 3380 + }, + { + "epoch": 0.1, + "grad_norm": 1.5956565678490102, + "learning_rate": 9.879064792876717e-06, + "loss": 0.4573, + "step": 3381 + }, + { + "epoch": 0.1, + "grad_norm": 1.6750559983532618, + "learning_rate": 9.878962089914381e-06, + "loss": 0.4954, + "step": 3382 + }, + { + "epoch": 0.1, + "grad_norm": 1.727169197500219, + "learning_rate": 9.878859343895154e-06, + "loss": 0.467, + "step": 3383 + }, + { + "epoch": 0.1, + "grad_norm": 1.748762084080032, + "learning_rate": 9.878756554819937e-06, + "loss": 0.4143, + "step": 3384 + }, + { + "epoch": 0.1, + "grad_norm": 1.9729445443968125, + "learning_rate": 9.878653722689641e-06, + "loss": 0.4551, + "step": 3385 + }, + { + "epoch": 0.1, + "grad_norm": 1.7168550052633864, + "learning_rate": 9.878550847505172e-06, + "loss": 0.4355, + "step": 3386 + }, + { + "epoch": 0.1, + "grad_norm": 1.675599866560593, + "learning_rate": 9.878447929267436e-06, + "loss": 0.4203, + "step": 3387 + }, + { + "epoch": 0.1, + "grad_norm": 1.9793874872499395, + "learning_rate": 9.878344967977345e-06, + "loss": 0.4123, + "step": 3388 + }, + { + "epoch": 0.1, + "grad_norm": 1.5926810180626054, + "learning_rate": 9.878241963635805e-06, + "loss": 0.4117, + "step": 3389 + }, + { + "epoch": 0.1, + "grad_norm": 1.6427396129174325, + "learning_rate": 9.878138916243723e-06, + "loss": 0.4878, + "step": 3390 + }, + { + "epoch": 0.1, + "grad_norm": 1.5716335553831005, + "learning_rate": 9.878035825802012e-06, + "loss": 0.4515, + "step": 3391 + }, + { + "epoch": 0.1, + "grad_norm": 2.1032248943770604, + "learning_rate": 9.877932692311583e-06, + "loss": 0.4568, + "step": 3392 + }, + { + "epoch": 0.1, + "grad_norm": 1.9299722304344704, + "learning_rate": 9.877829515773342e-06, + "loss": 0.4443, + "step": 3393 + }, + { + "epoch": 0.1, + "grad_norm": 1.7038498103523942, + "learning_rate": 9.877726296188202e-06, + "loss": 0.6525, + "step": 3394 + }, + { + "epoch": 0.1, + "grad_norm": 1.6613582451342463, + "learning_rate": 9.877623033557073e-06, + "loss": 0.3964, + "step": 3395 + }, + { + "epoch": 0.1, + "grad_norm": 1.8886847557245527, + "learning_rate": 9.877519727880867e-06, + "loss": 0.4706, + "step": 3396 + }, + { + "epoch": 0.1, + "grad_norm": 1.8277148680486113, + "learning_rate": 9.877416379160496e-06, + "loss": 0.4115, + "step": 3397 + }, + { + "epoch": 0.1, + "grad_norm": 1.6374376844115799, + "learning_rate": 9.87731298739687e-06, + "loss": 0.4062, + "step": 3398 + }, + { + "epoch": 0.1, + "grad_norm": 1.696502975665547, + "learning_rate": 9.877209552590902e-06, + "loss": 0.4584, + "step": 3399 + }, + { + "epoch": 0.1, + "grad_norm": 0.954850759821852, + "learning_rate": 9.877106074743507e-06, + "loss": 0.6076, + "step": 3400 + }, + { + "epoch": 0.1, + "grad_norm": 2.013144382058694, + "learning_rate": 9.877002553855597e-06, + "loss": 0.4344, + "step": 3401 + }, + { + "epoch": 0.1, + "grad_norm": 2.7853377320869788, + "learning_rate": 9.876898989928085e-06, + "loss": 0.4465, + "step": 3402 + }, + { + "epoch": 0.1, + "grad_norm": 1.8591326552878025, + "learning_rate": 9.876795382961886e-06, + "loss": 0.4263, + "step": 3403 + }, + { + "epoch": 0.1, + "grad_norm": 1.6979612943179394, + "learning_rate": 9.876691732957913e-06, + "loss": 0.4493, + "step": 3404 + }, + { + "epoch": 0.1, + "grad_norm": 2.021818224350349, + "learning_rate": 9.876588039917082e-06, + "loss": 0.4273, + "step": 3405 + }, + { + "epoch": 0.1, + "grad_norm": 1.5478596754913538, + "learning_rate": 9.876484303840306e-06, + "loss": 0.4391, + "step": 3406 + }, + { + "epoch": 0.1, + "grad_norm": 1.5345925760506482, + "learning_rate": 9.876380524728501e-06, + "loss": 0.405, + "step": 3407 + }, + { + "epoch": 0.1, + "grad_norm": 1.5718906325968043, + "learning_rate": 9.876276702582586e-06, + "loss": 0.4387, + "step": 3408 + }, + { + "epoch": 0.1, + "grad_norm": 1.5997146973346064, + "learning_rate": 9.876172837403476e-06, + "loss": 0.4066, + "step": 3409 + }, + { + "epoch": 0.1, + "grad_norm": 1.5563539359044352, + "learning_rate": 9.876068929192085e-06, + "loss": 0.4494, + "step": 3410 + }, + { + "epoch": 0.1, + "grad_norm": 1.6721303669805179, + "learning_rate": 9.875964977949331e-06, + "loss": 0.432, + "step": 3411 + }, + { + "epoch": 0.1, + "grad_norm": 1.6026660930337464, + "learning_rate": 9.875860983676132e-06, + "loss": 0.4439, + "step": 3412 + }, + { + "epoch": 0.1, + "grad_norm": 1.5320463814981482, + "learning_rate": 9.875756946373408e-06, + "loss": 0.4227, + "step": 3413 + }, + { + "epoch": 0.1, + "grad_norm": 1.6712815043381046, + "learning_rate": 9.875652866042072e-06, + "loss": 0.4607, + "step": 3414 + }, + { + "epoch": 0.1, + "grad_norm": 1.730221688208388, + "learning_rate": 9.875548742683047e-06, + "loss": 0.4459, + "step": 3415 + }, + { + "epoch": 0.1, + "grad_norm": 1.8675795396161416, + "learning_rate": 9.875444576297249e-06, + "loss": 0.4359, + "step": 3416 + }, + { + "epoch": 0.1, + "grad_norm": 1.6269100765555393, + "learning_rate": 9.8753403668856e-06, + "loss": 0.4808, + "step": 3417 + }, + { + "epoch": 0.1, + "grad_norm": 1.7448415397169699, + "learning_rate": 9.875236114449017e-06, + "loss": 0.4454, + "step": 3418 + }, + { + "epoch": 0.1, + "grad_norm": 1.5656660936472935, + "learning_rate": 9.875131818988421e-06, + "loss": 0.4328, + "step": 3419 + }, + { + "epoch": 0.1, + "grad_norm": 2.018073618555013, + "learning_rate": 9.875027480504733e-06, + "loss": 0.4457, + "step": 3420 + }, + { + "epoch": 0.1, + "grad_norm": 1.8337104791386325, + "learning_rate": 9.874923098998874e-06, + "loss": 0.4462, + "step": 3421 + }, + { + "epoch": 0.1, + "grad_norm": 2.6653468283984356, + "learning_rate": 9.874818674471765e-06, + "loss": 0.4276, + "step": 3422 + }, + { + "epoch": 0.1, + "grad_norm": 5.022665642172762, + "learning_rate": 9.874714206924327e-06, + "loss": 0.458, + "step": 3423 + }, + { + "epoch": 0.1, + "grad_norm": 1.8634382404787297, + "learning_rate": 9.874609696357482e-06, + "loss": 0.4271, + "step": 3424 + }, + { + "epoch": 0.1, + "grad_norm": 1.8114342968216044, + "learning_rate": 9.87450514277215e-06, + "loss": 0.4087, + "step": 3425 + }, + { + "epoch": 0.1, + "grad_norm": 1.7794102777811502, + "learning_rate": 9.87440054616926e-06, + "loss": 0.4336, + "step": 3426 + }, + { + "epoch": 0.1, + "grad_norm": 1.4133257293804045, + "learning_rate": 9.874295906549728e-06, + "loss": 0.6047, + "step": 3427 + }, + { + "epoch": 0.1, + "grad_norm": 1.6713013299424522, + "learning_rate": 9.874191223914482e-06, + "loss": 0.4888, + "step": 3428 + }, + { + "epoch": 0.1, + "grad_norm": 1.7958778951033456, + "learning_rate": 9.874086498264444e-06, + "loss": 0.4618, + "step": 3429 + }, + { + "epoch": 0.1, + "grad_norm": 1.660850959014511, + "learning_rate": 9.873981729600539e-06, + "loss": 0.3944, + "step": 3430 + }, + { + "epoch": 0.1, + "grad_norm": 1.5649509517998617, + "learning_rate": 9.87387691792369e-06, + "loss": 0.4318, + "step": 3431 + }, + { + "epoch": 0.1, + "grad_norm": 1.573736520442553, + "learning_rate": 9.873772063234823e-06, + "loss": 0.4469, + "step": 3432 + }, + { + "epoch": 0.1, + "grad_norm": 1.6217798120676878, + "learning_rate": 9.873667165534864e-06, + "loss": 0.4352, + "step": 3433 + }, + { + "epoch": 0.1, + "grad_norm": 1.9711220063867385, + "learning_rate": 9.873562224824739e-06, + "loss": 0.4334, + "step": 3434 + }, + { + "epoch": 0.1, + "grad_norm": 1.6117718364128437, + "learning_rate": 9.873457241105372e-06, + "loss": 0.4169, + "step": 3435 + }, + { + "epoch": 0.1, + "grad_norm": 5.274919626569139, + "learning_rate": 9.873352214377693e-06, + "loss": 0.4373, + "step": 3436 + }, + { + "epoch": 0.1, + "grad_norm": 2.099282114219012, + "learning_rate": 9.873247144642627e-06, + "loss": 0.473, + "step": 3437 + }, + { + "epoch": 0.1, + "grad_norm": 1.80401692022957, + "learning_rate": 9.873142031901097e-06, + "loss": 0.4368, + "step": 3438 + }, + { + "epoch": 0.1, + "grad_norm": 1.634338552821972, + "learning_rate": 9.873036876154037e-06, + "loss": 0.4294, + "step": 3439 + }, + { + "epoch": 0.1, + "grad_norm": 1.6783895112420175, + "learning_rate": 9.872931677402372e-06, + "loss": 0.4146, + "step": 3440 + }, + { + "epoch": 0.1, + "grad_norm": 1.8092076034711277, + "learning_rate": 9.872826435647032e-06, + "loss": 0.4423, + "step": 3441 + }, + { + "epoch": 0.1, + "grad_norm": 2.4585252868439103, + "learning_rate": 9.872721150888942e-06, + "loss": 0.4274, + "step": 3442 + }, + { + "epoch": 0.1, + "grad_norm": 1.5515171519124917, + "learning_rate": 9.872615823129036e-06, + "loss": 0.4281, + "step": 3443 + }, + { + "epoch": 0.1, + "grad_norm": 1.6398754812550227, + "learning_rate": 9.87251045236824e-06, + "loss": 0.4006, + "step": 3444 + }, + { + "epoch": 0.1, + "grad_norm": 1.6744477349179134, + "learning_rate": 9.872405038607487e-06, + "loss": 0.4386, + "step": 3445 + }, + { + "epoch": 0.1, + "grad_norm": 1.6613901531283497, + "learning_rate": 9.872299581847703e-06, + "loss": 0.4008, + "step": 3446 + }, + { + "epoch": 0.1, + "grad_norm": 1.6434308614076065, + "learning_rate": 9.872194082089824e-06, + "loss": 0.443, + "step": 3447 + }, + { + "epoch": 0.1, + "grad_norm": 2.9439372351851976, + "learning_rate": 9.872088539334777e-06, + "loss": 0.4541, + "step": 3448 + }, + { + "epoch": 0.1, + "grad_norm": 1.6862589130930046, + "learning_rate": 9.871982953583494e-06, + "loss": 0.4543, + "step": 3449 + }, + { + "epoch": 0.1, + "grad_norm": 1.5977961534718874, + "learning_rate": 9.871877324836906e-06, + "loss": 0.3787, + "step": 3450 + }, + { + "epoch": 0.1, + "grad_norm": 2.2427356808380154, + "learning_rate": 9.87177165309595e-06, + "loss": 0.4094, + "step": 3451 + }, + { + "epoch": 0.1, + "grad_norm": 1.7159632208612174, + "learning_rate": 9.871665938361554e-06, + "loss": 0.4533, + "step": 3452 + }, + { + "epoch": 0.1, + "grad_norm": 1.58207621302842, + "learning_rate": 9.871560180634651e-06, + "loss": 0.3928, + "step": 3453 + }, + { + "epoch": 0.1, + "grad_norm": 2.4949092450434454, + "learning_rate": 9.871454379916177e-06, + "loss": 0.503, + "step": 3454 + }, + { + "epoch": 0.1, + "grad_norm": 1.7049219691768638, + "learning_rate": 9.871348536207061e-06, + "loss": 0.4343, + "step": 3455 + }, + { + "epoch": 0.1, + "grad_norm": 1.6928670960975314, + "learning_rate": 9.871242649508243e-06, + "loss": 0.4085, + "step": 3456 + }, + { + "epoch": 0.1, + "grad_norm": 1.598518599525816, + "learning_rate": 9.871136719820653e-06, + "loss": 0.4068, + "step": 3457 + }, + { + "epoch": 0.1, + "grad_norm": 11.504515505552138, + "learning_rate": 9.871030747145228e-06, + "loss": 0.4661, + "step": 3458 + }, + { + "epoch": 0.1, + "grad_norm": 2.097532975253543, + "learning_rate": 9.870924731482903e-06, + "loss": 0.4713, + "step": 3459 + }, + { + "epoch": 0.1, + "grad_norm": 1.8032838681869223, + "learning_rate": 9.870818672834612e-06, + "loss": 0.435, + "step": 3460 + }, + { + "epoch": 0.1, + "grad_norm": 1.617832409293424, + "learning_rate": 9.870712571201294e-06, + "loss": 0.4076, + "step": 3461 + }, + { + "epoch": 0.1, + "grad_norm": 2.1267483137585472, + "learning_rate": 9.870606426583881e-06, + "loss": 0.4615, + "step": 3462 + }, + { + "epoch": 0.1, + "grad_norm": 1.7336066948180278, + "learning_rate": 9.870500238983315e-06, + "loss": 0.4374, + "step": 3463 + }, + { + "epoch": 0.1, + "grad_norm": 1.7685053786796832, + "learning_rate": 9.87039400840053e-06, + "loss": 0.4611, + "step": 3464 + }, + { + "epoch": 0.1, + "grad_norm": 1.7420204054887751, + "learning_rate": 9.870287734836463e-06, + "loss": 0.4485, + "step": 3465 + }, + { + "epoch": 0.1, + "grad_norm": 1.6753409858314814, + "learning_rate": 9.870181418292053e-06, + "loss": 0.4538, + "step": 3466 + }, + { + "epoch": 0.1, + "grad_norm": 1.8642648949820646, + "learning_rate": 9.870075058768237e-06, + "loss": 0.4513, + "step": 3467 + }, + { + "epoch": 0.1, + "grad_norm": 1.5324172648955157, + "learning_rate": 9.869968656265955e-06, + "loss": 0.4334, + "step": 3468 + }, + { + "epoch": 0.1, + "grad_norm": 4.5431319091416364, + "learning_rate": 9.869862210786146e-06, + "loss": 0.4015, + "step": 3469 + }, + { + "epoch": 0.1, + "grad_norm": 2.0642161766716125, + "learning_rate": 9.869755722329748e-06, + "loss": 0.4311, + "step": 3470 + }, + { + "epoch": 0.1, + "grad_norm": 1.8743624996143402, + "learning_rate": 9.869649190897703e-06, + "loss": 0.3948, + "step": 3471 + }, + { + "epoch": 0.1, + "grad_norm": 1.7966217200061008, + "learning_rate": 9.869542616490951e-06, + "loss": 0.4304, + "step": 3472 + }, + { + "epoch": 0.1, + "grad_norm": 1.6827266211974818, + "learning_rate": 9.869435999110428e-06, + "loss": 0.4422, + "step": 3473 + }, + { + "epoch": 0.1, + "grad_norm": 1.643799327548662, + "learning_rate": 9.869329338757081e-06, + "loss": 0.4401, + "step": 3474 + }, + { + "epoch": 0.1, + "grad_norm": 1.8045687163271191, + "learning_rate": 9.869222635431847e-06, + "loss": 0.4731, + "step": 3475 + }, + { + "epoch": 0.1, + "grad_norm": 1.873914723551838, + "learning_rate": 9.86911588913567e-06, + "loss": 0.4181, + "step": 3476 + }, + { + "epoch": 0.1, + "grad_norm": 5.856805401760571, + "learning_rate": 9.86900909986949e-06, + "loss": 0.4604, + "step": 3477 + }, + { + "epoch": 0.1, + "grad_norm": 2.091849138186663, + "learning_rate": 9.868902267634253e-06, + "loss": 0.4599, + "step": 3478 + }, + { + "epoch": 0.1, + "grad_norm": 1.5314968421815431, + "learning_rate": 9.868795392430899e-06, + "loss": 0.4185, + "step": 3479 + }, + { + "epoch": 0.1, + "grad_norm": 5.171695208475087, + "learning_rate": 9.86868847426037e-06, + "loss": 0.4129, + "step": 3480 + }, + { + "epoch": 0.1, + "grad_norm": 1.657569030824905, + "learning_rate": 9.868581513123612e-06, + "loss": 0.4262, + "step": 3481 + }, + { + "epoch": 0.1, + "grad_norm": 1.519647233991108, + "learning_rate": 9.868474509021569e-06, + "loss": 0.4108, + "step": 3482 + }, + { + "epoch": 0.1, + "grad_norm": 1.7404944951138368, + "learning_rate": 9.868367461955185e-06, + "loss": 0.4705, + "step": 3483 + }, + { + "epoch": 0.1, + "grad_norm": 1.528067493227226, + "learning_rate": 9.868260371925402e-06, + "loss": 0.4199, + "step": 3484 + }, + { + "epoch": 0.1, + "grad_norm": 1.6974811818334974, + "learning_rate": 9.868153238933168e-06, + "loss": 0.3874, + "step": 3485 + }, + { + "epoch": 0.1, + "grad_norm": 1.8240206362559126, + "learning_rate": 9.868046062979427e-06, + "loss": 0.4677, + "step": 3486 + }, + { + "epoch": 0.1, + "grad_norm": 1.8714229183956255, + "learning_rate": 9.867938844065127e-06, + "loss": 0.4293, + "step": 3487 + }, + { + "epoch": 0.1, + "grad_norm": 1.7768498607604621, + "learning_rate": 9.867831582191212e-06, + "loss": 0.4135, + "step": 3488 + }, + { + "epoch": 0.1, + "grad_norm": 1.7121992808405049, + "learning_rate": 9.867724277358627e-06, + "loss": 0.3934, + "step": 3489 + }, + { + "epoch": 0.1, + "grad_norm": 1.6762713351212724, + "learning_rate": 9.867616929568324e-06, + "loss": 0.4392, + "step": 3490 + }, + { + "epoch": 0.1, + "grad_norm": 1.508326971146406, + "learning_rate": 9.867509538821249e-06, + "loss": 0.4085, + "step": 3491 + }, + { + "epoch": 0.1, + "grad_norm": 1.7060944847731947, + "learning_rate": 9.867402105118346e-06, + "loss": 0.4426, + "step": 3492 + }, + { + "epoch": 0.1, + "grad_norm": 1.9627756841916488, + "learning_rate": 9.867294628460566e-06, + "loss": 0.4676, + "step": 3493 + }, + { + "epoch": 0.1, + "grad_norm": 2.600175846946604, + "learning_rate": 9.867187108848857e-06, + "loss": 0.4736, + "step": 3494 + }, + { + "epoch": 0.1, + "grad_norm": 1.8053166792392674, + "learning_rate": 9.867079546284167e-06, + "loss": 0.4507, + "step": 3495 + }, + { + "epoch": 0.1, + "grad_norm": 1.6580311323794326, + "learning_rate": 9.866971940767447e-06, + "loss": 0.3823, + "step": 3496 + }, + { + "epoch": 0.1, + "grad_norm": 1.802638305321211, + "learning_rate": 9.866864292299646e-06, + "loss": 0.4242, + "step": 3497 + }, + { + "epoch": 0.1, + "grad_norm": 1.764683027036177, + "learning_rate": 9.86675660088171e-06, + "loss": 0.4725, + "step": 3498 + }, + { + "epoch": 0.1, + "grad_norm": 1.5363918942600578, + "learning_rate": 9.866648866514596e-06, + "loss": 0.4182, + "step": 3499 + }, + { + "epoch": 0.1, + "grad_norm": 1.763736510898274, + "learning_rate": 9.866541089199252e-06, + "loss": 0.4511, + "step": 3500 + }, + { + "epoch": 0.1, + "grad_norm": 1.6928265418748987, + "learning_rate": 9.866433268936628e-06, + "loss": 0.4365, + "step": 3501 + }, + { + "epoch": 0.1, + "grad_norm": 1.773701073226327, + "learning_rate": 9.866325405727678e-06, + "loss": 0.4157, + "step": 3502 + }, + { + "epoch": 0.1, + "grad_norm": 1.875323458000657, + "learning_rate": 9.866217499573351e-06, + "loss": 0.4532, + "step": 3503 + }, + { + "epoch": 0.1, + "grad_norm": 1.5738031344841883, + "learning_rate": 9.866109550474601e-06, + "loss": 0.4231, + "step": 3504 + }, + { + "epoch": 0.1, + "grad_norm": 1.6551097043966456, + "learning_rate": 9.86600155843238e-06, + "loss": 0.4239, + "step": 3505 + }, + { + "epoch": 0.1, + "grad_norm": 1.7320601934618671, + "learning_rate": 9.865893523447641e-06, + "loss": 0.48, + "step": 3506 + }, + { + "epoch": 0.1, + "grad_norm": 1.7018229555194277, + "learning_rate": 9.865785445521338e-06, + "loss": 0.4131, + "step": 3507 + }, + { + "epoch": 0.1, + "grad_norm": 1.6318405013106994, + "learning_rate": 9.865677324654425e-06, + "loss": 0.4208, + "step": 3508 + }, + { + "epoch": 0.1, + "grad_norm": 1.786942849267072, + "learning_rate": 9.865569160847855e-06, + "loss": 0.4291, + "step": 3509 + }, + { + "epoch": 0.1, + "grad_norm": 1.5260460061656989, + "learning_rate": 9.865460954102582e-06, + "loss": 0.4223, + "step": 3510 + }, + { + "epoch": 0.1, + "grad_norm": 1.5509969268841328, + "learning_rate": 9.865352704419564e-06, + "loss": 0.4399, + "step": 3511 + }, + { + "epoch": 0.1, + "grad_norm": 2.0602120550603544, + "learning_rate": 9.865244411799754e-06, + "loss": 0.4215, + "step": 3512 + }, + { + "epoch": 0.1, + "grad_norm": 1.6064626502240202, + "learning_rate": 9.865136076244109e-06, + "loss": 0.4234, + "step": 3513 + }, + { + "epoch": 0.1, + "grad_norm": 2.4624145634014534, + "learning_rate": 9.865027697753582e-06, + "loss": 0.4276, + "step": 3514 + }, + { + "epoch": 0.1, + "grad_norm": 1.5814360303011579, + "learning_rate": 9.864919276329132e-06, + "loss": 0.4388, + "step": 3515 + }, + { + "epoch": 0.1, + "grad_norm": 1.784285063990952, + "learning_rate": 9.864810811971717e-06, + "loss": 0.42, + "step": 3516 + }, + { + "epoch": 0.1, + "grad_norm": 5.1174378116629935, + "learning_rate": 9.86470230468229e-06, + "loss": 0.4671, + "step": 3517 + }, + { + "epoch": 0.1, + "grad_norm": 1.8030681820904388, + "learning_rate": 9.864593754461814e-06, + "loss": 0.4197, + "step": 3518 + }, + { + "epoch": 0.1, + "grad_norm": 2.0063051540034778, + "learning_rate": 9.864485161311242e-06, + "loss": 0.4289, + "step": 3519 + }, + { + "epoch": 0.1, + "grad_norm": 1.453010821871173, + "learning_rate": 9.864376525231537e-06, + "loss": 0.4259, + "step": 3520 + }, + { + "epoch": 0.1, + "grad_norm": 1.4646582297234558, + "learning_rate": 9.864267846223652e-06, + "loss": 0.4169, + "step": 3521 + }, + { + "epoch": 0.1, + "grad_norm": 1.5537537021027559, + "learning_rate": 9.864159124288552e-06, + "loss": 0.4107, + "step": 3522 + }, + { + "epoch": 0.1, + "grad_norm": 1.0660274652006596, + "learning_rate": 9.864050359427194e-06, + "loss": 0.5991, + "step": 3523 + }, + { + "epoch": 0.1, + "grad_norm": 2.107532903968669, + "learning_rate": 9.863941551640535e-06, + "loss": 0.4569, + "step": 3524 + }, + { + "epoch": 0.1, + "grad_norm": 2.0179302905810577, + "learning_rate": 9.863832700929538e-06, + "loss": 0.4087, + "step": 3525 + }, + { + "epoch": 0.1, + "grad_norm": 1.8956508548670254, + "learning_rate": 9.863723807295165e-06, + "loss": 0.421, + "step": 3526 + }, + { + "epoch": 0.1, + "grad_norm": 1.980080072671125, + "learning_rate": 9.863614870738375e-06, + "loss": 0.4125, + "step": 3527 + }, + { + "epoch": 0.1, + "grad_norm": 1.5928338014805088, + "learning_rate": 9.86350589126013e-06, + "loss": 0.4481, + "step": 3528 + }, + { + "epoch": 0.1, + "grad_norm": 1.7833546659495947, + "learning_rate": 9.863396868861393e-06, + "loss": 0.4168, + "step": 3529 + }, + { + "epoch": 0.1, + "grad_norm": 1.6509441727977507, + "learning_rate": 9.863287803543122e-06, + "loss": 0.4389, + "step": 3530 + }, + { + "epoch": 0.1, + "grad_norm": 1.9804293013992627, + "learning_rate": 9.863178695306284e-06, + "loss": 0.4414, + "step": 3531 + }, + { + "epoch": 0.1, + "grad_norm": 1.7658338602865373, + "learning_rate": 9.863069544151838e-06, + "loss": 0.4501, + "step": 3532 + }, + { + "epoch": 0.1, + "grad_norm": 1.6752243743284452, + "learning_rate": 9.86296035008075e-06, + "loss": 0.4576, + "step": 3533 + }, + { + "epoch": 0.1, + "grad_norm": 1.6658713899045694, + "learning_rate": 9.862851113093985e-06, + "loss": 0.4323, + "step": 3534 + }, + { + "epoch": 0.1, + "grad_norm": 1.6649943386128123, + "learning_rate": 9.862741833192504e-06, + "loss": 0.4228, + "step": 3535 + }, + { + "epoch": 0.1, + "grad_norm": 1.5250650048762624, + "learning_rate": 9.862632510377272e-06, + "loss": 0.423, + "step": 3536 + }, + { + "epoch": 0.1, + "grad_norm": 1.592446689048901, + "learning_rate": 9.862523144649253e-06, + "loss": 0.4113, + "step": 3537 + }, + { + "epoch": 0.1, + "grad_norm": 1.796893725273209, + "learning_rate": 9.862413736009414e-06, + "loss": 0.4392, + "step": 3538 + }, + { + "epoch": 0.1, + "grad_norm": 1.9820993652369312, + "learning_rate": 9.862304284458722e-06, + "loss": 0.4121, + "step": 3539 + }, + { + "epoch": 0.1, + "grad_norm": 1.4851819564072781, + "learning_rate": 9.862194789998138e-06, + "loss": 0.4181, + "step": 3540 + }, + { + "epoch": 0.1, + "grad_norm": 1.6373551825340495, + "learning_rate": 9.862085252628633e-06, + "loss": 0.4453, + "step": 3541 + }, + { + "epoch": 0.1, + "grad_norm": 2.38626611499603, + "learning_rate": 9.861975672351172e-06, + "loss": 0.4368, + "step": 3542 + }, + { + "epoch": 0.1, + "grad_norm": 1.5440807756685169, + "learning_rate": 9.86186604916672e-06, + "loss": 0.4103, + "step": 3543 + }, + { + "epoch": 0.1, + "grad_norm": 1.8535915147494784, + "learning_rate": 9.861756383076247e-06, + "loss": 0.4755, + "step": 3544 + }, + { + "epoch": 0.1, + "grad_norm": 1.9950804367683037, + "learning_rate": 9.86164667408072e-06, + "loss": 0.4663, + "step": 3545 + }, + { + "epoch": 0.1, + "grad_norm": 1.50207500053409, + "learning_rate": 9.861536922181106e-06, + "loss": 0.403, + "step": 3546 + }, + { + "epoch": 0.1, + "grad_norm": 2.0253175818548215, + "learning_rate": 9.861427127378376e-06, + "loss": 0.4212, + "step": 3547 + }, + { + "epoch": 0.1, + "grad_norm": 1.7593555896950772, + "learning_rate": 9.861317289673497e-06, + "loss": 0.439, + "step": 3548 + }, + { + "epoch": 0.1, + "grad_norm": 1.9042262874004863, + "learning_rate": 9.861207409067439e-06, + "loss": 0.4317, + "step": 3549 + }, + { + "epoch": 0.1, + "grad_norm": 1.5326274378957199, + "learning_rate": 9.861097485561172e-06, + "loss": 0.3947, + "step": 3550 + }, + { + "epoch": 0.1, + "grad_norm": 1.463602507869068, + "learning_rate": 9.860987519155665e-06, + "loss": 0.3779, + "step": 3551 + }, + { + "epoch": 0.1, + "grad_norm": 1.6876663565998196, + "learning_rate": 9.860877509851892e-06, + "loss": 0.4557, + "step": 3552 + }, + { + "epoch": 0.1, + "grad_norm": 1.5902964498981116, + "learning_rate": 9.860767457650819e-06, + "loss": 0.4085, + "step": 3553 + }, + { + "epoch": 0.1, + "grad_norm": 1.7156994966598245, + "learning_rate": 9.860657362553418e-06, + "loss": 0.4217, + "step": 3554 + }, + { + "epoch": 0.1, + "grad_norm": 1.8365026302756102, + "learning_rate": 9.860547224560664e-06, + "loss": 0.3933, + "step": 3555 + }, + { + "epoch": 0.1, + "grad_norm": 1.7097047673263224, + "learning_rate": 9.860437043673524e-06, + "loss": 0.4185, + "step": 3556 + }, + { + "epoch": 0.1, + "grad_norm": 1.701126679472636, + "learning_rate": 9.860326819892977e-06, + "loss": 0.4124, + "step": 3557 + }, + { + "epoch": 0.1, + "grad_norm": 1.775326430572021, + "learning_rate": 9.86021655321999e-06, + "loss": 0.4233, + "step": 3558 + }, + { + "epoch": 0.1, + "grad_norm": 1.9676030131409263, + "learning_rate": 9.860106243655538e-06, + "loss": 0.3958, + "step": 3559 + }, + { + "epoch": 0.1, + "grad_norm": 1.5628724189936, + "learning_rate": 9.859995891200594e-06, + "loss": 0.4509, + "step": 3560 + }, + { + "epoch": 0.1, + "grad_norm": 1.6827973252083894, + "learning_rate": 9.859885495856132e-06, + "loss": 0.4429, + "step": 3561 + }, + { + "epoch": 0.1, + "grad_norm": 1.7576024858702317, + "learning_rate": 9.85977505762313e-06, + "loss": 0.474, + "step": 3562 + }, + { + "epoch": 0.1, + "grad_norm": 1.8081945580852976, + "learning_rate": 9.859664576502555e-06, + "loss": 0.4207, + "step": 3563 + }, + { + "epoch": 0.1, + "grad_norm": 1.524905884844047, + "learning_rate": 9.859554052495387e-06, + "loss": 0.4011, + "step": 3564 + }, + { + "epoch": 0.1, + "grad_norm": 1.5382438684566864, + "learning_rate": 9.859443485602603e-06, + "loss": 0.4383, + "step": 3565 + }, + { + "epoch": 0.1, + "grad_norm": 1.793999208601125, + "learning_rate": 9.859332875825174e-06, + "loss": 0.4329, + "step": 3566 + }, + { + "epoch": 0.1, + "grad_norm": 1.6151875417773165, + "learning_rate": 9.85922222316408e-06, + "loss": 0.4242, + "step": 3567 + }, + { + "epoch": 0.1, + "grad_norm": 1.532321375213675, + "learning_rate": 9.859111527620296e-06, + "loss": 0.4238, + "step": 3568 + }, + { + "epoch": 0.1, + "grad_norm": 1.6952654672576497, + "learning_rate": 9.859000789194797e-06, + "loss": 0.4207, + "step": 3569 + }, + { + "epoch": 0.1, + "grad_norm": 1.6092484243320242, + "learning_rate": 9.858890007888563e-06, + "loss": 0.416, + "step": 3570 + }, + { + "epoch": 0.1, + "grad_norm": 1.5862481597748856, + "learning_rate": 9.858779183702571e-06, + "loss": 0.4378, + "step": 3571 + }, + { + "epoch": 0.1, + "grad_norm": 1.6567098528938269, + "learning_rate": 9.858668316637799e-06, + "loss": 0.4418, + "step": 3572 + }, + { + "epoch": 0.1, + "grad_norm": 1.505567286445591, + "learning_rate": 9.858557406695226e-06, + "loss": 0.404, + "step": 3573 + }, + { + "epoch": 0.1, + "grad_norm": 1.545937258256914, + "learning_rate": 9.858446453875828e-06, + "loss": 0.3873, + "step": 3574 + }, + { + "epoch": 0.1, + "grad_norm": 1.5433861687272534, + "learning_rate": 9.858335458180586e-06, + "loss": 0.4349, + "step": 3575 + }, + { + "epoch": 0.1, + "grad_norm": 1.4368531493257903, + "learning_rate": 9.858224419610483e-06, + "loss": 0.4043, + "step": 3576 + }, + { + "epoch": 0.1, + "grad_norm": 1.6323416632153485, + "learning_rate": 9.858113338166493e-06, + "loss": 0.3963, + "step": 3577 + }, + { + "epoch": 0.1, + "grad_norm": 1.7945360003665187, + "learning_rate": 9.858002213849598e-06, + "loss": 0.4157, + "step": 3578 + }, + { + "epoch": 0.1, + "grad_norm": 1.825813290327355, + "learning_rate": 9.857891046660781e-06, + "loss": 0.4224, + "step": 3579 + }, + { + "epoch": 0.1, + "grad_norm": 1.6760346561175472, + "learning_rate": 9.857779836601023e-06, + "loss": 0.4248, + "step": 3580 + }, + { + "epoch": 0.1, + "grad_norm": 2.595782527017667, + "learning_rate": 9.857668583671301e-06, + "loss": 0.4707, + "step": 3581 + }, + { + "epoch": 0.1, + "grad_norm": 1.5313763805445475, + "learning_rate": 9.857557287872602e-06, + "loss": 0.402, + "step": 3582 + }, + { + "epoch": 0.1, + "grad_norm": 1.7329720206056147, + "learning_rate": 9.857445949205906e-06, + "loss": 0.4323, + "step": 3583 + }, + { + "epoch": 0.1, + "grad_norm": 1.7798462033096178, + "learning_rate": 9.857334567672194e-06, + "loss": 0.4501, + "step": 3584 + }, + { + "epoch": 0.1, + "grad_norm": 1.8071368422481962, + "learning_rate": 9.857223143272452e-06, + "loss": 0.4133, + "step": 3585 + }, + { + "epoch": 0.1, + "grad_norm": 4.395173216977805, + "learning_rate": 9.857111676007662e-06, + "loss": 0.4568, + "step": 3586 + }, + { + "epoch": 0.1, + "grad_norm": 1.149737023527002, + "learning_rate": 9.857000165878808e-06, + "loss": 0.621, + "step": 3587 + }, + { + "epoch": 0.1, + "grad_norm": 1.7576527242735223, + "learning_rate": 9.856888612886872e-06, + "loss": 0.4138, + "step": 3588 + }, + { + "epoch": 0.1, + "grad_norm": 1.7139669728808975, + "learning_rate": 9.85677701703284e-06, + "loss": 0.4337, + "step": 3589 + }, + { + "epoch": 0.1, + "grad_norm": 1.5616522670728556, + "learning_rate": 9.856665378317699e-06, + "loss": 0.4079, + "step": 3590 + }, + { + "epoch": 0.1, + "grad_norm": 1.5087574731537918, + "learning_rate": 9.85655369674243e-06, + "loss": 0.4618, + "step": 3591 + }, + { + "epoch": 0.1, + "grad_norm": 1.5849584035313167, + "learning_rate": 9.856441972308022e-06, + "loss": 0.4497, + "step": 3592 + }, + { + "epoch": 0.1, + "grad_norm": 1.6964985110697453, + "learning_rate": 9.85633020501546e-06, + "loss": 0.4467, + "step": 3593 + }, + { + "epoch": 0.1, + "grad_norm": 2.284739500462478, + "learning_rate": 9.856218394865728e-06, + "loss": 0.3996, + "step": 3594 + }, + { + "epoch": 0.1, + "grad_norm": 1.6199493993637382, + "learning_rate": 9.856106541859818e-06, + "loss": 0.4352, + "step": 3595 + }, + { + "epoch": 0.1, + "grad_norm": 1.7544589998056441, + "learning_rate": 9.855994645998712e-06, + "loss": 0.4482, + "step": 3596 + }, + { + "epoch": 0.1, + "grad_norm": 1.6892726723769989, + "learning_rate": 9.8558827072834e-06, + "loss": 0.4621, + "step": 3597 + }, + { + "epoch": 0.1, + "grad_norm": 1.991218634683487, + "learning_rate": 9.855770725714869e-06, + "loss": 0.4325, + "step": 3598 + }, + { + "epoch": 0.1, + "grad_norm": 1.7782185933446468, + "learning_rate": 9.855658701294106e-06, + "loss": 0.4266, + "step": 3599 + }, + { + "epoch": 0.1, + "grad_norm": 1.6590242343354202, + "learning_rate": 9.855546634022101e-06, + "loss": 0.4038, + "step": 3600 + }, + { + "epoch": 0.1, + "grad_norm": 1.8115865352675682, + "learning_rate": 9.855434523899846e-06, + "loss": 0.4333, + "step": 3601 + }, + { + "epoch": 0.1, + "grad_norm": 1.5748285617833746, + "learning_rate": 9.855322370928327e-06, + "loss": 0.4084, + "step": 3602 + }, + { + "epoch": 0.1, + "grad_norm": 1.9433870617480604, + "learning_rate": 9.855210175108531e-06, + "loss": 0.4267, + "step": 3603 + }, + { + "epoch": 0.1, + "grad_norm": 1.5837882446576714, + "learning_rate": 9.855097936441454e-06, + "loss": 0.4104, + "step": 3604 + }, + { + "epoch": 0.1, + "grad_norm": 1.907559247061491, + "learning_rate": 9.854985654928084e-06, + "loss": 0.4115, + "step": 3605 + }, + { + "epoch": 0.1, + "grad_norm": 3.021874270524371, + "learning_rate": 9.854873330569411e-06, + "loss": 0.4565, + "step": 3606 + }, + { + "epoch": 0.1, + "grad_norm": 1.6102036327006688, + "learning_rate": 9.854760963366427e-06, + "loss": 0.4188, + "step": 3607 + }, + { + "epoch": 0.1, + "grad_norm": 2.2029364088838297, + "learning_rate": 9.854648553320124e-06, + "loss": 0.389, + "step": 3608 + }, + { + "epoch": 0.1, + "grad_norm": 1.8483563986545009, + "learning_rate": 9.854536100431492e-06, + "loss": 0.4013, + "step": 3609 + }, + { + "epoch": 0.1, + "grad_norm": 1.5022189998456426, + "learning_rate": 9.854423604701528e-06, + "loss": 0.4242, + "step": 3610 + }, + { + "epoch": 0.1, + "grad_norm": 1.4681810699979712, + "learning_rate": 9.85431106613122e-06, + "loss": 0.4299, + "step": 3611 + }, + { + "epoch": 0.1, + "grad_norm": 1.7397316812531722, + "learning_rate": 9.854198484721564e-06, + "loss": 0.4653, + "step": 3612 + }, + { + "epoch": 0.1, + "grad_norm": 1.638815271046899, + "learning_rate": 9.854085860473551e-06, + "loss": 0.4184, + "step": 3613 + }, + { + "epoch": 0.1, + "grad_norm": 1.7221197723642938, + "learning_rate": 9.853973193388178e-06, + "loss": 0.4612, + "step": 3614 + }, + { + "epoch": 0.1, + "grad_norm": 1.641213767374213, + "learning_rate": 9.853860483466436e-06, + "loss": 0.3773, + "step": 3615 + }, + { + "epoch": 0.1, + "grad_norm": 2.151005519434221, + "learning_rate": 9.853747730709322e-06, + "loss": 0.4232, + "step": 3616 + }, + { + "epoch": 0.1, + "grad_norm": 1.7848524562032775, + "learning_rate": 9.853634935117831e-06, + "loss": 0.4679, + "step": 3617 + }, + { + "epoch": 0.1, + "grad_norm": 1.6706202101668495, + "learning_rate": 9.853522096692958e-06, + "loss": 0.4267, + "step": 3618 + }, + { + "epoch": 0.1, + "grad_norm": 1.6736612878595507, + "learning_rate": 9.853409215435699e-06, + "loss": 0.4325, + "step": 3619 + }, + { + "epoch": 0.1, + "grad_norm": 1.7831761529572865, + "learning_rate": 9.853296291347048e-06, + "loss": 0.4407, + "step": 3620 + }, + { + "epoch": 0.11, + "grad_norm": 1.526433811416281, + "learning_rate": 9.853183324428006e-06, + "loss": 0.3757, + "step": 3621 + }, + { + "epoch": 0.11, + "grad_norm": 1.9050494743266193, + "learning_rate": 9.853070314679565e-06, + "loss": 0.4298, + "step": 3622 + }, + { + "epoch": 0.11, + "grad_norm": 1.8674109595175916, + "learning_rate": 9.852957262102725e-06, + "loss": 0.4015, + "step": 3623 + }, + { + "epoch": 0.11, + "grad_norm": 1.923651182227452, + "learning_rate": 9.852844166698485e-06, + "loss": 0.4491, + "step": 3624 + }, + { + "epoch": 0.11, + "grad_norm": 1.81379537757563, + "learning_rate": 9.852731028467838e-06, + "loss": 0.4053, + "step": 3625 + }, + { + "epoch": 0.11, + "grad_norm": 1.7165242788652009, + "learning_rate": 9.852617847411789e-06, + "loss": 0.3945, + "step": 3626 + }, + { + "epoch": 0.11, + "grad_norm": 2.086261669770334, + "learning_rate": 9.852504623531334e-06, + "loss": 0.4788, + "step": 3627 + }, + { + "epoch": 0.11, + "grad_norm": 1.6945083375523289, + "learning_rate": 9.85239135682747e-06, + "loss": 0.4878, + "step": 3628 + }, + { + "epoch": 0.11, + "grad_norm": 2.9796642102235484, + "learning_rate": 9.852278047301198e-06, + "loss": 0.4284, + "step": 3629 + }, + { + "epoch": 0.11, + "grad_norm": 1.681456451188162, + "learning_rate": 9.85216469495352e-06, + "loss": 0.4153, + "step": 3630 + }, + { + "epoch": 0.11, + "grad_norm": 1.5876835766820157, + "learning_rate": 9.852051299785434e-06, + "loss": 0.4077, + "step": 3631 + }, + { + "epoch": 0.11, + "grad_norm": 4.051730514465605, + "learning_rate": 9.851937861797942e-06, + "loss": 0.4085, + "step": 3632 + }, + { + "epoch": 0.11, + "grad_norm": 1.6435461114813055, + "learning_rate": 9.851824380992043e-06, + "loss": 0.4138, + "step": 3633 + }, + { + "epoch": 0.11, + "grad_norm": 1.9863661979007203, + "learning_rate": 9.851710857368741e-06, + "loss": 0.4381, + "step": 3634 + }, + { + "epoch": 0.11, + "grad_norm": 2.0193895966141904, + "learning_rate": 9.851597290929038e-06, + "loss": 0.4163, + "step": 3635 + }, + { + "epoch": 0.11, + "grad_norm": 1.8943346220836326, + "learning_rate": 9.851483681673934e-06, + "loss": 0.4192, + "step": 3636 + }, + { + "epoch": 0.11, + "grad_norm": 16.532670595392638, + "learning_rate": 9.851370029604432e-06, + "loss": 0.4102, + "step": 3637 + }, + { + "epoch": 0.11, + "grad_norm": 1.5677228408414168, + "learning_rate": 9.851256334721537e-06, + "loss": 0.4325, + "step": 3638 + }, + { + "epoch": 0.11, + "grad_norm": 1.5829356066550673, + "learning_rate": 9.851142597026251e-06, + "loss": 0.4283, + "step": 3639 + }, + { + "epoch": 0.11, + "grad_norm": 1.803159691961414, + "learning_rate": 9.851028816519577e-06, + "loss": 0.4744, + "step": 3640 + }, + { + "epoch": 0.11, + "grad_norm": 1.7815571496031506, + "learning_rate": 9.85091499320252e-06, + "loss": 0.4683, + "step": 3641 + }, + { + "epoch": 0.11, + "grad_norm": 1.5664364068212662, + "learning_rate": 9.850801127076086e-06, + "loss": 0.4208, + "step": 3642 + }, + { + "epoch": 0.11, + "grad_norm": 1.7358114507965212, + "learning_rate": 9.850687218141275e-06, + "loss": 0.4543, + "step": 3643 + }, + { + "epoch": 0.11, + "grad_norm": 1.8116340464269667, + "learning_rate": 9.850573266399099e-06, + "loss": 0.3946, + "step": 3644 + }, + { + "epoch": 0.11, + "grad_norm": 1.803612784243514, + "learning_rate": 9.850459271850558e-06, + "loss": 0.4396, + "step": 3645 + }, + { + "epoch": 0.11, + "grad_norm": 1.7001269742642058, + "learning_rate": 9.85034523449666e-06, + "loss": 0.4381, + "step": 3646 + }, + { + "epoch": 0.11, + "grad_norm": 1.6293717170566706, + "learning_rate": 9.850231154338413e-06, + "loss": 0.4004, + "step": 3647 + }, + { + "epoch": 0.11, + "grad_norm": 1.7259949083234636, + "learning_rate": 9.85011703137682e-06, + "loss": 0.4068, + "step": 3648 + }, + { + "epoch": 0.11, + "grad_norm": 1.9486748223783852, + "learning_rate": 9.85000286561289e-06, + "loss": 0.4524, + "step": 3649 + }, + { + "epoch": 0.11, + "grad_norm": 1.8306434921029884, + "learning_rate": 9.849888657047636e-06, + "loss": 0.4274, + "step": 3650 + }, + { + "epoch": 0.11, + "grad_norm": 1.8662417787777588, + "learning_rate": 9.849774405682055e-06, + "loss": 0.4468, + "step": 3651 + }, + { + "epoch": 0.11, + "grad_norm": 2.7139664213943346, + "learning_rate": 9.849660111517165e-06, + "loss": 0.4255, + "step": 3652 + }, + { + "epoch": 0.11, + "grad_norm": 1.7671560143620644, + "learning_rate": 9.849545774553969e-06, + "loss": 0.4369, + "step": 3653 + }, + { + "epoch": 0.11, + "grad_norm": 1.9281150382129628, + "learning_rate": 9.849431394793477e-06, + "loss": 0.4236, + "step": 3654 + }, + { + "epoch": 0.11, + "grad_norm": 1.4801498833208881, + "learning_rate": 9.8493169722367e-06, + "loss": 0.4371, + "step": 3655 + }, + { + "epoch": 0.11, + "grad_norm": 2.0235175485987376, + "learning_rate": 9.849202506884647e-06, + "loss": 0.4371, + "step": 3656 + }, + { + "epoch": 0.11, + "grad_norm": 1.587577011130212, + "learning_rate": 9.849087998738328e-06, + "loss": 0.4361, + "step": 3657 + }, + { + "epoch": 0.11, + "grad_norm": 1.8011943100673835, + "learning_rate": 9.848973447798753e-06, + "loss": 0.4642, + "step": 3658 + }, + { + "epoch": 0.11, + "grad_norm": 1.7813832289129623, + "learning_rate": 9.848858854066936e-06, + "loss": 0.4193, + "step": 3659 + }, + { + "epoch": 0.11, + "grad_norm": 1.5812758579182218, + "learning_rate": 9.848744217543883e-06, + "loss": 0.3903, + "step": 3660 + }, + { + "epoch": 0.11, + "grad_norm": 1.8667545380563058, + "learning_rate": 9.84862953823061e-06, + "loss": 0.4264, + "step": 3661 + }, + { + "epoch": 0.11, + "grad_norm": 1.6235966070816052, + "learning_rate": 9.848514816128126e-06, + "loss": 0.4061, + "step": 3662 + }, + { + "epoch": 0.11, + "grad_norm": 1.5829068452229493, + "learning_rate": 9.848400051237445e-06, + "loss": 0.4416, + "step": 3663 + }, + { + "epoch": 0.11, + "grad_norm": 1.7832878982348956, + "learning_rate": 9.848285243559581e-06, + "loss": 0.4472, + "step": 3664 + }, + { + "epoch": 0.11, + "grad_norm": 1.5859263778948511, + "learning_rate": 9.848170393095546e-06, + "loss": 0.419, + "step": 3665 + }, + { + "epoch": 0.11, + "grad_norm": 2.1775883656885817, + "learning_rate": 9.848055499846355e-06, + "loss": 0.4079, + "step": 3666 + }, + { + "epoch": 0.11, + "grad_norm": 1.840121168403647, + "learning_rate": 9.847940563813018e-06, + "loss": 0.4473, + "step": 3667 + }, + { + "epoch": 0.11, + "grad_norm": 1.547663726339005, + "learning_rate": 9.847825584996552e-06, + "loss": 0.3979, + "step": 3668 + }, + { + "epoch": 0.11, + "grad_norm": 1.7649325624948, + "learning_rate": 9.847710563397973e-06, + "loss": 0.459, + "step": 3669 + }, + { + "epoch": 0.11, + "grad_norm": 1.6597078067533835, + "learning_rate": 9.847595499018292e-06, + "loss": 0.4143, + "step": 3670 + }, + { + "epoch": 0.11, + "grad_norm": 1.7544891148478712, + "learning_rate": 9.847480391858528e-06, + "loss": 0.4571, + "step": 3671 + }, + { + "epoch": 0.11, + "grad_norm": 1.5763992379368243, + "learning_rate": 9.847365241919697e-06, + "loss": 0.3932, + "step": 3672 + }, + { + "epoch": 0.11, + "grad_norm": 1.1865138477641326, + "learning_rate": 9.847250049202812e-06, + "loss": 0.6211, + "step": 3673 + }, + { + "epoch": 0.11, + "grad_norm": 1.6624224664352207, + "learning_rate": 9.847134813708891e-06, + "loss": 0.4383, + "step": 3674 + }, + { + "epoch": 0.11, + "grad_norm": 1.6897333512125527, + "learning_rate": 9.847019535438955e-06, + "loss": 0.4515, + "step": 3675 + }, + { + "epoch": 0.11, + "grad_norm": 1.5699490955896218, + "learning_rate": 9.846904214394015e-06, + "loss": 0.3853, + "step": 3676 + }, + { + "epoch": 0.11, + "grad_norm": 1.586547171939866, + "learning_rate": 9.846788850575092e-06, + "loss": 0.4418, + "step": 3677 + }, + { + "epoch": 0.11, + "grad_norm": 1.6904446075941477, + "learning_rate": 9.846673443983205e-06, + "loss": 0.4196, + "step": 3678 + }, + { + "epoch": 0.11, + "grad_norm": 1.849538925884021, + "learning_rate": 9.84655799461937e-06, + "loss": 0.4674, + "step": 3679 + }, + { + "epoch": 0.11, + "grad_norm": 1.5543128422692116, + "learning_rate": 9.846442502484608e-06, + "loss": 0.4115, + "step": 3680 + }, + { + "epoch": 0.11, + "grad_norm": 1.9165458872378036, + "learning_rate": 9.846326967579936e-06, + "loss": 0.3984, + "step": 3681 + }, + { + "epoch": 0.11, + "grad_norm": 2.0526688499532413, + "learning_rate": 9.846211389906375e-06, + "loss": 0.4261, + "step": 3682 + }, + { + "epoch": 0.11, + "grad_norm": 1.617915337883276, + "learning_rate": 9.846095769464945e-06, + "loss": 0.414, + "step": 3683 + }, + { + "epoch": 0.11, + "grad_norm": 1.6518238893360266, + "learning_rate": 9.845980106256665e-06, + "loss": 0.4453, + "step": 3684 + }, + { + "epoch": 0.11, + "grad_norm": 2.2266993469909324, + "learning_rate": 9.845864400282556e-06, + "loss": 0.4256, + "step": 3685 + }, + { + "epoch": 0.11, + "grad_norm": 1.5717621706933633, + "learning_rate": 9.845748651543643e-06, + "loss": 0.4546, + "step": 3686 + }, + { + "epoch": 0.11, + "grad_norm": 1.5126636221396834, + "learning_rate": 9.845632860040943e-06, + "loss": 0.4052, + "step": 3687 + }, + { + "epoch": 0.11, + "grad_norm": 1.7118963157566862, + "learning_rate": 9.84551702577548e-06, + "loss": 0.4179, + "step": 3688 + }, + { + "epoch": 0.11, + "grad_norm": 1.2233684087482637, + "learning_rate": 9.845401148748272e-06, + "loss": 0.5713, + "step": 3689 + }, + { + "epoch": 0.11, + "grad_norm": 1.9853163790282622, + "learning_rate": 9.845285228960349e-06, + "loss": 0.414, + "step": 3690 + }, + { + "epoch": 0.11, + "grad_norm": 1.7031917762286433, + "learning_rate": 9.845169266412728e-06, + "loss": 0.4067, + "step": 3691 + }, + { + "epoch": 0.11, + "grad_norm": 1.7819279137219055, + "learning_rate": 9.845053261106437e-06, + "loss": 0.4962, + "step": 3692 + }, + { + "epoch": 0.11, + "grad_norm": 1.6516657563642019, + "learning_rate": 9.844937213042494e-06, + "loss": 0.4452, + "step": 3693 + }, + { + "epoch": 0.11, + "grad_norm": 1.6108865964566035, + "learning_rate": 9.844821122221928e-06, + "loss": 0.3999, + "step": 3694 + }, + { + "epoch": 0.11, + "grad_norm": 1.7648966708275007, + "learning_rate": 9.844704988645762e-06, + "loss": 0.4692, + "step": 3695 + }, + { + "epoch": 0.11, + "grad_norm": 1.7180648626002364, + "learning_rate": 9.84458881231502e-06, + "loss": 0.436, + "step": 3696 + }, + { + "epoch": 0.11, + "grad_norm": 1.7867870538138109, + "learning_rate": 9.844472593230728e-06, + "loss": 0.4296, + "step": 3697 + }, + { + "epoch": 0.11, + "grad_norm": 1.6311441872336425, + "learning_rate": 9.844356331393912e-06, + "loss": 0.4106, + "step": 3698 + }, + { + "epoch": 0.11, + "grad_norm": 1.487227802475079, + "learning_rate": 9.844240026805598e-06, + "loss": 0.3955, + "step": 3699 + }, + { + "epoch": 0.11, + "grad_norm": 1.7711189032885166, + "learning_rate": 9.844123679466812e-06, + "loss": 0.4104, + "step": 3700 + }, + { + "epoch": 0.11, + "grad_norm": 1.6022915297783047, + "learning_rate": 9.844007289378581e-06, + "loss": 0.4861, + "step": 3701 + }, + { + "epoch": 0.11, + "grad_norm": 2.041739318634499, + "learning_rate": 9.84389085654193e-06, + "loss": 0.4066, + "step": 3702 + }, + { + "epoch": 0.11, + "grad_norm": 1.5307376907148726, + "learning_rate": 9.84377438095789e-06, + "loss": 0.433, + "step": 3703 + }, + { + "epoch": 0.11, + "grad_norm": 1.5173502384759556, + "learning_rate": 9.843657862627489e-06, + "loss": 0.4449, + "step": 3704 + }, + { + "epoch": 0.11, + "grad_norm": 1.8077898830347294, + "learning_rate": 9.843541301551751e-06, + "loss": 0.4175, + "step": 3705 + }, + { + "epoch": 0.11, + "grad_norm": 1.7176876358485107, + "learning_rate": 9.843424697731707e-06, + "loss": 0.4533, + "step": 3706 + }, + { + "epoch": 0.11, + "grad_norm": 1.9387843455511484, + "learning_rate": 9.843308051168388e-06, + "loss": 0.4933, + "step": 3707 + }, + { + "epoch": 0.11, + "grad_norm": 1.6022070918647127, + "learning_rate": 9.843191361862822e-06, + "loss": 0.3978, + "step": 3708 + }, + { + "epoch": 0.11, + "grad_norm": 1.785966759752939, + "learning_rate": 9.843074629816037e-06, + "loss": 0.4821, + "step": 3709 + }, + { + "epoch": 0.11, + "grad_norm": 1.5390856367880947, + "learning_rate": 9.842957855029065e-06, + "loss": 0.4376, + "step": 3710 + }, + { + "epoch": 0.11, + "grad_norm": 1.8619469684715988, + "learning_rate": 9.842841037502937e-06, + "loss": 0.4227, + "step": 3711 + }, + { + "epoch": 0.11, + "grad_norm": 1.8777324121435937, + "learning_rate": 9.842724177238683e-06, + "loss": 0.4489, + "step": 3712 + }, + { + "epoch": 0.11, + "grad_norm": 1.6741371543123802, + "learning_rate": 9.842607274237334e-06, + "loss": 0.4237, + "step": 3713 + }, + { + "epoch": 0.11, + "grad_norm": 1.5769975844229982, + "learning_rate": 9.842490328499923e-06, + "loss": 0.4352, + "step": 3714 + }, + { + "epoch": 0.11, + "grad_norm": 1.9119197312810616, + "learning_rate": 9.84237334002748e-06, + "loss": 0.4496, + "step": 3715 + }, + { + "epoch": 0.11, + "grad_norm": 1.8665839777149513, + "learning_rate": 9.842256308821038e-06, + "loss": 0.475, + "step": 3716 + }, + { + "epoch": 0.11, + "grad_norm": 1.761504664252578, + "learning_rate": 9.842139234881632e-06, + "loss": 0.4283, + "step": 3717 + }, + { + "epoch": 0.11, + "grad_norm": 2.4156507870817894, + "learning_rate": 9.842022118210292e-06, + "loss": 0.4356, + "step": 3718 + }, + { + "epoch": 0.11, + "grad_norm": 1.6449457945147252, + "learning_rate": 9.841904958808054e-06, + "loss": 0.4013, + "step": 3719 + }, + { + "epoch": 0.11, + "grad_norm": 3.902317750099792, + "learning_rate": 9.84178775667595e-06, + "loss": 0.4237, + "step": 3720 + }, + { + "epoch": 0.11, + "grad_norm": 1.5637915637558655, + "learning_rate": 9.841670511815017e-06, + "loss": 0.3954, + "step": 3721 + }, + { + "epoch": 0.11, + "grad_norm": 1.5809841862623022, + "learning_rate": 9.841553224226288e-06, + "loss": 0.4322, + "step": 3722 + }, + { + "epoch": 0.11, + "grad_norm": 1.7602134320875433, + "learning_rate": 9.841435893910798e-06, + "loss": 0.4174, + "step": 3723 + }, + { + "epoch": 0.11, + "grad_norm": 1.8897087493099312, + "learning_rate": 9.841318520869581e-06, + "loss": 0.4753, + "step": 3724 + }, + { + "epoch": 0.11, + "grad_norm": 1.5944596117762406, + "learning_rate": 9.841201105103674e-06, + "loss": 0.4191, + "step": 3725 + }, + { + "epoch": 0.11, + "grad_norm": 1.5926887264044538, + "learning_rate": 9.841083646614117e-06, + "loss": 0.3808, + "step": 3726 + }, + { + "epoch": 0.11, + "grad_norm": 1.9092606410348498, + "learning_rate": 9.84096614540194e-06, + "loss": 0.426, + "step": 3727 + }, + { + "epoch": 0.11, + "grad_norm": 1.7195896710986138, + "learning_rate": 9.840848601468183e-06, + "loss": 0.4723, + "step": 3728 + }, + { + "epoch": 0.11, + "grad_norm": 3.9699545022283655, + "learning_rate": 9.840731014813885e-06, + "loss": 0.4571, + "step": 3729 + }, + { + "epoch": 0.11, + "grad_norm": 1.8567751953528326, + "learning_rate": 9.84061338544008e-06, + "loss": 0.4247, + "step": 3730 + }, + { + "epoch": 0.11, + "grad_norm": 1.692273500576236, + "learning_rate": 9.84049571334781e-06, + "loss": 0.4416, + "step": 3731 + }, + { + "epoch": 0.11, + "grad_norm": 1.62297064489342, + "learning_rate": 9.84037799853811e-06, + "loss": 0.4498, + "step": 3732 + }, + { + "epoch": 0.11, + "grad_norm": 2.4925421282643936, + "learning_rate": 9.840260241012022e-06, + "loss": 0.4135, + "step": 3733 + }, + { + "epoch": 0.11, + "grad_norm": 1.5983206535436492, + "learning_rate": 9.840142440770583e-06, + "loss": 0.4344, + "step": 3734 + }, + { + "epoch": 0.11, + "grad_norm": 1.6896701416290139, + "learning_rate": 9.840024597814834e-06, + "loss": 0.4604, + "step": 3735 + }, + { + "epoch": 0.11, + "grad_norm": 1.7124262397988637, + "learning_rate": 9.839906712145815e-06, + "loss": 0.4334, + "step": 3736 + }, + { + "epoch": 0.11, + "grad_norm": 1.8875604861332274, + "learning_rate": 9.839788783764564e-06, + "loss": 0.4328, + "step": 3737 + }, + { + "epoch": 0.11, + "grad_norm": 2.1116603164637704, + "learning_rate": 9.839670812672124e-06, + "loss": 0.416, + "step": 3738 + }, + { + "epoch": 0.11, + "grad_norm": 1.9286642564210321, + "learning_rate": 9.839552798869534e-06, + "loss": 0.4794, + "step": 3739 + }, + { + "epoch": 0.11, + "grad_norm": 1.9541119729571685, + "learning_rate": 9.839434742357838e-06, + "loss": 0.4319, + "step": 3740 + }, + { + "epoch": 0.11, + "grad_norm": 1.5834614236495714, + "learning_rate": 9.839316643138078e-06, + "loss": 0.4585, + "step": 3741 + }, + { + "epoch": 0.11, + "grad_norm": 1.9066011239813123, + "learning_rate": 9.839198501211294e-06, + "loss": 0.4588, + "step": 3742 + }, + { + "epoch": 0.11, + "grad_norm": 1.438965420517894, + "learning_rate": 9.839080316578529e-06, + "loss": 0.4175, + "step": 3743 + }, + { + "epoch": 0.11, + "grad_norm": 1.5422705589919508, + "learning_rate": 9.838962089240826e-06, + "loss": 0.4087, + "step": 3744 + }, + { + "epoch": 0.11, + "grad_norm": 1.6834768442971015, + "learning_rate": 9.83884381919923e-06, + "loss": 0.4418, + "step": 3745 + }, + { + "epoch": 0.11, + "grad_norm": 1.5558474917252505, + "learning_rate": 9.838725506454783e-06, + "loss": 0.451, + "step": 3746 + }, + { + "epoch": 0.11, + "grad_norm": 1.8354561620770256, + "learning_rate": 9.83860715100853e-06, + "loss": 0.4053, + "step": 3747 + }, + { + "epoch": 0.11, + "grad_norm": 1.6732426960076414, + "learning_rate": 9.838488752861514e-06, + "loss": 0.4075, + "step": 3748 + }, + { + "epoch": 0.11, + "grad_norm": 1.6282179471761742, + "learning_rate": 9.838370312014783e-06, + "loss": 0.4547, + "step": 3749 + }, + { + "epoch": 0.11, + "grad_norm": 1.59931329249472, + "learning_rate": 9.83825182846938e-06, + "loss": 0.4789, + "step": 3750 + }, + { + "epoch": 0.11, + "grad_norm": 1.6309807868214579, + "learning_rate": 9.83813330222635e-06, + "loss": 0.4616, + "step": 3751 + }, + { + "epoch": 0.11, + "grad_norm": 1.5989027015729833, + "learning_rate": 9.83801473328674e-06, + "loss": 0.5161, + "step": 3752 + }, + { + "epoch": 0.11, + "grad_norm": 1.6205039932347913, + "learning_rate": 9.837896121651597e-06, + "loss": 0.4118, + "step": 3753 + }, + { + "epoch": 0.11, + "grad_norm": 1.686044647459666, + "learning_rate": 9.837777467321968e-06, + "loss": 0.4306, + "step": 3754 + }, + { + "epoch": 0.11, + "grad_norm": 1.7964317805058592, + "learning_rate": 9.837658770298897e-06, + "loss": 0.4357, + "step": 3755 + }, + { + "epoch": 0.11, + "grad_norm": 1.7215497341213268, + "learning_rate": 9.837540030583434e-06, + "loss": 0.433, + "step": 3756 + }, + { + "epoch": 0.11, + "grad_norm": 1.6357298393214394, + "learning_rate": 9.83742124817663e-06, + "loss": 0.4666, + "step": 3757 + }, + { + "epoch": 0.11, + "grad_norm": 1.8823271005644813, + "learning_rate": 9.837302423079525e-06, + "loss": 0.4446, + "step": 3758 + }, + { + "epoch": 0.11, + "grad_norm": 1.5628377604454504, + "learning_rate": 9.837183555293175e-06, + "loss": 0.4348, + "step": 3759 + }, + { + "epoch": 0.11, + "grad_norm": 1.9092859939420008, + "learning_rate": 9.837064644818626e-06, + "loss": 0.4093, + "step": 3760 + }, + { + "epoch": 0.11, + "grad_norm": 1.797987393444634, + "learning_rate": 9.836945691656927e-06, + "loss": 0.4837, + "step": 3761 + }, + { + "epoch": 0.11, + "grad_norm": 2.4137471585021877, + "learning_rate": 9.83682669580913e-06, + "loss": 0.4963, + "step": 3762 + }, + { + "epoch": 0.11, + "grad_norm": 1.7824282272121266, + "learning_rate": 9.836707657276284e-06, + "loss": 0.4478, + "step": 3763 + }, + { + "epoch": 0.11, + "grad_norm": 1.5238537514754256, + "learning_rate": 9.836588576059437e-06, + "loss": 0.46, + "step": 3764 + }, + { + "epoch": 0.11, + "grad_norm": 1.4905091729522069, + "learning_rate": 9.836469452159643e-06, + "loss": 0.437, + "step": 3765 + }, + { + "epoch": 0.11, + "grad_norm": 1.4328979876784695, + "learning_rate": 9.836350285577954e-06, + "loss": 0.3943, + "step": 3766 + }, + { + "epoch": 0.11, + "grad_norm": 1.6567221816746058, + "learning_rate": 9.83623107631542e-06, + "loss": 0.4242, + "step": 3767 + }, + { + "epoch": 0.11, + "grad_norm": 1.5529567739360461, + "learning_rate": 9.836111824373092e-06, + "loss": 0.4036, + "step": 3768 + }, + { + "epoch": 0.11, + "grad_norm": 1.5093499871695941, + "learning_rate": 9.835992529752024e-06, + "loss": 0.416, + "step": 3769 + }, + { + "epoch": 0.11, + "grad_norm": 1.478983827439424, + "learning_rate": 9.835873192453266e-06, + "loss": 0.3918, + "step": 3770 + }, + { + "epoch": 0.11, + "grad_norm": 1.471316560496417, + "learning_rate": 9.835753812477875e-06, + "loss": 0.4106, + "step": 3771 + }, + { + "epoch": 0.11, + "grad_norm": 1.5548287895330402, + "learning_rate": 9.835634389826905e-06, + "loss": 0.4601, + "step": 3772 + }, + { + "epoch": 0.11, + "grad_norm": 1.5030388446812564, + "learning_rate": 9.835514924501406e-06, + "loss": 0.426, + "step": 3773 + }, + { + "epoch": 0.11, + "grad_norm": 1.7909940217601603, + "learning_rate": 9.835395416502433e-06, + "loss": 0.4372, + "step": 3774 + }, + { + "epoch": 0.11, + "grad_norm": 1.7715325742868382, + "learning_rate": 9.835275865831045e-06, + "loss": 0.4011, + "step": 3775 + }, + { + "epoch": 0.11, + "grad_norm": 1.0989299660667806, + "learning_rate": 9.835156272488292e-06, + "loss": 0.5985, + "step": 3776 + }, + { + "epoch": 0.11, + "grad_norm": 1.9205803890861939, + "learning_rate": 9.83503663647523e-06, + "loss": 0.4762, + "step": 3777 + }, + { + "epoch": 0.11, + "grad_norm": 1.7417250368146433, + "learning_rate": 9.834916957792917e-06, + "loss": 0.4638, + "step": 3778 + }, + { + "epoch": 0.11, + "grad_norm": 1.5864176717393674, + "learning_rate": 9.83479723644241e-06, + "loss": 0.4211, + "step": 3779 + }, + { + "epoch": 0.11, + "grad_norm": 1.4530448744635587, + "learning_rate": 9.83467747242476e-06, + "loss": 0.4201, + "step": 3780 + }, + { + "epoch": 0.11, + "grad_norm": 1.6353277815478995, + "learning_rate": 9.834557665741032e-06, + "loss": 0.4058, + "step": 3781 + }, + { + "epoch": 0.11, + "grad_norm": 3.778934299047333, + "learning_rate": 9.834437816392277e-06, + "loss": 0.4836, + "step": 3782 + }, + { + "epoch": 0.11, + "grad_norm": 1.6699134911594136, + "learning_rate": 9.834317924379554e-06, + "loss": 0.4189, + "step": 3783 + }, + { + "epoch": 0.11, + "grad_norm": 2.1840285982705243, + "learning_rate": 9.834197989703923e-06, + "loss": 0.4065, + "step": 3784 + }, + { + "epoch": 0.11, + "grad_norm": 1.6280011295000054, + "learning_rate": 9.83407801236644e-06, + "loss": 0.4587, + "step": 3785 + }, + { + "epoch": 0.11, + "grad_norm": 1.6536787321627413, + "learning_rate": 9.833957992368165e-06, + "loss": 0.4011, + "step": 3786 + }, + { + "epoch": 0.11, + "grad_norm": 1.3566317820719451, + "learning_rate": 9.833837929710158e-06, + "loss": 0.3953, + "step": 3787 + }, + { + "epoch": 0.11, + "grad_norm": 1.4866808993770957, + "learning_rate": 9.833717824393476e-06, + "loss": 0.4534, + "step": 3788 + }, + { + "epoch": 0.11, + "grad_norm": 1.4217993587076152, + "learning_rate": 9.833597676419182e-06, + "loss": 0.3926, + "step": 3789 + }, + { + "epoch": 0.11, + "grad_norm": 1.52353535900885, + "learning_rate": 9.833477485788334e-06, + "loss": 0.4349, + "step": 3790 + }, + { + "epoch": 0.11, + "grad_norm": 1.7204824415065283, + "learning_rate": 9.833357252501994e-06, + "loss": 0.4315, + "step": 3791 + }, + { + "epoch": 0.11, + "grad_norm": 1.6760941416465631, + "learning_rate": 9.833236976561223e-06, + "loss": 0.4491, + "step": 3792 + }, + { + "epoch": 0.11, + "grad_norm": 2.662384424352486, + "learning_rate": 9.833116657967082e-06, + "loss": 0.4459, + "step": 3793 + }, + { + "epoch": 0.11, + "grad_norm": 1.49469968861855, + "learning_rate": 9.832996296720633e-06, + "loss": 0.4377, + "step": 3794 + }, + { + "epoch": 0.11, + "grad_norm": 1.5562290747889878, + "learning_rate": 9.832875892822937e-06, + "loss": 0.4378, + "step": 3795 + }, + { + "epoch": 0.11, + "grad_norm": 1.6848745550600372, + "learning_rate": 9.832755446275058e-06, + "loss": 0.4281, + "step": 3796 + }, + { + "epoch": 0.11, + "grad_norm": 1.8795025738824382, + "learning_rate": 9.83263495707806e-06, + "loss": 0.4341, + "step": 3797 + }, + { + "epoch": 0.11, + "grad_norm": 1.7796827912295485, + "learning_rate": 9.832514425233004e-06, + "loss": 0.4142, + "step": 3798 + }, + { + "epoch": 0.11, + "grad_norm": 1.7460377621250909, + "learning_rate": 9.832393850740953e-06, + "loss": 0.4523, + "step": 3799 + }, + { + "epoch": 0.11, + "grad_norm": 1.5257548154345595, + "learning_rate": 9.832273233602974e-06, + "loss": 0.3935, + "step": 3800 + }, + { + "epoch": 0.11, + "grad_norm": 1.614709374390329, + "learning_rate": 9.832152573820131e-06, + "loss": 0.4555, + "step": 3801 + }, + { + "epoch": 0.11, + "grad_norm": 1.7382477537651324, + "learning_rate": 9.832031871393488e-06, + "loss": 0.4448, + "step": 3802 + }, + { + "epoch": 0.11, + "grad_norm": 1.6015704158447526, + "learning_rate": 9.831911126324107e-06, + "loss": 0.5019, + "step": 3803 + }, + { + "epoch": 0.11, + "grad_norm": 1.209169520763997, + "learning_rate": 9.83179033861306e-06, + "loss": 0.6002, + "step": 3804 + }, + { + "epoch": 0.11, + "grad_norm": 1.5238959513780679, + "learning_rate": 9.831669508261408e-06, + "loss": 0.4343, + "step": 3805 + }, + { + "epoch": 0.11, + "grad_norm": 1.6085701154965009, + "learning_rate": 9.83154863527022e-06, + "loss": 0.442, + "step": 3806 + }, + { + "epoch": 0.11, + "grad_norm": 1.651502088129169, + "learning_rate": 9.831427719640562e-06, + "loss": 0.4021, + "step": 3807 + }, + { + "epoch": 0.11, + "grad_norm": 1.6694291267321404, + "learning_rate": 9.8313067613735e-06, + "loss": 0.4057, + "step": 3808 + }, + { + "epoch": 0.11, + "grad_norm": 1.606851315815376, + "learning_rate": 9.8311857604701e-06, + "loss": 0.4179, + "step": 3809 + }, + { + "epoch": 0.11, + "grad_norm": 1.5488065926588397, + "learning_rate": 9.831064716931436e-06, + "loss": 0.4118, + "step": 3810 + }, + { + "epoch": 0.11, + "grad_norm": 1.835794486410984, + "learning_rate": 9.830943630758572e-06, + "loss": 0.424, + "step": 3811 + }, + { + "epoch": 0.11, + "grad_norm": 1.563924052127031, + "learning_rate": 9.830822501952574e-06, + "loss": 0.4416, + "step": 3812 + }, + { + "epoch": 0.11, + "grad_norm": 1.6611306994667767, + "learning_rate": 9.830701330514516e-06, + "loss": 0.4343, + "step": 3813 + }, + { + "epoch": 0.11, + "grad_norm": 2.6783255620466337, + "learning_rate": 9.830580116445464e-06, + "loss": 0.4184, + "step": 3814 + }, + { + "epoch": 0.11, + "grad_norm": 1.835858719899403, + "learning_rate": 9.83045885974649e-06, + "loss": 0.4661, + "step": 3815 + }, + { + "epoch": 0.11, + "grad_norm": 2.7523270928311647, + "learning_rate": 9.830337560418661e-06, + "loss": 0.3807, + "step": 3816 + }, + { + "epoch": 0.11, + "grad_norm": 1.5288434990414974, + "learning_rate": 9.830216218463051e-06, + "loss": 0.4314, + "step": 3817 + }, + { + "epoch": 0.11, + "grad_norm": 1.85067393493913, + "learning_rate": 9.83009483388073e-06, + "loss": 0.4127, + "step": 3818 + }, + { + "epoch": 0.11, + "grad_norm": 1.6169362035350487, + "learning_rate": 9.829973406672768e-06, + "loss": 0.408, + "step": 3819 + }, + { + "epoch": 0.11, + "grad_norm": 1.866613928775696, + "learning_rate": 9.829851936840235e-06, + "loss": 0.4518, + "step": 3820 + }, + { + "epoch": 0.11, + "grad_norm": 1.589683274314799, + "learning_rate": 9.829730424384207e-06, + "loss": 0.4073, + "step": 3821 + }, + { + "epoch": 0.11, + "grad_norm": 1.758382600973685, + "learning_rate": 9.829608869305755e-06, + "loss": 0.4572, + "step": 3822 + }, + { + "epoch": 0.11, + "grad_norm": 1.4981002468454618, + "learning_rate": 9.82948727160595e-06, + "loss": 0.4479, + "step": 3823 + }, + { + "epoch": 0.11, + "grad_norm": 1.6676610921330428, + "learning_rate": 9.829365631285868e-06, + "loss": 0.415, + "step": 3824 + }, + { + "epoch": 0.11, + "grad_norm": 1.678964197372923, + "learning_rate": 9.829243948346578e-06, + "loss": 0.423, + "step": 3825 + }, + { + "epoch": 0.11, + "grad_norm": 1.4768317063563192, + "learning_rate": 9.829122222789158e-06, + "loss": 0.4121, + "step": 3826 + }, + { + "epoch": 0.11, + "grad_norm": 1.8082102873389332, + "learning_rate": 9.82900045461468e-06, + "loss": 0.4092, + "step": 3827 + }, + { + "epoch": 0.11, + "grad_norm": 1.55724080279887, + "learning_rate": 9.82887864382422e-06, + "loss": 0.4359, + "step": 3828 + }, + { + "epoch": 0.11, + "grad_norm": 1.5055604911982488, + "learning_rate": 9.828756790418854e-06, + "loss": 0.4075, + "step": 3829 + }, + { + "epoch": 0.11, + "grad_norm": 1.5092190864008082, + "learning_rate": 9.828634894399654e-06, + "loss": 0.4314, + "step": 3830 + }, + { + "epoch": 0.11, + "grad_norm": 1.8228142904566769, + "learning_rate": 9.828512955767697e-06, + "loss": 0.4401, + "step": 3831 + }, + { + "epoch": 0.11, + "grad_norm": 1.7913201928730225, + "learning_rate": 9.828390974524062e-06, + "loss": 0.4262, + "step": 3832 + }, + { + "epoch": 0.11, + "grad_norm": 1.4460363824075535, + "learning_rate": 9.828268950669822e-06, + "loss": 0.4489, + "step": 3833 + }, + { + "epoch": 0.11, + "grad_norm": 1.1414473202183264, + "learning_rate": 9.828146884206053e-06, + "loss": 0.5249, + "step": 3834 + }, + { + "epoch": 0.11, + "grad_norm": 1.6595387882185528, + "learning_rate": 9.828024775133835e-06, + "loss": 0.4116, + "step": 3835 + }, + { + "epoch": 0.11, + "grad_norm": 1.6898852779774496, + "learning_rate": 9.827902623454247e-06, + "loss": 0.442, + "step": 3836 + }, + { + "epoch": 0.11, + "grad_norm": 2.215672180897776, + "learning_rate": 9.827780429168363e-06, + "loss": 0.4227, + "step": 3837 + }, + { + "epoch": 0.11, + "grad_norm": 1.5830431158878444, + "learning_rate": 9.827658192277263e-06, + "loss": 0.4474, + "step": 3838 + }, + { + "epoch": 0.11, + "grad_norm": 1.929533258704439, + "learning_rate": 9.827535912782026e-06, + "loss": 0.4246, + "step": 3839 + }, + { + "epoch": 0.11, + "grad_norm": 1.5921997207876002, + "learning_rate": 9.827413590683732e-06, + "loss": 0.4488, + "step": 3840 + }, + { + "epoch": 0.11, + "grad_norm": 1.6565288156873768, + "learning_rate": 9.827291225983458e-06, + "loss": 0.3995, + "step": 3841 + }, + { + "epoch": 0.11, + "grad_norm": 1.604010492621498, + "learning_rate": 9.827168818682287e-06, + "loss": 0.4359, + "step": 3842 + }, + { + "epoch": 0.11, + "grad_norm": 1.630285232036225, + "learning_rate": 9.827046368781296e-06, + "loss": 0.4136, + "step": 3843 + }, + { + "epoch": 0.11, + "grad_norm": 1.5203098314675343, + "learning_rate": 9.82692387628157e-06, + "loss": 0.4013, + "step": 3844 + }, + { + "epoch": 0.11, + "grad_norm": 1.077853413593872, + "learning_rate": 9.826801341184183e-06, + "loss": 0.5591, + "step": 3845 + }, + { + "epoch": 0.11, + "grad_norm": 1.5633907058120506, + "learning_rate": 9.826678763490223e-06, + "loss": 0.4185, + "step": 3846 + }, + { + "epoch": 0.11, + "grad_norm": 1.8455119140821594, + "learning_rate": 9.826556143200769e-06, + "loss": 0.4244, + "step": 3847 + }, + { + "epoch": 0.11, + "grad_norm": 1.5889959922101176, + "learning_rate": 9.826433480316904e-06, + "loss": 0.3949, + "step": 3848 + }, + { + "epoch": 0.11, + "grad_norm": 1.7049270125075793, + "learning_rate": 9.826310774839711e-06, + "loss": 0.4202, + "step": 3849 + }, + { + "epoch": 0.11, + "grad_norm": 1.7664890532454245, + "learning_rate": 9.82618802677027e-06, + "loss": 0.4302, + "step": 3850 + }, + { + "epoch": 0.11, + "grad_norm": 1.8283758005993178, + "learning_rate": 9.826065236109667e-06, + "loss": 0.4097, + "step": 3851 + }, + { + "epoch": 0.11, + "grad_norm": 1.474551566318512, + "learning_rate": 9.825942402858983e-06, + "loss": 0.414, + "step": 3852 + }, + { + "epoch": 0.11, + "grad_norm": 1.5197765449778815, + "learning_rate": 9.825819527019307e-06, + "loss": 0.4561, + "step": 3853 + }, + { + "epoch": 0.11, + "grad_norm": 1.6684537435771882, + "learning_rate": 9.825696608591718e-06, + "loss": 0.4321, + "step": 3854 + }, + { + "epoch": 0.11, + "grad_norm": 2.4371943384418593, + "learning_rate": 9.825573647577302e-06, + "loss": 0.4061, + "step": 3855 + }, + { + "epoch": 0.11, + "grad_norm": 1.6506039219216617, + "learning_rate": 9.825450643977148e-06, + "loss": 0.4212, + "step": 3856 + }, + { + "epoch": 0.11, + "grad_norm": 1.6239122414618967, + "learning_rate": 9.825327597792336e-06, + "loss": 0.4475, + "step": 3857 + }, + { + "epoch": 0.11, + "grad_norm": 1.7769268757048942, + "learning_rate": 9.825204509023956e-06, + "loss": 0.4076, + "step": 3858 + }, + { + "epoch": 0.11, + "grad_norm": 1.9159959523198753, + "learning_rate": 9.825081377673092e-06, + "loss": 0.4333, + "step": 3859 + }, + { + "epoch": 0.11, + "grad_norm": 1.4445456066763183, + "learning_rate": 9.824958203740831e-06, + "loss": 0.4217, + "step": 3860 + }, + { + "epoch": 0.11, + "grad_norm": 1.5179188705064581, + "learning_rate": 9.82483498722826e-06, + "loss": 0.4222, + "step": 3861 + }, + { + "epoch": 0.11, + "grad_norm": 1.8619811368443733, + "learning_rate": 9.824711728136468e-06, + "loss": 0.4633, + "step": 3862 + }, + { + "epoch": 0.11, + "grad_norm": 1.6053756949609093, + "learning_rate": 9.82458842646654e-06, + "loss": 0.4223, + "step": 3863 + }, + { + "epoch": 0.11, + "grad_norm": 1.7998458868455103, + "learning_rate": 9.824465082219567e-06, + "loss": 0.4439, + "step": 3864 + }, + { + "epoch": 0.11, + "grad_norm": 1.7933389500913484, + "learning_rate": 9.824341695396635e-06, + "loss": 0.4094, + "step": 3865 + }, + { + "epoch": 0.11, + "grad_norm": 1.095459655971186, + "learning_rate": 9.824218265998834e-06, + "loss": 0.6216, + "step": 3866 + }, + { + "epoch": 0.11, + "grad_norm": 1.847338337351327, + "learning_rate": 9.824094794027255e-06, + "loss": 0.4601, + "step": 3867 + }, + { + "epoch": 0.11, + "grad_norm": 2.638436612147041, + "learning_rate": 9.823971279482985e-06, + "loss": 0.4625, + "step": 3868 + }, + { + "epoch": 0.11, + "grad_norm": 1.8107088429819354, + "learning_rate": 9.823847722367115e-06, + "loss": 0.459, + "step": 3869 + }, + { + "epoch": 0.11, + "grad_norm": 0.9678441706290579, + "learning_rate": 9.823724122680735e-06, + "loss": 0.6241, + "step": 3870 + }, + { + "epoch": 0.11, + "grad_norm": 1.746374029609193, + "learning_rate": 9.823600480424935e-06, + "loss": 0.4571, + "step": 3871 + }, + { + "epoch": 0.11, + "grad_norm": 1.6274419901981152, + "learning_rate": 9.82347679560081e-06, + "loss": 0.4186, + "step": 3872 + }, + { + "epoch": 0.11, + "grad_norm": 1.5042485263116616, + "learning_rate": 9.823353068209447e-06, + "loss": 0.3993, + "step": 3873 + }, + { + "epoch": 0.11, + "grad_norm": 2.13347005410597, + "learning_rate": 9.82322929825194e-06, + "loss": 0.4446, + "step": 3874 + }, + { + "epoch": 0.11, + "grad_norm": 1.42009189343172, + "learning_rate": 9.823105485729381e-06, + "loss": 0.4278, + "step": 3875 + }, + { + "epoch": 0.11, + "grad_norm": 1.6015144885421886, + "learning_rate": 9.82298163064286e-06, + "loss": 0.389, + "step": 3876 + }, + { + "epoch": 0.11, + "grad_norm": 1.4807970443675829, + "learning_rate": 9.822857732993476e-06, + "loss": 0.4297, + "step": 3877 + }, + { + "epoch": 0.11, + "grad_norm": 2.058266629920264, + "learning_rate": 9.822733792782317e-06, + "loss": 0.4338, + "step": 3878 + }, + { + "epoch": 0.11, + "grad_norm": 1.4574885471063446, + "learning_rate": 9.822609810010482e-06, + "loss": 0.4425, + "step": 3879 + }, + { + "epoch": 0.11, + "grad_norm": 1.5114643772473624, + "learning_rate": 9.82248578467906e-06, + "loss": 0.4329, + "step": 3880 + }, + { + "epoch": 0.11, + "grad_norm": 1.5591791060852942, + "learning_rate": 9.822361716789146e-06, + "loss": 0.3916, + "step": 3881 + }, + { + "epoch": 0.11, + "grad_norm": 1.4330420275055038, + "learning_rate": 9.822237606341837e-06, + "loss": 0.4617, + "step": 3882 + }, + { + "epoch": 0.11, + "grad_norm": 1.4907474172081936, + "learning_rate": 9.82211345333823e-06, + "loss": 0.4073, + "step": 3883 + }, + { + "epoch": 0.11, + "grad_norm": 1.492751260339808, + "learning_rate": 9.821989257779414e-06, + "loss": 0.4684, + "step": 3884 + }, + { + "epoch": 0.11, + "grad_norm": 1.623608868611322, + "learning_rate": 9.821865019666491e-06, + "loss": 0.4485, + "step": 3885 + }, + { + "epoch": 0.11, + "grad_norm": 2.400934090422703, + "learning_rate": 9.821740739000558e-06, + "loss": 0.4331, + "step": 3886 + }, + { + "epoch": 0.11, + "grad_norm": 1.6238872903865627, + "learning_rate": 9.821616415782708e-06, + "loss": 0.4504, + "step": 3887 + }, + { + "epoch": 0.11, + "grad_norm": 1.4286201427232523, + "learning_rate": 9.821492050014038e-06, + "loss": 0.4151, + "step": 3888 + }, + { + "epoch": 0.11, + "grad_norm": 3.1066807352778896, + "learning_rate": 9.821367641695649e-06, + "loss": 0.3882, + "step": 3889 + }, + { + "epoch": 0.11, + "grad_norm": 1.7510705969043068, + "learning_rate": 9.821243190828638e-06, + "loss": 0.4158, + "step": 3890 + }, + { + "epoch": 0.11, + "grad_norm": 1.3656398724753556, + "learning_rate": 9.8211186974141e-06, + "loss": 0.3908, + "step": 3891 + }, + { + "epoch": 0.11, + "grad_norm": 2.004345690835193, + "learning_rate": 9.82099416145314e-06, + "loss": 0.4515, + "step": 3892 + }, + { + "epoch": 0.11, + "grad_norm": 1.5324082664181935, + "learning_rate": 9.820869582946848e-06, + "loss": 0.4061, + "step": 3893 + }, + { + "epoch": 0.11, + "grad_norm": 1.5645592081051019, + "learning_rate": 9.820744961896332e-06, + "loss": 0.4316, + "step": 3894 + }, + { + "epoch": 0.11, + "grad_norm": 1.1386561723513744, + "learning_rate": 9.820620298302685e-06, + "loss": 0.6052, + "step": 3895 + }, + { + "epoch": 0.11, + "grad_norm": 1.609987481114841, + "learning_rate": 9.820495592167013e-06, + "loss": 0.4269, + "step": 3896 + }, + { + "epoch": 0.11, + "grad_norm": 1.5754284535229688, + "learning_rate": 9.820370843490415e-06, + "loss": 0.4362, + "step": 3897 + }, + { + "epoch": 0.11, + "grad_norm": 1.5128864887676512, + "learning_rate": 9.820246052273987e-06, + "loss": 0.4038, + "step": 3898 + }, + { + "epoch": 0.11, + "grad_norm": 1.5014269040855037, + "learning_rate": 9.820121218518837e-06, + "loss": 0.4044, + "step": 3899 + }, + { + "epoch": 0.11, + "grad_norm": 1.6218952108388602, + "learning_rate": 9.819996342226063e-06, + "loss": 0.4197, + "step": 3900 + }, + { + "epoch": 0.11, + "grad_norm": 1.5002586297462635, + "learning_rate": 9.819871423396767e-06, + "loss": 0.3886, + "step": 3901 + }, + { + "epoch": 0.11, + "grad_norm": 1.634330050900594, + "learning_rate": 9.819746462032054e-06, + "loss": 0.4085, + "step": 3902 + }, + { + "epoch": 0.11, + "grad_norm": 2.0406432134758075, + "learning_rate": 9.819621458133024e-06, + "loss": 0.4279, + "step": 3903 + }, + { + "epoch": 0.11, + "grad_norm": 2.0235511097002488, + "learning_rate": 9.819496411700781e-06, + "loss": 0.4719, + "step": 3904 + }, + { + "epoch": 0.11, + "grad_norm": 1.7322217952400991, + "learning_rate": 9.819371322736428e-06, + "loss": 0.4172, + "step": 3905 + }, + { + "epoch": 0.11, + "grad_norm": 1.7067413225339578, + "learning_rate": 9.819246191241071e-06, + "loss": 0.4226, + "step": 3906 + }, + { + "epoch": 0.11, + "grad_norm": 1.771102758686046, + "learning_rate": 9.819121017215812e-06, + "loss": 0.4465, + "step": 3907 + }, + { + "epoch": 0.11, + "grad_norm": 1.577384669379854, + "learning_rate": 9.818995800661758e-06, + "loss": 0.3991, + "step": 3908 + }, + { + "epoch": 0.11, + "grad_norm": 1.916356068994697, + "learning_rate": 9.81887054158001e-06, + "loss": 0.4219, + "step": 3909 + }, + { + "epoch": 0.11, + "grad_norm": 1.094154440934726, + "learning_rate": 9.818745239971679e-06, + "loss": 0.6387, + "step": 3910 + }, + { + "epoch": 0.11, + "grad_norm": 2.5082639818045123, + "learning_rate": 9.818619895837866e-06, + "loss": 0.4251, + "step": 3911 + }, + { + "epoch": 0.11, + "grad_norm": 1.6375141236396464, + "learning_rate": 9.81849450917968e-06, + "loss": 0.4295, + "step": 3912 + }, + { + "epoch": 0.11, + "grad_norm": 1.5590412077506246, + "learning_rate": 9.818369079998224e-06, + "loss": 0.4502, + "step": 3913 + }, + { + "epoch": 0.11, + "grad_norm": 1.4563886785054827, + "learning_rate": 9.81824360829461e-06, + "loss": 0.4183, + "step": 3914 + }, + { + "epoch": 0.11, + "grad_norm": 1.7172028507439017, + "learning_rate": 9.818118094069942e-06, + "loss": 0.4378, + "step": 3915 + }, + { + "epoch": 0.11, + "grad_norm": 1.792152860259966, + "learning_rate": 9.817992537325328e-06, + "loss": 0.4388, + "step": 3916 + }, + { + "epoch": 0.11, + "grad_norm": 2.2197326332311347, + "learning_rate": 9.817866938061876e-06, + "loss": 0.4452, + "step": 3917 + }, + { + "epoch": 0.11, + "grad_norm": 1.4863201996706223, + "learning_rate": 9.817741296280696e-06, + "loss": 0.4084, + "step": 3918 + }, + { + "epoch": 0.11, + "grad_norm": 1.6597855267224173, + "learning_rate": 9.817615611982895e-06, + "loss": 0.4346, + "step": 3919 + }, + { + "epoch": 0.11, + "grad_norm": 1.4355568908142, + "learning_rate": 9.817489885169583e-06, + "loss": 0.4159, + "step": 3920 + }, + { + "epoch": 0.11, + "grad_norm": 1.6673180370494072, + "learning_rate": 9.81736411584187e-06, + "loss": 0.4401, + "step": 3921 + }, + { + "epoch": 0.11, + "grad_norm": 1.4813897800927118, + "learning_rate": 9.817238304000864e-06, + "loss": 0.4355, + "step": 3922 + }, + { + "epoch": 0.11, + "grad_norm": 1.487294658958869, + "learning_rate": 9.817112449647676e-06, + "loss": 0.4101, + "step": 3923 + }, + { + "epoch": 0.11, + "grad_norm": 1.496578402887374, + "learning_rate": 9.816986552783418e-06, + "loss": 0.41, + "step": 3924 + }, + { + "epoch": 0.11, + "grad_norm": 1.5772998124829258, + "learning_rate": 9.816860613409202e-06, + "loss": 0.4372, + "step": 3925 + }, + { + "epoch": 0.11, + "grad_norm": 1.4795945603324676, + "learning_rate": 9.816734631526134e-06, + "loss": 0.4431, + "step": 3926 + }, + { + "epoch": 0.11, + "grad_norm": 1.5978193306340733, + "learning_rate": 9.816608607135333e-06, + "loss": 0.4243, + "step": 3927 + }, + { + "epoch": 0.11, + "grad_norm": 1.5161956745915415, + "learning_rate": 9.816482540237906e-06, + "loss": 0.4053, + "step": 3928 + }, + { + "epoch": 0.11, + "grad_norm": 2.5498353381461114, + "learning_rate": 9.816356430834968e-06, + "loss": 0.4015, + "step": 3929 + }, + { + "epoch": 0.11, + "grad_norm": 1.9257222258315565, + "learning_rate": 9.816230278927629e-06, + "loss": 0.4117, + "step": 3930 + }, + { + "epoch": 0.11, + "grad_norm": 2.0882229834956676, + "learning_rate": 9.816104084517007e-06, + "loss": 0.4331, + "step": 3931 + }, + { + "epoch": 0.11, + "grad_norm": 1.520355676788958, + "learning_rate": 9.815977847604211e-06, + "loss": 0.399, + "step": 3932 + }, + { + "epoch": 0.11, + "grad_norm": 1.6538389833500837, + "learning_rate": 9.815851568190358e-06, + "loss": 0.4143, + "step": 3933 + }, + { + "epoch": 0.11, + "grad_norm": 9.90125486975578, + "learning_rate": 9.815725246276562e-06, + "loss": 0.4015, + "step": 3934 + }, + { + "epoch": 0.11, + "grad_norm": 1.519903180456707, + "learning_rate": 9.815598881863936e-06, + "loss": 0.4231, + "step": 3935 + }, + { + "epoch": 0.11, + "grad_norm": 1.5033995053863116, + "learning_rate": 9.815472474953598e-06, + "loss": 0.4619, + "step": 3936 + }, + { + "epoch": 0.11, + "grad_norm": 2.592447025512635, + "learning_rate": 9.81534602554666e-06, + "loss": 0.4379, + "step": 3937 + }, + { + "epoch": 0.11, + "grad_norm": 1.5682259434165196, + "learning_rate": 9.81521953364424e-06, + "loss": 0.4197, + "step": 3938 + }, + { + "epoch": 0.11, + "grad_norm": 2.697326562856379, + "learning_rate": 9.815092999247455e-06, + "loss": 0.4592, + "step": 3939 + }, + { + "epoch": 0.11, + "grad_norm": 1.4003344264152147, + "learning_rate": 9.814966422357421e-06, + "loss": 0.4008, + "step": 3940 + }, + { + "epoch": 0.11, + "grad_norm": 1.6850448970414706, + "learning_rate": 9.814839802975255e-06, + "loss": 0.4012, + "step": 3941 + }, + { + "epoch": 0.11, + "grad_norm": 2.014996972859048, + "learning_rate": 9.814713141102075e-06, + "loss": 0.4489, + "step": 3942 + }, + { + "epoch": 0.11, + "grad_norm": 1.8511140736445475, + "learning_rate": 9.814586436738998e-06, + "loss": 0.4185, + "step": 3943 + }, + { + "epoch": 0.11, + "grad_norm": 1.1212107821392223, + "learning_rate": 9.814459689887141e-06, + "loss": 0.6253, + "step": 3944 + }, + { + "epoch": 0.11, + "grad_norm": 1.6872962525902266, + "learning_rate": 9.814332900547624e-06, + "loss": 0.4058, + "step": 3945 + }, + { + "epoch": 0.11, + "grad_norm": 2.2792722368749576, + "learning_rate": 9.814206068721567e-06, + "loss": 0.3948, + "step": 3946 + }, + { + "epoch": 0.11, + "grad_norm": 1.8938201752391073, + "learning_rate": 9.814079194410086e-06, + "loss": 0.4733, + "step": 3947 + }, + { + "epoch": 0.11, + "grad_norm": 1.6586874351863368, + "learning_rate": 9.813952277614305e-06, + "loss": 0.4272, + "step": 3948 + }, + { + "epoch": 0.11, + "grad_norm": 1.5333619853153246, + "learning_rate": 9.81382531833534e-06, + "loss": 0.421, + "step": 3949 + }, + { + "epoch": 0.11, + "grad_norm": 1.5823483773958689, + "learning_rate": 9.813698316574314e-06, + "loss": 0.4268, + "step": 3950 + }, + { + "epoch": 0.11, + "grad_norm": 1.9024360772266593, + "learning_rate": 9.813571272332347e-06, + "loss": 0.4433, + "step": 3951 + }, + { + "epoch": 0.11, + "grad_norm": 2.0024743687944886, + "learning_rate": 9.81344418561056e-06, + "loss": 0.4446, + "step": 3952 + }, + { + "epoch": 0.11, + "grad_norm": 1.6721219041509596, + "learning_rate": 9.813317056410075e-06, + "loss": 0.4025, + "step": 3953 + }, + { + "epoch": 0.11, + "grad_norm": 1.0406863730506772, + "learning_rate": 9.813189884732013e-06, + "loss": 0.5932, + "step": 3954 + }, + { + "epoch": 0.11, + "grad_norm": 2.2908003867121858, + "learning_rate": 9.813062670577497e-06, + "loss": 0.4611, + "step": 3955 + }, + { + "epoch": 0.11, + "grad_norm": 1.6632189050918533, + "learning_rate": 9.812935413947649e-06, + "loss": 0.4679, + "step": 3956 + }, + { + "epoch": 0.11, + "grad_norm": 1.6217555868943552, + "learning_rate": 9.812808114843592e-06, + "loss": 0.4346, + "step": 3957 + }, + { + "epoch": 0.11, + "grad_norm": 1.7199055797078744, + "learning_rate": 9.812680773266453e-06, + "loss": 0.4019, + "step": 3958 + }, + { + "epoch": 0.11, + "grad_norm": 2.6320356639194595, + "learning_rate": 9.81255338921735e-06, + "loss": 0.4532, + "step": 3959 + }, + { + "epoch": 0.11, + "grad_norm": 1.4592173976643006, + "learning_rate": 9.812425962697412e-06, + "loss": 0.3814, + "step": 3960 + }, + { + "epoch": 0.11, + "grad_norm": 2.3808694969629114, + "learning_rate": 9.81229849370776e-06, + "loss": 0.3945, + "step": 3961 + }, + { + "epoch": 0.11, + "grad_norm": 1.6736598630494428, + "learning_rate": 9.81217098224952e-06, + "loss": 0.4462, + "step": 3962 + }, + { + "epoch": 0.11, + "grad_norm": 1.5713147666684781, + "learning_rate": 9.812043428323819e-06, + "loss": 0.3835, + "step": 3963 + }, + { + "epoch": 0.11, + "grad_norm": 2.3828650428475053, + "learning_rate": 9.81191583193178e-06, + "loss": 0.4059, + "step": 3964 + }, + { + "epoch": 0.12, + "grad_norm": 1.6845297592833048, + "learning_rate": 9.811788193074532e-06, + "loss": 0.4414, + "step": 3965 + }, + { + "epoch": 0.12, + "grad_norm": 1.6929462637221464, + "learning_rate": 9.811660511753199e-06, + "loss": 0.4784, + "step": 3966 + }, + { + "epoch": 0.12, + "grad_norm": 1.5740877430972173, + "learning_rate": 9.81153278796891e-06, + "loss": 0.44, + "step": 3967 + }, + { + "epoch": 0.12, + "grad_norm": 1.8934820271454096, + "learning_rate": 9.811405021722788e-06, + "loss": 0.4146, + "step": 3968 + }, + { + "epoch": 0.12, + "grad_norm": 1.6329815166791168, + "learning_rate": 9.811277213015965e-06, + "loss": 0.428, + "step": 3969 + }, + { + "epoch": 0.12, + "grad_norm": 1.6300209355760371, + "learning_rate": 9.811149361849569e-06, + "loss": 0.4175, + "step": 3970 + }, + { + "epoch": 0.12, + "grad_norm": 1.6725682735609906, + "learning_rate": 9.811021468224721e-06, + "loss": 0.4281, + "step": 3971 + }, + { + "epoch": 0.12, + "grad_norm": 1.0707950385204157, + "learning_rate": 9.810893532142561e-06, + "loss": 0.5644, + "step": 3972 + }, + { + "epoch": 0.12, + "grad_norm": 2.0140972964790618, + "learning_rate": 9.810765553604209e-06, + "loss": 0.4171, + "step": 3973 + }, + { + "epoch": 0.12, + "grad_norm": 1.8423004620558245, + "learning_rate": 9.810637532610799e-06, + "loss": 0.4174, + "step": 3974 + }, + { + "epoch": 0.12, + "grad_norm": 2.246382265846777, + "learning_rate": 9.810509469163459e-06, + "loss": 0.3855, + "step": 3975 + }, + { + "epoch": 0.12, + "grad_norm": 1.8071614754172796, + "learning_rate": 9.81038136326332e-06, + "loss": 0.4443, + "step": 3976 + }, + { + "epoch": 0.12, + "grad_norm": 1.486047179037751, + "learning_rate": 9.81025321491151e-06, + "loss": 0.4217, + "step": 3977 + }, + { + "epoch": 0.12, + "grad_norm": 1.8813959747632578, + "learning_rate": 9.810125024109164e-06, + "loss": 0.4117, + "step": 3978 + }, + { + "epoch": 0.12, + "grad_norm": 1.5261926454877162, + "learning_rate": 9.80999679085741e-06, + "loss": 0.3921, + "step": 3979 + }, + { + "epoch": 0.12, + "grad_norm": 1.607091043328886, + "learning_rate": 9.809868515157383e-06, + "loss": 0.4114, + "step": 3980 + }, + { + "epoch": 0.12, + "grad_norm": 1.5360257221798985, + "learning_rate": 9.809740197010212e-06, + "loss": 0.42, + "step": 3981 + }, + { + "epoch": 0.12, + "grad_norm": 1.7620425415862546, + "learning_rate": 9.80961183641703e-06, + "loss": 0.4444, + "step": 3982 + }, + { + "epoch": 0.12, + "grad_norm": 1.9173183885421279, + "learning_rate": 9.80948343337897e-06, + "loss": 0.4018, + "step": 3983 + }, + { + "epoch": 0.12, + "grad_norm": 2.583300536487216, + "learning_rate": 9.809354987897166e-06, + "loss": 0.4122, + "step": 3984 + }, + { + "epoch": 0.12, + "grad_norm": 1.4932300501650084, + "learning_rate": 9.809226499972751e-06, + "loss": 0.406, + "step": 3985 + }, + { + "epoch": 0.12, + "grad_norm": 1.568705396295761, + "learning_rate": 9.809097969606857e-06, + "loss": 0.4028, + "step": 3986 + }, + { + "epoch": 0.12, + "grad_norm": 1.4982739900808986, + "learning_rate": 9.808969396800623e-06, + "loss": 0.4191, + "step": 3987 + }, + { + "epoch": 0.12, + "grad_norm": 1.5827113117301383, + "learning_rate": 9.808840781555179e-06, + "loss": 0.4506, + "step": 3988 + }, + { + "epoch": 0.12, + "grad_norm": 1.6967717273089833, + "learning_rate": 9.808712123871663e-06, + "loss": 0.4549, + "step": 3989 + }, + { + "epoch": 0.12, + "grad_norm": 1.6793722816110976, + "learning_rate": 9.808583423751208e-06, + "loss": 0.4527, + "step": 3990 + }, + { + "epoch": 0.12, + "grad_norm": 1.6537346402383053, + "learning_rate": 9.80845468119495e-06, + "loss": 0.4228, + "step": 3991 + }, + { + "epoch": 0.12, + "grad_norm": 1.4426883367365317, + "learning_rate": 9.808325896204027e-06, + "loss": 0.4002, + "step": 3992 + }, + { + "epoch": 0.12, + "grad_norm": 1.84443130875003, + "learning_rate": 9.808197068779574e-06, + "loss": 0.4672, + "step": 3993 + }, + { + "epoch": 0.12, + "grad_norm": 1.5153855493693844, + "learning_rate": 9.80806819892273e-06, + "loss": 0.4133, + "step": 3994 + }, + { + "epoch": 0.12, + "grad_norm": 1.0896421668274887, + "learning_rate": 9.807939286634631e-06, + "loss": 0.6554, + "step": 3995 + }, + { + "epoch": 0.12, + "grad_norm": 1.5713458372689801, + "learning_rate": 9.807810331916415e-06, + "loss": 0.434, + "step": 3996 + }, + { + "epoch": 0.12, + "grad_norm": 1.6605006030585008, + "learning_rate": 9.807681334769217e-06, + "loss": 0.4303, + "step": 3997 + }, + { + "epoch": 0.12, + "grad_norm": 1.767858495364658, + "learning_rate": 9.807552295194178e-06, + "loss": 0.4401, + "step": 3998 + }, + { + "epoch": 0.12, + "grad_norm": 1.4058672265303747, + "learning_rate": 9.807423213192438e-06, + "loss": 0.3694, + "step": 3999 + }, + { + "epoch": 0.12, + "grad_norm": 1.7105781191457423, + "learning_rate": 9.807294088765135e-06, + "loss": 0.4453, + "step": 4000 + }, + { + "epoch": 0.12, + "grad_norm": 2.05679698416272, + "learning_rate": 9.807164921913407e-06, + "loss": 0.4039, + "step": 4001 + }, + { + "epoch": 0.12, + "grad_norm": 1.3733047506520641, + "learning_rate": 9.807035712638397e-06, + "loss": 0.4083, + "step": 4002 + }, + { + "epoch": 0.12, + "grad_norm": 1.4793065931730252, + "learning_rate": 9.806906460941243e-06, + "loss": 0.4298, + "step": 4003 + }, + { + "epoch": 0.12, + "grad_norm": 1.4285638153674227, + "learning_rate": 9.806777166823084e-06, + "loss": 0.4026, + "step": 4004 + }, + { + "epoch": 0.12, + "grad_norm": 1.6535341371153456, + "learning_rate": 9.806647830285065e-06, + "loss": 0.4121, + "step": 4005 + }, + { + "epoch": 0.12, + "grad_norm": 1.45118712041827, + "learning_rate": 9.806518451328327e-06, + "loss": 0.4016, + "step": 4006 + }, + { + "epoch": 0.12, + "grad_norm": 1.4836684209660747, + "learning_rate": 9.806389029954009e-06, + "loss": 0.4022, + "step": 4007 + }, + { + "epoch": 0.12, + "grad_norm": 1.6079483730652142, + "learning_rate": 9.806259566163253e-06, + "loss": 0.4251, + "step": 4008 + }, + { + "epoch": 0.12, + "grad_norm": 1.05922396608415, + "learning_rate": 9.806130059957205e-06, + "loss": 0.6591, + "step": 4009 + }, + { + "epoch": 0.12, + "grad_norm": 1.5473553661622472, + "learning_rate": 9.806000511337004e-06, + "loss": 0.4342, + "step": 4010 + }, + { + "epoch": 0.12, + "grad_norm": 1.6015093464890324, + "learning_rate": 9.805870920303798e-06, + "loss": 0.4598, + "step": 4011 + }, + { + "epoch": 0.12, + "grad_norm": 1.691633848612007, + "learning_rate": 9.805741286858725e-06, + "loss": 0.4235, + "step": 4012 + }, + { + "epoch": 0.12, + "grad_norm": 1.779819920517001, + "learning_rate": 9.805611611002934e-06, + "loss": 0.4483, + "step": 4013 + }, + { + "epoch": 0.12, + "grad_norm": 1.8763612647534198, + "learning_rate": 9.805481892737565e-06, + "loss": 0.4534, + "step": 4014 + }, + { + "epoch": 0.12, + "grad_norm": 4.087826568505183, + "learning_rate": 9.805352132063766e-06, + "loss": 0.4183, + "step": 4015 + }, + { + "epoch": 0.12, + "grad_norm": 1.6328674004771024, + "learning_rate": 9.805222328982681e-06, + "loss": 0.4202, + "step": 4016 + }, + { + "epoch": 0.12, + "grad_norm": 1.5878946478637603, + "learning_rate": 9.805092483495458e-06, + "loss": 0.4074, + "step": 4017 + }, + { + "epoch": 0.12, + "grad_norm": 1.472012645433971, + "learning_rate": 9.804962595603237e-06, + "loss": 0.3763, + "step": 4018 + }, + { + "epoch": 0.12, + "grad_norm": 1.82617620933314, + "learning_rate": 9.80483266530717e-06, + "loss": 0.4384, + "step": 4019 + }, + { + "epoch": 0.12, + "grad_norm": 1.8387318211837655, + "learning_rate": 9.8047026926084e-06, + "loss": 0.4047, + "step": 4020 + }, + { + "epoch": 0.12, + "grad_norm": 1.6309579666511782, + "learning_rate": 9.804572677508077e-06, + "loss": 0.4398, + "step": 4021 + }, + { + "epoch": 0.12, + "grad_norm": 1.7940138265323637, + "learning_rate": 9.804442620007347e-06, + "loss": 0.4309, + "step": 4022 + }, + { + "epoch": 0.12, + "grad_norm": 1.6628311180761064, + "learning_rate": 9.804312520107355e-06, + "loss": 0.4138, + "step": 4023 + }, + { + "epoch": 0.12, + "grad_norm": 1.5083596063614177, + "learning_rate": 9.804182377809255e-06, + "loss": 0.442, + "step": 4024 + }, + { + "epoch": 0.12, + "grad_norm": 1.61043572745935, + "learning_rate": 9.80405219311419e-06, + "loss": 0.4139, + "step": 4025 + }, + { + "epoch": 0.12, + "grad_norm": 1.514294585555458, + "learning_rate": 9.803921966023311e-06, + "loss": 0.4279, + "step": 4026 + }, + { + "epoch": 0.12, + "grad_norm": 1.542350619647203, + "learning_rate": 9.803791696537767e-06, + "loss": 0.4196, + "step": 4027 + }, + { + "epoch": 0.12, + "grad_norm": 1.907110969724602, + "learning_rate": 9.80366138465871e-06, + "loss": 0.468, + "step": 4028 + }, + { + "epoch": 0.12, + "grad_norm": 1.8048772202302497, + "learning_rate": 9.803531030387288e-06, + "loss": 0.4122, + "step": 4029 + }, + { + "epoch": 0.12, + "grad_norm": 2.2539252442413, + "learning_rate": 9.803400633724649e-06, + "loss": 0.4286, + "step": 4030 + }, + { + "epoch": 0.12, + "grad_norm": 1.552082616493252, + "learning_rate": 9.803270194671947e-06, + "loss": 0.4495, + "step": 4031 + }, + { + "epoch": 0.12, + "grad_norm": 1.0984948947815725, + "learning_rate": 9.803139713230333e-06, + "loss": 0.5828, + "step": 4032 + }, + { + "epoch": 0.12, + "grad_norm": 1.997098079425519, + "learning_rate": 9.803009189400958e-06, + "loss": 0.4209, + "step": 4033 + }, + { + "epoch": 0.12, + "grad_norm": 1.8827708604481659, + "learning_rate": 9.802878623184972e-06, + "loss": 0.4639, + "step": 4034 + }, + { + "epoch": 0.12, + "grad_norm": 1.6082602943661626, + "learning_rate": 9.802748014583532e-06, + "loss": 0.3923, + "step": 4035 + }, + { + "epoch": 0.12, + "grad_norm": 1.5708748769669758, + "learning_rate": 9.802617363597785e-06, + "loss": 0.4201, + "step": 4036 + }, + { + "epoch": 0.12, + "grad_norm": 1.6092672144987177, + "learning_rate": 9.802486670228887e-06, + "loss": 0.4191, + "step": 4037 + }, + { + "epoch": 0.12, + "grad_norm": 1.5553187283661134, + "learning_rate": 9.80235593447799e-06, + "loss": 0.3963, + "step": 4038 + }, + { + "epoch": 0.12, + "grad_norm": 1.7793629946700227, + "learning_rate": 9.80222515634625e-06, + "loss": 0.4493, + "step": 4039 + }, + { + "epoch": 0.12, + "grad_norm": 1.5913292989919317, + "learning_rate": 9.802094335834819e-06, + "loss": 0.4498, + "step": 4040 + }, + { + "epoch": 0.12, + "grad_norm": 1.4885876788002523, + "learning_rate": 9.801963472944852e-06, + "loss": 0.396, + "step": 4041 + }, + { + "epoch": 0.12, + "grad_norm": 1.5562436461535045, + "learning_rate": 9.801832567677504e-06, + "loss": 0.4386, + "step": 4042 + }, + { + "epoch": 0.12, + "grad_norm": 1.5805316742128668, + "learning_rate": 9.801701620033932e-06, + "loss": 0.3966, + "step": 4043 + }, + { + "epoch": 0.12, + "grad_norm": 1.5438800078379247, + "learning_rate": 9.801570630015289e-06, + "loss": 0.3891, + "step": 4044 + }, + { + "epoch": 0.12, + "grad_norm": 1.4673537544742414, + "learning_rate": 9.801439597622731e-06, + "loss": 0.4135, + "step": 4045 + }, + { + "epoch": 0.12, + "grad_norm": 1.5576636451380552, + "learning_rate": 9.801308522857415e-06, + "loss": 0.4265, + "step": 4046 + }, + { + "epoch": 0.12, + "grad_norm": 1.9908080690714058, + "learning_rate": 9.801177405720498e-06, + "loss": 0.4832, + "step": 4047 + }, + { + "epoch": 0.12, + "grad_norm": 1.4385981106514478, + "learning_rate": 9.801046246213139e-06, + "loss": 0.4069, + "step": 4048 + }, + { + "epoch": 0.12, + "grad_norm": 1.5822008287810547, + "learning_rate": 9.800915044336494e-06, + "loss": 0.4372, + "step": 4049 + }, + { + "epoch": 0.12, + "grad_norm": 1.5847387228694765, + "learning_rate": 9.800783800091718e-06, + "loss": 0.4119, + "step": 4050 + }, + { + "epoch": 0.12, + "grad_norm": 1.6359913300474027, + "learning_rate": 9.800652513479973e-06, + "loss": 0.516, + "step": 4051 + }, + { + "epoch": 0.12, + "grad_norm": 1.4949122925198284, + "learning_rate": 9.800521184502417e-06, + "loss": 0.3902, + "step": 4052 + }, + { + "epoch": 0.12, + "grad_norm": 1.5447911106467715, + "learning_rate": 9.800389813160207e-06, + "loss": 0.4507, + "step": 4053 + }, + { + "epoch": 0.12, + "grad_norm": 1.713937489240634, + "learning_rate": 9.800258399454504e-06, + "loss": 0.4401, + "step": 4054 + }, + { + "epoch": 0.12, + "grad_norm": 1.651654021332635, + "learning_rate": 9.800126943386468e-06, + "loss": 0.382, + "step": 4055 + }, + { + "epoch": 0.12, + "grad_norm": 1.7652889068850979, + "learning_rate": 9.799995444957258e-06, + "loss": 0.4568, + "step": 4056 + }, + { + "epoch": 0.12, + "grad_norm": 1.5072344684510346, + "learning_rate": 9.799863904168035e-06, + "loss": 0.3995, + "step": 4057 + }, + { + "epoch": 0.12, + "grad_norm": 1.5459441877433313, + "learning_rate": 9.799732321019959e-06, + "loss": 0.4157, + "step": 4058 + }, + { + "epoch": 0.12, + "grad_norm": 1.0343141949070378, + "learning_rate": 9.799600695514193e-06, + "loss": 0.6196, + "step": 4059 + }, + { + "epoch": 0.12, + "grad_norm": 1.5832987440328528, + "learning_rate": 9.799469027651897e-06, + "loss": 0.4263, + "step": 4060 + }, + { + "epoch": 0.12, + "grad_norm": 1.6279830740369432, + "learning_rate": 9.799337317434232e-06, + "loss": 0.4503, + "step": 4061 + }, + { + "epoch": 0.12, + "grad_norm": 1.4883961318230698, + "learning_rate": 9.799205564862366e-06, + "loss": 0.3966, + "step": 4062 + }, + { + "epoch": 0.12, + "grad_norm": 3.698075558567797, + "learning_rate": 9.799073769937455e-06, + "loss": 0.3875, + "step": 4063 + }, + { + "epoch": 0.12, + "grad_norm": 2.0927338937166136, + "learning_rate": 9.798941932660666e-06, + "loss": 0.4395, + "step": 4064 + }, + { + "epoch": 0.12, + "grad_norm": 1.6051596234904857, + "learning_rate": 9.79881005303316e-06, + "loss": 0.38, + "step": 4065 + }, + { + "epoch": 0.12, + "grad_norm": 2.5049718555891682, + "learning_rate": 9.798678131056102e-06, + "loss": 0.4508, + "step": 4066 + }, + { + "epoch": 0.12, + "grad_norm": 1.6644776944186437, + "learning_rate": 9.798546166730657e-06, + "loss": 0.4501, + "step": 4067 + }, + { + "epoch": 0.12, + "grad_norm": 1.7984423092686348, + "learning_rate": 9.79841416005799e-06, + "loss": 0.3955, + "step": 4068 + }, + { + "epoch": 0.12, + "grad_norm": 1.5026018379814687, + "learning_rate": 9.798282111039262e-06, + "loss": 0.3976, + "step": 4069 + }, + { + "epoch": 0.12, + "grad_norm": 1.7713202845703369, + "learning_rate": 9.798150019675642e-06, + "loss": 0.4237, + "step": 4070 + }, + { + "epoch": 0.12, + "grad_norm": 1.548694429634641, + "learning_rate": 9.798017885968295e-06, + "loss": 0.4072, + "step": 4071 + }, + { + "epoch": 0.12, + "grad_norm": 1.4647098942370185, + "learning_rate": 9.797885709918388e-06, + "loss": 0.4017, + "step": 4072 + }, + { + "epoch": 0.12, + "grad_norm": 2.4265938392161823, + "learning_rate": 9.797753491527086e-06, + "loss": 0.4308, + "step": 4073 + }, + { + "epoch": 0.12, + "grad_norm": 1.480940618143284, + "learning_rate": 9.797621230795555e-06, + "loss": 0.3897, + "step": 4074 + }, + { + "epoch": 0.12, + "grad_norm": 1.5792203682819836, + "learning_rate": 9.797488927724964e-06, + "loss": 0.4566, + "step": 4075 + }, + { + "epoch": 0.12, + "grad_norm": 1.758582135864997, + "learning_rate": 9.797356582316481e-06, + "loss": 0.3905, + "step": 4076 + }, + { + "epoch": 0.12, + "grad_norm": 1.6330077000314034, + "learning_rate": 9.797224194571273e-06, + "loss": 0.3934, + "step": 4077 + }, + { + "epoch": 0.12, + "grad_norm": 1.7596608891633945, + "learning_rate": 9.797091764490507e-06, + "loss": 0.398, + "step": 4078 + }, + { + "epoch": 0.12, + "grad_norm": 2.8414248450621624, + "learning_rate": 9.796959292075353e-06, + "loss": 0.4301, + "step": 4079 + }, + { + "epoch": 0.12, + "grad_norm": 1.869972340985288, + "learning_rate": 9.796826777326981e-06, + "loss": 0.4323, + "step": 4080 + }, + { + "epoch": 0.12, + "grad_norm": 1.5574264068175852, + "learning_rate": 9.796694220246558e-06, + "loss": 0.4392, + "step": 4081 + }, + { + "epoch": 0.12, + "grad_norm": 1.6275929971355179, + "learning_rate": 9.796561620835257e-06, + "loss": 0.4532, + "step": 4082 + }, + { + "epoch": 0.12, + "grad_norm": 1.5437344456830122, + "learning_rate": 9.796428979094245e-06, + "loss": 0.3958, + "step": 4083 + }, + { + "epoch": 0.12, + "grad_norm": 1.814898944287242, + "learning_rate": 9.796296295024696e-06, + "loss": 0.4282, + "step": 4084 + }, + { + "epoch": 0.12, + "grad_norm": 1.6056893770173397, + "learning_rate": 9.796163568627778e-06, + "loss": 0.3856, + "step": 4085 + }, + { + "epoch": 0.12, + "grad_norm": 1.8673534722110656, + "learning_rate": 9.796030799904664e-06, + "loss": 0.4699, + "step": 4086 + }, + { + "epoch": 0.12, + "grad_norm": 1.5457419807435857, + "learning_rate": 9.795897988856523e-06, + "loss": 0.4614, + "step": 4087 + }, + { + "epoch": 0.12, + "grad_norm": 2.017549807414, + "learning_rate": 9.795765135484529e-06, + "loss": 0.3915, + "step": 4088 + }, + { + "epoch": 0.12, + "grad_norm": 1.4684678303333347, + "learning_rate": 9.795632239789856e-06, + "loss": 0.4089, + "step": 4089 + }, + { + "epoch": 0.12, + "grad_norm": 1.6457437409193811, + "learning_rate": 9.795499301773677e-06, + "loss": 0.3853, + "step": 4090 + }, + { + "epoch": 0.12, + "grad_norm": 1.6513571806410339, + "learning_rate": 9.79536632143716e-06, + "loss": 0.4567, + "step": 4091 + }, + { + "epoch": 0.12, + "grad_norm": 1.6254082746548162, + "learning_rate": 9.795233298781483e-06, + "loss": 0.4044, + "step": 4092 + }, + { + "epoch": 0.12, + "grad_norm": 1.992560152686689, + "learning_rate": 9.79510023380782e-06, + "loss": 0.4349, + "step": 4093 + }, + { + "epoch": 0.12, + "grad_norm": 1.694178773339725, + "learning_rate": 9.794967126517342e-06, + "loss": 0.5109, + "step": 4094 + }, + { + "epoch": 0.12, + "grad_norm": 1.8466698819180625, + "learning_rate": 9.794833976911228e-06, + "loss": 0.4393, + "step": 4095 + }, + { + "epoch": 0.12, + "grad_norm": 1.6143472771985432, + "learning_rate": 9.794700784990652e-06, + "loss": 0.4214, + "step": 4096 + }, + { + "epoch": 0.12, + "grad_norm": 1.642873705947386, + "learning_rate": 9.794567550756786e-06, + "loss": 0.457, + "step": 4097 + }, + { + "epoch": 0.12, + "grad_norm": 1.693521244338555, + "learning_rate": 9.79443427421081e-06, + "loss": 0.434, + "step": 4098 + }, + { + "epoch": 0.12, + "grad_norm": 1.9868942540676047, + "learning_rate": 9.794300955353898e-06, + "loss": 0.4398, + "step": 4099 + }, + { + "epoch": 0.12, + "grad_norm": 1.0597617692493557, + "learning_rate": 9.794167594187226e-06, + "loss": 0.5605, + "step": 4100 + }, + { + "epoch": 0.12, + "grad_norm": 1.8043430600570805, + "learning_rate": 9.794034190711971e-06, + "loss": 0.4509, + "step": 4101 + }, + { + "epoch": 0.12, + "grad_norm": 1.7095800586018255, + "learning_rate": 9.793900744929312e-06, + "loss": 0.4034, + "step": 4102 + }, + { + "epoch": 0.12, + "grad_norm": 1.5298021211097679, + "learning_rate": 9.793767256840426e-06, + "loss": 0.4477, + "step": 4103 + }, + { + "epoch": 0.12, + "grad_norm": 1.5222103609500528, + "learning_rate": 9.793633726446492e-06, + "loss": 0.4001, + "step": 4104 + }, + { + "epoch": 0.12, + "grad_norm": 1.7539889857455697, + "learning_rate": 9.793500153748684e-06, + "loss": 0.4313, + "step": 4105 + }, + { + "epoch": 0.12, + "grad_norm": 1.565757154096113, + "learning_rate": 9.793366538748186e-06, + "loss": 0.4385, + "step": 4106 + }, + { + "epoch": 0.12, + "grad_norm": 1.624686344773956, + "learning_rate": 9.793232881446175e-06, + "loss": 0.411, + "step": 4107 + }, + { + "epoch": 0.12, + "grad_norm": 1.9756788796682263, + "learning_rate": 9.79309918184383e-06, + "loss": 0.3941, + "step": 4108 + }, + { + "epoch": 0.12, + "grad_norm": 1.5908236470390948, + "learning_rate": 9.792965439942333e-06, + "loss": 0.4273, + "step": 4109 + }, + { + "epoch": 0.12, + "grad_norm": 1.0295891897464344, + "learning_rate": 9.792831655742862e-06, + "loss": 0.5691, + "step": 4110 + }, + { + "epoch": 0.12, + "grad_norm": 1.6015473068217858, + "learning_rate": 9.792697829246597e-06, + "loss": 0.4574, + "step": 4111 + }, + { + "epoch": 0.12, + "grad_norm": 1.0333011216106374, + "learning_rate": 9.792563960454723e-06, + "loss": 0.5816, + "step": 4112 + }, + { + "epoch": 0.12, + "grad_norm": 1.8700860859075896, + "learning_rate": 9.792430049368418e-06, + "loss": 0.4501, + "step": 4113 + }, + { + "epoch": 0.12, + "grad_norm": 1.7901069045892726, + "learning_rate": 9.792296095988865e-06, + "loss": 0.4093, + "step": 4114 + }, + { + "epoch": 0.12, + "grad_norm": 1.876068109006697, + "learning_rate": 9.792162100317245e-06, + "loss": 0.4026, + "step": 4115 + }, + { + "epoch": 0.12, + "grad_norm": 1.5587976790279503, + "learning_rate": 9.79202806235474e-06, + "loss": 0.4078, + "step": 4116 + }, + { + "epoch": 0.12, + "grad_norm": 1.7762401309006162, + "learning_rate": 9.791893982102537e-06, + "loss": 0.4268, + "step": 4117 + }, + { + "epoch": 0.12, + "grad_norm": 1.6811377554627307, + "learning_rate": 9.791759859561816e-06, + "loss": 0.4009, + "step": 4118 + }, + { + "epoch": 0.12, + "grad_norm": 2.9085749836339545, + "learning_rate": 9.79162569473376e-06, + "loss": 0.378, + "step": 4119 + }, + { + "epoch": 0.12, + "grad_norm": 1.5922367244362459, + "learning_rate": 9.791491487619552e-06, + "loss": 0.3967, + "step": 4120 + }, + { + "epoch": 0.12, + "grad_norm": 3.136274366598684, + "learning_rate": 9.791357238220383e-06, + "loss": 0.4336, + "step": 4121 + }, + { + "epoch": 0.12, + "grad_norm": 1.6663938933035107, + "learning_rate": 9.79122294653743e-06, + "loss": 0.4361, + "step": 4122 + }, + { + "epoch": 0.12, + "grad_norm": 1.605378333312756, + "learning_rate": 9.791088612571884e-06, + "loss": 0.4063, + "step": 4123 + }, + { + "epoch": 0.12, + "grad_norm": 1.622194969783268, + "learning_rate": 9.790954236324926e-06, + "loss": 0.4916, + "step": 4124 + }, + { + "epoch": 0.12, + "grad_norm": 2.376280572136697, + "learning_rate": 9.790819817797744e-06, + "loss": 0.467, + "step": 4125 + }, + { + "epoch": 0.12, + "grad_norm": 1.5447004899695072, + "learning_rate": 9.790685356991522e-06, + "loss": 0.4281, + "step": 4126 + }, + { + "epoch": 0.12, + "grad_norm": 1.9883501536823192, + "learning_rate": 9.79055085390745e-06, + "loss": 0.4301, + "step": 4127 + }, + { + "epoch": 0.12, + "grad_norm": 1.6088923818845675, + "learning_rate": 9.790416308546715e-06, + "loss": 0.4103, + "step": 4128 + }, + { + "epoch": 0.12, + "grad_norm": 1.5612625152820165, + "learning_rate": 9.790281720910504e-06, + "loss": 0.4324, + "step": 4129 + }, + { + "epoch": 0.12, + "grad_norm": 1.927882319402844, + "learning_rate": 9.790147091e-06, + "loss": 0.4108, + "step": 4130 + }, + { + "epoch": 0.12, + "grad_norm": 2.422570553807996, + "learning_rate": 9.790012418816397e-06, + "loss": 0.4122, + "step": 4131 + }, + { + "epoch": 0.12, + "grad_norm": 1.8298972950930066, + "learning_rate": 9.789877704360879e-06, + "loss": 0.4074, + "step": 4132 + }, + { + "epoch": 0.12, + "grad_norm": 1.8660205561049852, + "learning_rate": 9.789742947634639e-06, + "loss": 0.398, + "step": 4133 + }, + { + "epoch": 0.12, + "grad_norm": 1.8160278450971996, + "learning_rate": 9.789608148638865e-06, + "loss": 0.4621, + "step": 4134 + }, + { + "epoch": 0.12, + "grad_norm": 1.6747226935940347, + "learning_rate": 9.789473307374746e-06, + "loss": 0.4099, + "step": 4135 + }, + { + "epoch": 0.12, + "grad_norm": 1.631310852776947, + "learning_rate": 9.789338423843471e-06, + "loss": 0.4051, + "step": 4136 + }, + { + "epoch": 0.12, + "grad_norm": 1.5024176167746897, + "learning_rate": 9.789203498046231e-06, + "loss": 0.4208, + "step": 4137 + }, + { + "epoch": 0.12, + "grad_norm": 1.4725022503116156, + "learning_rate": 9.789068529984217e-06, + "loss": 0.3962, + "step": 4138 + }, + { + "epoch": 0.12, + "grad_norm": 1.7772469234290311, + "learning_rate": 9.78893351965862e-06, + "loss": 0.3741, + "step": 4139 + }, + { + "epoch": 0.12, + "grad_norm": 1.5873386647103698, + "learning_rate": 9.788798467070633e-06, + "loss": 0.3837, + "step": 4140 + }, + { + "epoch": 0.12, + "grad_norm": 1.4010386202651002, + "learning_rate": 9.788663372221445e-06, + "loss": 0.4078, + "step": 4141 + }, + { + "epoch": 0.12, + "grad_norm": 1.628011027993518, + "learning_rate": 9.788528235112251e-06, + "loss": 0.4625, + "step": 4142 + }, + { + "epoch": 0.12, + "grad_norm": 2.552795023920832, + "learning_rate": 9.788393055744241e-06, + "loss": 0.4318, + "step": 4143 + }, + { + "epoch": 0.12, + "grad_norm": 1.7441027637960667, + "learning_rate": 9.788257834118608e-06, + "loss": 0.4365, + "step": 4144 + }, + { + "epoch": 0.12, + "grad_norm": 1.7202595146132118, + "learning_rate": 9.788122570236548e-06, + "loss": 0.4132, + "step": 4145 + }, + { + "epoch": 0.12, + "grad_norm": 1.548146421281086, + "learning_rate": 9.787987264099253e-06, + "loss": 0.4417, + "step": 4146 + }, + { + "epoch": 0.12, + "grad_norm": 1.4354105609043573, + "learning_rate": 9.787851915707918e-06, + "loss": 0.3992, + "step": 4147 + }, + { + "epoch": 0.12, + "grad_norm": 1.4191769598866748, + "learning_rate": 9.787716525063737e-06, + "loss": 0.3991, + "step": 4148 + }, + { + "epoch": 0.12, + "grad_norm": 1.6926115965945407, + "learning_rate": 9.787581092167902e-06, + "loss": 0.4354, + "step": 4149 + }, + { + "epoch": 0.12, + "grad_norm": 1.7228189470706388, + "learning_rate": 9.787445617021613e-06, + "loss": 0.4347, + "step": 4150 + }, + { + "epoch": 0.12, + "grad_norm": 1.5543090430489208, + "learning_rate": 9.787310099626063e-06, + "loss": 0.4165, + "step": 4151 + }, + { + "epoch": 0.12, + "grad_norm": 1.4518709294523728, + "learning_rate": 9.787174539982447e-06, + "loss": 0.412, + "step": 4152 + }, + { + "epoch": 0.12, + "grad_norm": 1.5675889737084112, + "learning_rate": 9.787038938091964e-06, + "loss": 0.3963, + "step": 4153 + }, + { + "epoch": 0.12, + "grad_norm": 1.464900094692388, + "learning_rate": 9.78690329395581e-06, + "loss": 0.3981, + "step": 4154 + }, + { + "epoch": 0.12, + "grad_norm": 1.6447045570558183, + "learning_rate": 9.78676760757518e-06, + "loss": 0.4268, + "step": 4155 + }, + { + "epoch": 0.12, + "grad_norm": 1.5606372180828494, + "learning_rate": 9.786631878951273e-06, + "loss": 0.4483, + "step": 4156 + }, + { + "epoch": 0.12, + "grad_norm": 1.5176571275111062, + "learning_rate": 9.786496108085286e-06, + "loss": 0.4318, + "step": 4157 + }, + { + "epoch": 0.12, + "grad_norm": 1.582295502271281, + "learning_rate": 9.786360294978418e-06, + "loss": 0.4201, + "step": 4158 + }, + { + "epoch": 0.12, + "grad_norm": 1.7005693226516776, + "learning_rate": 9.786224439631869e-06, + "loss": 0.4438, + "step": 4159 + }, + { + "epoch": 0.12, + "grad_norm": 2.2227063849445154, + "learning_rate": 9.786088542046834e-06, + "loss": 0.4268, + "step": 4160 + }, + { + "epoch": 0.12, + "grad_norm": 1.7858798366369457, + "learning_rate": 9.785952602224516e-06, + "loss": 0.4319, + "step": 4161 + }, + { + "epoch": 0.12, + "grad_norm": 2.4641317526056525, + "learning_rate": 9.78581662016611e-06, + "loss": 0.4335, + "step": 4162 + }, + { + "epoch": 0.12, + "grad_norm": 1.9101268883116447, + "learning_rate": 9.785680595872824e-06, + "loss": 0.4179, + "step": 4163 + }, + { + "epoch": 0.12, + "grad_norm": 2.288820677977794, + "learning_rate": 9.78554452934585e-06, + "loss": 0.4069, + "step": 4164 + }, + { + "epoch": 0.12, + "grad_norm": 2.18453989473548, + "learning_rate": 9.785408420586395e-06, + "loss": 0.4593, + "step": 4165 + }, + { + "epoch": 0.12, + "grad_norm": 1.8932809910167105, + "learning_rate": 9.785272269595657e-06, + "loss": 0.4129, + "step": 4166 + }, + { + "epoch": 0.12, + "grad_norm": 1.9236368557126484, + "learning_rate": 9.785136076374836e-06, + "loss": 0.4139, + "step": 4167 + }, + { + "epoch": 0.12, + "grad_norm": 1.6364956343852906, + "learning_rate": 9.784999840925138e-06, + "loss": 0.4002, + "step": 4168 + }, + { + "epoch": 0.12, + "grad_norm": 1.4835596348085005, + "learning_rate": 9.784863563247762e-06, + "loss": 0.3974, + "step": 4169 + }, + { + "epoch": 0.12, + "grad_norm": 1.5807003688065588, + "learning_rate": 9.784727243343913e-06, + "loss": 0.4125, + "step": 4170 + }, + { + "epoch": 0.12, + "grad_norm": 2.242938516536883, + "learning_rate": 9.784590881214793e-06, + "loss": 0.4222, + "step": 4171 + }, + { + "epoch": 0.12, + "grad_norm": 1.7505308265087438, + "learning_rate": 9.784454476861606e-06, + "loss": 0.4186, + "step": 4172 + }, + { + "epoch": 0.12, + "grad_norm": 1.9115946972413693, + "learning_rate": 9.784318030285553e-06, + "loss": 0.4261, + "step": 4173 + }, + { + "epoch": 0.12, + "grad_norm": 1.77163389691782, + "learning_rate": 9.784181541487843e-06, + "loss": 0.4611, + "step": 4174 + }, + { + "epoch": 0.12, + "grad_norm": 1.459409636305784, + "learning_rate": 9.784045010469676e-06, + "loss": 0.3905, + "step": 4175 + }, + { + "epoch": 0.12, + "grad_norm": 1.6876704762737784, + "learning_rate": 9.78390843723226e-06, + "loss": 0.4436, + "step": 4176 + }, + { + "epoch": 0.12, + "grad_norm": 1.769386599967865, + "learning_rate": 9.783771821776798e-06, + "loss": 0.4053, + "step": 4177 + }, + { + "epoch": 0.12, + "grad_norm": 1.50476717941946, + "learning_rate": 9.783635164104497e-06, + "loss": 0.3961, + "step": 4178 + }, + { + "epoch": 0.12, + "grad_norm": 1.664244154839388, + "learning_rate": 9.783498464216563e-06, + "loss": 0.4323, + "step": 4179 + }, + { + "epoch": 0.12, + "grad_norm": 2.0596126279284723, + "learning_rate": 9.7833617221142e-06, + "loss": 0.41, + "step": 4180 + }, + { + "epoch": 0.12, + "grad_norm": 1.6795064053691624, + "learning_rate": 9.783224937798618e-06, + "loss": 0.4467, + "step": 4181 + }, + { + "epoch": 0.12, + "grad_norm": 1.5919608964987064, + "learning_rate": 9.783088111271024e-06, + "loss": 0.4025, + "step": 4182 + }, + { + "epoch": 0.12, + "grad_norm": 1.719854866572291, + "learning_rate": 9.782951242532623e-06, + "loss": 0.4237, + "step": 4183 + }, + { + "epoch": 0.12, + "grad_norm": 3.8943991103969817, + "learning_rate": 9.782814331584624e-06, + "loss": 0.3927, + "step": 4184 + }, + { + "epoch": 0.12, + "grad_norm": 1.5506668565392274, + "learning_rate": 9.782677378428236e-06, + "loss": 0.4507, + "step": 4185 + }, + { + "epoch": 0.12, + "grad_norm": 2.038860823510374, + "learning_rate": 9.782540383064668e-06, + "loss": 0.4117, + "step": 4186 + }, + { + "epoch": 0.12, + "grad_norm": 1.7798323432895153, + "learning_rate": 9.782403345495127e-06, + "loss": 0.3994, + "step": 4187 + }, + { + "epoch": 0.12, + "grad_norm": 1.706617540759462, + "learning_rate": 9.782266265720824e-06, + "loss": 0.4329, + "step": 4188 + }, + { + "epoch": 0.12, + "grad_norm": 1.4134095749205975, + "learning_rate": 9.782129143742968e-06, + "loss": 0.3974, + "step": 4189 + }, + { + "epoch": 0.12, + "grad_norm": 1.502466569764641, + "learning_rate": 9.781991979562769e-06, + "loss": 0.4073, + "step": 4190 + }, + { + "epoch": 0.12, + "grad_norm": 1.8324982332746118, + "learning_rate": 9.781854773181438e-06, + "loss": 0.4161, + "step": 4191 + }, + { + "epoch": 0.12, + "grad_norm": 1.8190431442458563, + "learning_rate": 9.781717524600184e-06, + "loss": 0.3966, + "step": 4192 + }, + { + "epoch": 0.12, + "grad_norm": 7.014808839946926, + "learning_rate": 9.781580233820223e-06, + "loss": 0.4162, + "step": 4193 + }, + { + "epoch": 0.12, + "grad_norm": 1.7263500409850658, + "learning_rate": 9.78144290084276e-06, + "loss": 0.4201, + "step": 4194 + }, + { + "epoch": 0.12, + "grad_norm": 1.53012128441775, + "learning_rate": 9.781305525669014e-06, + "loss": 0.3995, + "step": 4195 + }, + { + "epoch": 0.12, + "grad_norm": 1.7387169919348129, + "learning_rate": 9.78116810830019e-06, + "loss": 0.3996, + "step": 4196 + }, + { + "epoch": 0.12, + "grad_norm": 1.8083277226505081, + "learning_rate": 9.781030648737507e-06, + "loss": 0.394, + "step": 4197 + }, + { + "epoch": 0.12, + "grad_norm": 1.7283368018370724, + "learning_rate": 9.780893146982174e-06, + "loss": 0.4366, + "step": 4198 + }, + { + "epoch": 0.12, + "grad_norm": 1.7449633910614955, + "learning_rate": 9.780755603035406e-06, + "loss": 0.4208, + "step": 4199 + }, + { + "epoch": 0.12, + "grad_norm": 1.5523277981567183, + "learning_rate": 9.780618016898417e-06, + "loss": 0.4311, + "step": 4200 + }, + { + "epoch": 0.12, + "grad_norm": 3.2546873038621884, + "learning_rate": 9.780480388572422e-06, + "loss": 0.4273, + "step": 4201 + }, + { + "epoch": 0.12, + "grad_norm": 1.6827065306127802, + "learning_rate": 9.780342718058634e-06, + "loss": 0.4216, + "step": 4202 + }, + { + "epoch": 0.12, + "grad_norm": 1.693504251738823, + "learning_rate": 9.780205005358268e-06, + "loss": 0.4299, + "step": 4203 + }, + { + "epoch": 0.12, + "grad_norm": 1.586608475047345, + "learning_rate": 9.78006725047254e-06, + "loss": 0.3916, + "step": 4204 + }, + { + "epoch": 0.12, + "grad_norm": 1.689065904242566, + "learning_rate": 9.779929453402666e-06, + "loss": 0.4698, + "step": 4205 + }, + { + "epoch": 0.12, + "grad_norm": 1.8687248278049653, + "learning_rate": 9.77979161414986e-06, + "loss": 0.449, + "step": 4206 + }, + { + "epoch": 0.12, + "grad_norm": 1.5497023862935002, + "learning_rate": 9.779653732715341e-06, + "loss": 0.418, + "step": 4207 + }, + { + "epoch": 0.12, + "grad_norm": 1.9281057506502515, + "learning_rate": 9.779515809100326e-06, + "loss": 0.4061, + "step": 4208 + }, + { + "epoch": 0.12, + "grad_norm": 1.489085045158692, + "learning_rate": 9.77937784330603e-06, + "loss": 0.3945, + "step": 4209 + }, + { + "epoch": 0.12, + "grad_norm": 1.5997761871774052, + "learning_rate": 9.77923983533367e-06, + "loss": 0.4239, + "step": 4210 + }, + { + "epoch": 0.12, + "grad_norm": 1.9536799489137227, + "learning_rate": 9.779101785184467e-06, + "loss": 0.4379, + "step": 4211 + }, + { + "epoch": 0.12, + "grad_norm": 1.6344957656772128, + "learning_rate": 9.778963692859638e-06, + "loss": 0.4221, + "step": 4212 + }, + { + "epoch": 0.12, + "grad_norm": 1.9001378233676642, + "learning_rate": 9.7788255583604e-06, + "loss": 0.5067, + "step": 4213 + }, + { + "epoch": 0.12, + "grad_norm": 1.738251298279968, + "learning_rate": 9.778687381687975e-06, + "loss": 0.397, + "step": 4214 + }, + { + "epoch": 0.12, + "grad_norm": 1.626827861132122, + "learning_rate": 9.77854916284358e-06, + "loss": 0.4271, + "step": 4215 + }, + { + "epoch": 0.12, + "grad_norm": 2.289389318431805, + "learning_rate": 9.778410901828435e-06, + "loss": 0.395, + "step": 4216 + }, + { + "epoch": 0.12, + "grad_norm": 3.253751477429855, + "learning_rate": 9.778272598643764e-06, + "loss": 0.4216, + "step": 4217 + }, + { + "epoch": 0.12, + "grad_norm": 1.8541844170067432, + "learning_rate": 9.778134253290781e-06, + "loss": 0.3958, + "step": 4218 + }, + { + "epoch": 0.12, + "grad_norm": 1.7312469169079787, + "learning_rate": 9.77799586577071e-06, + "loss": 0.4097, + "step": 4219 + }, + { + "epoch": 0.12, + "grad_norm": 3.193588294625411, + "learning_rate": 9.777857436084775e-06, + "loss": 0.4044, + "step": 4220 + }, + { + "epoch": 0.12, + "grad_norm": 2.3078643290213505, + "learning_rate": 9.777718964234193e-06, + "loss": 0.41, + "step": 4221 + }, + { + "epoch": 0.12, + "grad_norm": 1.4366026759186434, + "learning_rate": 9.777580450220188e-06, + "loss": 0.4022, + "step": 4222 + }, + { + "epoch": 0.12, + "grad_norm": 1.6200548253764546, + "learning_rate": 9.777441894043983e-06, + "loss": 0.4348, + "step": 4223 + }, + { + "epoch": 0.12, + "grad_norm": 1.9912370238919714, + "learning_rate": 9.777303295706802e-06, + "loss": 0.4531, + "step": 4224 + }, + { + "epoch": 0.12, + "grad_norm": 3.244670813585775, + "learning_rate": 9.777164655209865e-06, + "loss": 0.4695, + "step": 4225 + }, + { + "epoch": 0.12, + "grad_norm": 2.249930002663218, + "learning_rate": 9.777025972554397e-06, + "loss": 0.447, + "step": 4226 + }, + { + "epoch": 0.12, + "grad_norm": 1.6044883697936378, + "learning_rate": 9.776887247741622e-06, + "loss": 0.4628, + "step": 4227 + }, + { + "epoch": 0.12, + "grad_norm": 1.530830969671836, + "learning_rate": 9.776748480772764e-06, + "loss": 0.4028, + "step": 4228 + }, + { + "epoch": 0.12, + "grad_norm": 1.6720774121082753, + "learning_rate": 9.776609671649046e-06, + "loss": 0.3785, + "step": 4229 + }, + { + "epoch": 0.12, + "grad_norm": 1.7998819730328823, + "learning_rate": 9.776470820371697e-06, + "loss": 0.3803, + "step": 4230 + }, + { + "epoch": 0.12, + "grad_norm": 2.0587901224664265, + "learning_rate": 9.776331926941939e-06, + "loss": 0.4612, + "step": 4231 + }, + { + "epoch": 0.12, + "grad_norm": 1.5290521013691667, + "learning_rate": 9.776192991360998e-06, + "loss": 0.3856, + "step": 4232 + }, + { + "epoch": 0.12, + "grad_norm": 1.9457156223911805, + "learning_rate": 9.776054013630104e-06, + "loss": 0.3844, + "step": 4233 + }, + { + "epoch": 0.12, + "grad_norm": 1.5395844285076667, + "learning_rate": 9.775914993750477e-06, + "loss": 0.4143, + "step": 4234 + }, + { + "epoch": 0.12, + "grad_norm": 1.7647335778123885, + "learning_rate": 9.775775931723349e-06, + "loss": 0.3996, + "step": 4235 + }, + { + "epoch": 0.12, + "grad_norm": 2.6733132251747738, + "learning_rate": 9.775636827549943e-06, + "loss": 0.4708, + "step": 4236 + }, + { + "epoch": 0.12, + "grad_norm": 1.6981901011603993, + "learning_rate": 9.775497681231491e-06, + "loss": 0.4026, + "step": 4237 + }, + { + "epoch": 0.12, + "grad_norm": 2.199800234071306, + "learning_rate": 9.77535849276922e-06, + "loss": 0.4448, + "step": 4238 + }, + { + "epoch": 0.12, + "grad_norm": 1.998610552914727, + "learning_rate": 9.775219262164356e-06, + "loss": 0.4102, + "step": 4239 + }, + { + "epoch": 0.12, + "grad_norm": 1.9589702873583907, + "learning_rate": 9.775079989418128e-06, + "loss": 0.4394, + "step": 4240 + }, + { + "epoch": 0.12, + "grad_norm": 1.5311242973313615, + "learning_rate": 9.774940674531768e-06, + "loss": 0.4005, + "step": 4241 + }, + { + "epoch": 0.12, + "grad_norm": 1.540659473644172, + "learning_rate": 9.774801317506503e-06, + "loss": 0.3973, + "step": 4242 + }, + { + "epoch": 0.12, + "grad_norm": 2.254623884013141, + "learning_rate": 9.774661918343562e-06, + "loss": 0.4282, + "step": 4243 + }, + { + "epoch": 0.12, + "grad_norm": 1.8107289406451363, + "learning_rate": 9.77452247704418e-06, + "loss": 0.4055, + "step": 4244 + }, + { + "epoch": 0.12, + "grad_norm": 1.5956902755192794, + "learning_rate": 9.774382993609581e-06, + "loss": 0.3868, + "step": 4245 + }, + { + "epoch": 0.12, + "grad_norm": 1.5292074788431425, + "learning_rate": 9.774243468041e-06, + "loss": 0.4351, + "step": 4246 + }, + { + "epoch": 0.12, + "grad_norm": 1.5311739372440039, + "learning_rate": 9.774103900339666e-06, + "loss": 0.4276, + "step": 4247 + }, + { + "epoch": 0.12, + "grad_norm": 1.6396323242582904, + "learning_rate": 9.773964290506815e-06, + "loss": 0.4131, + "step": 4248 + }, + { + "epoch": 0.12, + "grad_norm": 1.6267777549345677, + "learning_rate": 9.773824638543673e-06, + "loss": 0.3905, + "step": 4249 + }, + { + "epoch": 0.12, + "grad_norm": 1.5864274286231044, + "learning_rate": 9.77368494445148e-06, + "loss": 0.4162, + "step": 4250 + }, + { + "epoch": 0.12, + "grad_norm": 1.5580349580525614, + "learning_rate": 9.773545208231461e-06, + "loss": 0.4207, + "step": 4251 + }, + { + "epoch": 0.12, + "grad_norm": 1.5309454117313273, + "learning_rate": 9.773405429884854e-06, + "loss": 0.4114, + "step": 4252 + }, + { + "epoch": 0.12, + "grad_norm": 1.8625149165373218, + "learning_rate": 9.77326560941289e-06, + "loss": 0.4572, + "step": 4253 + }, + { + "epoch": 0.12, + "grad_norm": 1.9345655121820404, + "learning_rate": 9.773125746816805e-06, + "loss": 0.4481, + "step": 4254 + }, + { + "epoch": 0.12, + "grad_norm": 2.295208001634518, + "learning_rate": 9.772985842097832e-06, + "loss": 0.4306, + "step": 4255 + }, + { + "epoch": 0.12, + "grad_norm": 1.4991260206499568, + "learning_rate": 9.772845895257207e-06, + "loss": 0.408, + "step": 4256 + }, + { + "epoch": 0.12, + "grad_norm": 1.5292407553858078, + "learning_rate": 9.772705906296162e-06, + "loss": 0.4051, + "step": 4257 + }, + { + "epoch": 0.12, + "grad_norm": 2.0745163821967805, + "learning_rate": 9.772565875215935e-06, + "loss": 0.4268, + "step": 4258 + }, + { + "epoch": 0.12, + "grad_norm": 2.013186760622085, + "learning_rate": 9.772425802017762e-06, + "loss": 0.4093, + "step": 4259 + }, + { + "epoch": 0.12, + "grad_norm": 1.5356090962873719, + "learning_rate": 9.772285686702879e-06, + "loss": 0.4061, + "step": 4260 + }, + { + "epoch": 0.12, + "grad_norm": 1.8632714543534767, + "learning_rate": 9.772145529272519e-06, + "loss": 0.4248, + "step": 4261 + }, + { + "epoch": 0.12, + "grad_norm": 1.7885035809426046, + "learning_rate": 9.772005329727923e-06, + "loss": 0.4189, + "step": 4262 + }, + { + "epoch": 0.12, + "grad_norm": 1.3905847712811996, + "learning_rate": 9.77186508807033e-06, + "loss": 0.6694, + "step": 4263 + }, + { + "epoch": 0.12, + "grad_norm": 2.3531710687523666, + "learning_rate": 9.771724804300971e-06, + "loss": 0.4291, + "step": 4264 + }, + { + "epoch": 0.12, + "grad_norm": 1.9251828480989752, + "learning_rate": 9.77158447842109e-06, + "loss": 0.4191, + "step": 4265 + }, + { + "epoch": 0.12, + "grad_norm": 1.841930193133947, + "learning_rate": 9.77144411043192e-06, + "loss": 0.402, + "step": 4266 + }, + { + "epoch": 0.12, + "grad_norm": 1.7594611532257032, + "learning_rate": 9.771303700334705e-06, + "loss": 0.4236, + "step": 4267 + }, + { + "epoch": 0.12, + "grad_norm": 1.4394423289493068, + "learning_rate": 9.771163248130681e-06, + "loss": 0.3768, + "step": 4268 + }, + { + "epoch": 0.12, + "grad_norm": 1.5938903552453954, + "learning_rate": 9.771022753821092e-06, + "loss": 0.3904, + "step": 4269 + }, + { + "epoch": 0.12, + "grad_norm": 1.4697353559558535, + "learning_rate": 9.77088221740717e-06, + "loss": 0.4054, + "step": 4270 + }, + { + "epoch": 0.12, + "grad_norm": 1.588994138281938, + "learning_rate": 9.77074163889016e-06, + "loss": 0.4201, + "step": 4271 + }, + { + "epoch": 0.12, + "grad_norm": 1.6857573544283475, + "learning_rate": 9.770601018271304e-06, + "loss": 0.4238, + "step": 4272 + }, + { + "epoch": 0.12, + "grad_norm": 1.5386042813032812, + "learning_rate": 9.77046035555184e-06, + "loss": 0.4357, + "step": 4273 + }, + { + "epoch": 0.12, + "grad_norm": 1.753436847240475, + "learning_rate": 9.770319650733012e-06, + "loss": 0.4226, + "step": 4274 + }, + { + "epoch": 0.12, + "grad_norm": 2.097007297358318, + "learning_rate": 9.770178903816057e-06, + "loss": 0.4353, + "step": 4275 + }, + { + "epoch": 0.12, + "grad_norm": 1.726472263348216, + "learning_rate": 9.770038114802223e-06, + "loss": 0.4191, + "step": 4276 + }, + { + "epoch": 0.12, + "grad_norm": 1.5902447464009393, + "learning_rate": 9.769897283692748e-06, + "loss": 0.4253, + "step": 4277 + }, + { + "epoch": 0.12, + "grad_norm": 1.5581107426658858, + "learning_rate": 9.769756410488877e-06, + "loss": 0.4013, + "step": 4278 + }, + { + "epoch": 0.12, + "grad_norm": 1.6652108946177193, + "learning_rate": 9.769615495191853e-06, + "loss": 0.3948, + "step": 4279 + }, + { + "epoch": 0.12, + "grad_norm": 1.523157971948488, + "learning_rate": 9.769474537802918e-06, + "loss": 0.4035, + "step": 4280 + }, + { + "epoch": 0.12, + "grad_norm": 1.6841982579286223, + "learning_rate": 9.76933353832332e-06, + "loss": 0.4523, + "step": 4281 + }, + { + "epoch": 0.12, + "grad_norm": 2.5046413974894692, + "learning_rate": 9.7691924967543e-06, + "loss": 0.4211, + "step": 4282 + }, + { + "epoch": 0.12, + "grad_norm": 1.5545832808775588, + "learning_rate": 9.769051413097101e-06, + "loss": 0.4056, + "step": 4283 + }, + { + "epoch": 0.12, + "grad_norm": 1.658048324327943, + "learning_rate": 9.768910287352973e-06, + "loss": 0.3894, + "step": 4284 + }, + { + "epoch": 0.12, + "grad_norm": 1.764247658485769, + "learning_rate": 9.768769119523158e-06, + "loss": 0.4241, + "step": 4285 + }, + { + "epoch": 0.12, + "grad_norm": 1.7282753509197253, + "learning_rate": 9.768627909608903e-06, + "loss": 0.4107, + "step": 4286 + }, + { + "epoch": 0.12, + "grad_norm": 1.6662638666078098, + "learning_rate": 9.768486657611452e-06, + "loss": 0.3961, + "step": 4287 + }, + { + "epoch": 0.12, + "grad_norm": 1.7789780779567512, + "learning_rate": 9.768345363532054e-06, + "loss": 0.4249, + "step": 4288 + }, + { + "epoch": 0.12, + "grad_norm": 1.6372786740968053, + "learning_rate": 9.768204027371956e-06, + "loss": 0.402, + "step": 4289 + }, + { + "epoch": 0.12, + "grad_norm": 1.6535196536483523, + "learning_rate": 9.768062649132405e-06, + "loss": 0.4266, + "step": 4290 + }, + { + "epoch": 0.12, + "grad_norm": 1.5543577496448455, + "learning_rate": 9.767921228814647e-06, + "loss": 0.4089, + "step": 4291 + }, + { + "epoch": 0.12, + "grad_norm": 1.5971691456516597, + "learning_rate": 9.767779766419933e-06, + "loss": 0.395, + "step": 4292 + }, + { + "epoch": 0.12, + "grad_norm": 1.7534797345427162, + "learning_rate": 9.767638261949508e-06, + "loss": 0.3964, + "step": 4293 + }, + { + "epoch": 0.12, + "grad_norm": 1.50330447914733, + "learning_rate": 9.767496715404624e-06, + "loss": 0.4088, + "step": 4294 + }, + { + "epoch": 0.12, + "grad_norm": 2.3976461025043374, + "learning_rate": 9.767355126786527e-06, + "loss": 0.4213, + "step": 4295 + }, + { + "epoch": 0.12, + "grad_norm": 1.5861943413462785, + "learning_rate": 9.767213496096468e-06, + "loss": 0.3772, + "step": 4296 + }, + { + "epoch": 0.12, + "grad_norm": 2.1552966719734905, + "learning_rate": 9.767071823335697e-06, + "loss": 0.4186, + "step": 4297 + }, + { + "epoch": 0.12, + "grad_norm": 1.4952589296569374, + "learning_rate": 9.766930108505466e-06, + "loss": 0.4242, + "step": 4298 + }, + { + "epoch": 0.12, + "grad_norm": 2.3413981671998565, + "learning_rate": 9.766788351607022e-06, + "loss": 0.4282, + "step": 4299 + }, + { + "epoch": 0.12, + "grad_norm": 1.4634544508952945, + "learning_rate": 9.766646552641618e-06, + "loss": 0.3773, + "step": 4300 + }, + { + "epoch": 0.12, + "grad_norm": 1.6346198134247047, + "learning_rate": 9.766504711610507e-06, + "loss": 0.4015, + "step": 4301 + }, + { + "epoch": 0.12, + "grad_norm": 1.7578018626753946, + "learning_rate": 9.76636282851494e-06, + "loss": 0.4332, + "step": 4302 + }, + { + "epoch": 0.12, + "grad_norm": 1.7661668344344994, + "learning_rate": 9.766220903356165e-06, + "loss": 0.4502, + "step": 4303 + }, + { + "epoch": 0.12, + "grad_norm": 1.4970727664912422, + "learning_rate": 9.766078936135439e-06, + "loss": 0.3878, + "step": 4304 + }, + { + "epoch": 0.12, + "grad_norm": 1.5983421664656075, + "learning_rate": 9.765936926854014e-06, + "loss": 0.4083, + "step": 4305 + }, + { + "epoch": 0.12, + "grad_norm": 1.6843434346577209, + "learning_rate": 9.765794875513142e-06, + "loss": 0.4194, + "step": 4306 + }, + { + "epoch": 0.12, + "grad_norm": 2.0262843521354448, + "learning_rate": 9.765652782114076e-06, + "loss": 0.463, + "step": 4307 + }, + { + "epoch": 0.12, + "grad_norm": 1.5148977036414775, + "learning_rate": 9.765510646658073e-06, + "loss": 0.3945, + "step": 4308 + }, + { + "epoch": 0.12, + "grad_norm": 1.9250569449029158, + "learning_rate": 9.765368469146386e-06, + "loss": 0.4377, + "step": 4309 + }, + { + "epoch": 0.13, + "grad_norm": 1.187939819332679, + "learning_rate": 9.76522624958027e-06, + "loss": 0.6148, + "step": 4310 + }, + { + "epoch": 0.13, + "grad_norm": 1.0223515191140942, + "learning_rate": 9.765083987960977e-06, + "loss": 0.6057, + "step": 4311 + }, + { + "epoch": 0.13, + "grad_norm": 1.6727383507381852, + "learning_rate": 9.764941684289767e-06, + "loss": 0.4168, + "step": 4312 + }, + { + "epoch": 0.13, + "grad_norm": 2.604741721722529, + "learning_rate": 9.764799338567894e-06, + "loss": 0.4048, + "step": 4313 + }, + { + "epoch": 0.13, + "grad_norm": 1.6943950698720227, + "learning_rate": 9.764656950796612e-06, + "loss": 0.4196, + "step": 4314 + }, + { + "epoch": 0.13, + "grad_norm": 1.7554660438535659, + "learning_rate": 9.764514520977182e-06, + "loss": 0.3852, + "step": 4315 + }, + { + "epoch": 0.13, + "grad_norm": 1.213646568830564, + "learning_rate": 9.764372049110858e-06, + "loss": 0.5815, + "step": 4316 + }, + { + "epoch": 0.13, + "grad_norm": 2.4252890685841932, + "learning_rate": 9.764229535198896e-06, + "loss": 0.4271, + "step": 4317 + }, + { + "epoch": 0.13, + "grad_norm": 1.9308684726815033, + "learning_rate": 9.764086979242556e-06, + "loss": 0.4562, + "step": 4318 + }, + { + "epoch": 0.13, + "grad_norm": 1.7486351747865199, + "learning_rate": 9.763944381243096e-06, + "loss": 0.4155, + "step": 4319 + }, + { + "epoch": 0.13, + "grad_norm": 1.6693347265168148, + "learning_rate": 9.763801741201775e-06, + "loss": 0.4153, + "step": 4320 + }, + { + "epoch": 0.13, + "grad_norm": 3.322625839912329, + "learning_rate": 9.76365905911985e-06, + "loss": 0.4252, + "step": 4321 + }, + { + "epoch": 0.13, + "grad_norm": 1.5303353696364113, + "learning_rate": 9.763516334998579e-06, + "loss": 0.4568, + "step": 4322 + }, + { + "epoch": 0.13, + "grad_norm": 2.049140798436554, + "learning_rate": 9.763373568839226e-06, + "loss": 0.3938, + "step": 4323 + }, + { + "epoch": 0.13, + "grad_norm": 2.73060598341181, + "learning_rate": 9.763230760643048e-06, + "loss": 0.425, + "step": 4324 + }, + { + "epoch": 0.13, + "grad_norm": 1.5783799346925627, + "learning_rate": 9.763087910411306e-06, + "loss": 0.4529, + "step": 4325 + }, + { + "epoch": 0.13, + "grad_norm": 1.727480545986658, + "learning_rate": 9.762945018145259e-06, + "loss": 0.4377, + "step": 4326 + }, + { + "epoch": 0.13, + "grad_norm": 1.551358478526767, + "learning_rate": 9.76280208384617e-06, + "loss": 0.4075, + "step": 4327 + }, + { + "epoch": 0.13, + "grad_norm": 1.494054903475883, + "learning_rate": 9.7626591075153e-06, + "loss": 0.4488, + "step": 4328 + }, + { + "epoch": 0.13, + "grad_norm": 1.7048061171588142, + "learning_rate": 9.762516089153911e-06, + "loss": 0.4193, + "step": 4329 + }, + { + "epoch": 0.13, + "grad_norm": 2.293189643541097, + "learning_rate": 9.762373028763263e-06, + "loss": 0.4621, + "step": 4330 + }, + { + "epoch": 0.13, + "grad_norm": 2.0356534371538166, + "learning_rate": 9.762229926344622e-06, + "loss": 0.4155, + "step": 4331 + }, + { + "epoch": 0.13, + "grad_norm": 1.6449650302469236, + "learning_rate": 9.762086781899249e-06, + "loss": 0.3927, + "step": 4332 + }, + { + "epoch": 0.13, + "grad_norm": 1.6344434624349038, + "learning_rate": 9.761943595428407e-06, + "loss": 0.4737, + "step": 4333 + }, + { + "epoch": 0.13, + "grad_norm": 1.7217534137557653, + "learning_rate": 9.761800366933358e-06, + "loss": 0.4501, + "step": 4334 + }, + { + "epoch": 0.13, + "grad_norm": 1.5685680130776316, + "learning_rate": 9.76165709641537e-06, + "loss": 0.403, + "step": 4335 + }, + { + "epoch": 0.13, + "grad_norm": 1.5458188860547273, + "learning_rate": 9.761513783875706e-06, + "loss": 0.4121, + "step": 4336 + }, + { + "epoch": 0.13, + "grad_norm": 1.7220632708785273, + "learning_rate": 9.761370429315628e-06, + "loss": 0.4041, + "step": 4337 + }, + { + "epoch": 0.13, + "grad_norm": 1.577366998687488, + "learning_rate": 9.761227032736405e-06, + "loss": 0.4252, + "step": 4338 + }, + { + "epoch": 0.13, + "grad_norm": 1.593658559139213, + "learning_rate": 9.7610835941393e-06, + "loss": 0.4068, + "step": 4339 + }, + { + "epoch": 0.13, + "grad_norm": 1.8056745333569306, + "learning_rate": 9.760940113525579e-06, + "loss": 0.3938, + "step": 4340 + }, + { + "epoch": 0.13, + "grad_norm": 1.540382690446299, + "learning_rate": 9.760796590896508e-06, + "loss": 0.3863, + "step": 4341 + }, + { + "epoch": 0.13, + "grad_norm": 1.8047391279827414, + "learning_rate": 9.760653026253355e-06, + "loss": 0.4602, + "step": 4342 + }, + { + "epoch": 0.13, + "grad_norm": 2.1315521532319774, + "learning_rate": 9.760509419597386e-06, + "loss": 0.4508, + "step": 4343 + }, + { + "epoch": 0.13, + "grad_norm": 3.5074898698486976, + "learning_rate": 9.76036577092987e-06, + "loss": 0.3899, + "step": 4344 + }, + { + "epoch": 0.13, + "grad_norm": 2.1775346631184, + "learning_rate": 9.760222080252073e-06, + "loss": 0.4745, + "step": 4345 + }, + { + "epoch": 0.13, + "grad_norm": 2.0797267464267946, + "learning_rate": 9.760078347565262e-06, + "loss": 0.3871, + "step": 4346 + }, + { + "epoch": 0.13, + "grad_norm": 2.681295694249617, + "learning_rate": 9.759934572870706e-06, + "loss": 0.4115, + "step": 4347 + }, + { + "epoch": 0.13, + "grad_norm": 1.8043608456973101, + "learning_rate": 9.759790756169675e-06, + "loss": 0.4255, + "step": 4348 + }, + { + "epoch": 0.13, + "grad_norm": 1.5909448273196405, + "learning_rate": 9.759646897463439e-06, + "loss": 0.4425, + "step": 4349 + }, + { + "epoch": 0.13, + "grad_norm": 1.9592621673640824, + "learning_rate": 9.759502996753267e-06, + "loss": 0.466, + "step": 4350 + }, + { + "epoch": 0.13, + "grad_norm": 1.5564568317651533, + "learning_rate": 9.759359054040427e-06, + "loss": 0.4105, + "step": 4351 + }, + { + "epoch": 0.13, + "grad_norm": 1.6607457221518318, + "learning_rate": 9.759215069326192e-06, + "loss": 0.4071, + "step": 4352 + }, + { + "epoch": 0.13, + "grad_norm": 1.65749970397995, + "learning_rate": 9.759071042611829e-06, + "loss": 0.4325, + "step": 4353 + }, + { + "epoch": 0.13, + "grad_norm": 1.6971421907662458, + "learning_rate": 9.758926973898613e-06, + "loss": 0.3904, + "step": 4354 + }, + { + "epoch": 0.13, + "grad_norm": 1.553135422774051, + "learning_rate": 9.758782863187812e-06, + "loss": 0.4122, + "step": 4355 + }, + { + "epoch": 0.13, + "grad_norm": 1.5213234255796968, + "learning_rate": 9.758638710480701e-06, + "loss": 0.3856, + "step": 4356 + }, + { + "epoch": 0.13, + "grad_norm": 1.5367006734261812, + "learning_rate": 9.75849451577855e-06, + "loss": 0.3896, + "step": 4357 + }, + { + "epoch": 0.13, + "grad_norm": 1.5480371181760688, + "learning_rate": 9.75835027908263e-06, + "loss": 0.3743, + "step": 4358 + }, + { + "epoch": 0.13, + "grad_norm": 1.2672311192329078, + "learning_rate": 9.75820600039422e-06, + "loss": 0.6522, + "step": 4359 + }, + { + "epoch": 0.13, + "grad_norm": 1.797541468365491, + "learning_rate": 9.758061679714588e-06, + "loss": 0.421, + "step": 4360 + }, + { + "epoch": 0.13, + "grad_norm": 1.7861067688180563, + "learning_rate": 9.757917317045006e-06, + "loss": 0.4502, + "step": 4361 + }, + { + "epoch": 0.13, + "grad_norm": 1.0239593333222274, + "learning_rate": 9.757772912386755e-06, + "loss": 0.5819, + "step": 4362 + }, + { + "epoch": 0.13, + "grad_norm": 1.5473353721583623, + "learning_rate": 9.757628465741103e-06, + "loss": 0.4168, + "step": 4363 + }, + { + "epoch": 0.13, + "grad_norm": 1.8701797221261656, + "learning_rate": 9.757483977109328e-06, + "loss": 0.4851, + "step": 4364 + }, + { + "epoch": 0.13, + "grad_norm": 1.534085036023721, + "learning_rate": 9.757339446492704e-06, + "loss": 0.4021, + "step": 4365 + }, + { + "epoch": 0.13, + "grad_norm": 2.2687851451446472, + "learning_rate": 9.757194873892505e-06, + "loss": 0.4346, + "step": 4366 + }, + { + "epoch": 0.13, + "grad_norm": 1.7119399221316105, + "learning_rate": 9.757050259310011e-06, + "loss": 0.4447, + "step": 4367 + }, + { + "epoch": 0.13, + "grad_norm": 1.5046812831817933, + "learning_rate": 9.756905602746494e-06, + "loss": 0.4252, + "step": 4368 + }, + { + "epoch": 0.13, + "grad_norm": 1.568733550894657, + "learning_rate": 9.756760904203231e-06, + "loss": 0.4111, + "step": 4369 + }, + { + "epoch": 0.13, + "grad_norm": 1.7447434263841046, + "learning_rate": 9.756616163681503e-06, + "loss": 0.4405, + "step": 4370 + }, + { + "epoch": 0.13, + "grad_norm": 1.8078855303276524, + "learning_rate": 9.756471381182583e-06, + "loss": 0.3916, + "step": 4371 + }, + { + "epoch": 0.13, + "grad_norm": 1.5119568015278821, + "learning_rate": 9.75632655670775e-06, + "loss": 0.4016, + "step": 4372 + }, + { + "epoch": 0.13, + "grad_norm": 1.5341775998012346, + "learning_rate": 9.756181690258283e-06, + "loss": 0.3939, + "step": 4373 + }, + { + "epoch": 0.13, + "grad_norm": 1.528157148394079, + "learning_rate": 9.75603678183546e-06, + "loss": 0.3938, + "step": 4374 + }, + { + "epoch": 0.13, + "grad_norm": 1.8569649383338933, + "learning_rate": 9.755891831440561e-06, + "loss": 0.3974, + "step": 4375 + }, + { + "epoch": 0.13, + "grad_norm": 1.8445995817623742, + "learning_rate": 9.75574683907486e-06, + "loss": 0.4651, + "step": 4376 + }, + { + "epoch": 0.13, + "grad_norm": 1.9184400339711987, + "learning_rate": 9.755601804739643e-06, + "loss": 0.4635, + "step": 4377 + }, + { + "epoch": 0.13, + "grad_norm": 2.372070041689457, + "learning_rate": 9.755456728436188e-06, + "loss": 0.3956, + "step": 4378 + }, + { + "epoch": 0.13, + "grad_norm": 1.5828694747350456, + "learning_rate": 9.755311610165774e-06, + "loss": 0.4037, + "step": 4379 + }, + { + "epoch": 0.13, + "grad_norm": 1.5546755133752532, + "learning_rate": 9.755166449929681e-06, + "loss": 0.3886, + "step": 4380 + }, + { + "epoch": 0.13, + "grad_norm": 1.596838804810758, + "learning_rate": 9.755021247729193e-06, + "loss": 0.3884, + "step": 4381 + }, + { + "epoch": 0.13, + "grad_norm": 1.602560631811661, + "learning_rate": 9.75487600356559e-06, + "loss": 0.4153, + "step": 4382 + }, + { + "epoch": 0.13, + "grad_norm": 1.6578425889753055, + "learning_rate": 9.754730717440153e-06, + "loss": 0.4182, + "step": 4383 + }, + { + "epoch": 0.13, + "grad_norm": 1.6813672029661473, + "learning_rate": 9.754585389354163e-06, + "loss": 0.4161, + "step": 4384 + }, + { + "epoch": 0.13, + "grad_norm": 1.7512193983277997, + "learning_rate": 9.754440019308906e-06, + "loss": 0.4149, + "step": 4385 + }, + { + "epoch": 0.13, + "grad_norm": 1.6399450434214484, + "learning_rate": 9.754294607305662e-06, + "loss": 0.4488, + "step": 4386 + }, + { + "epoch": 0.13, + "grad_norm": 1.4815972291113428, + "learning_rate": 9.754149153345716e-06, + "loss": 0.4108, + "step": 4387 + }, + { + "epoch": 0.13, + "grad_norm": 1.6652172362714424, + "learning_rate": 9.754003657430351e-06, + "loss": 0.3792, + "step": 4388 + }, + { + "epoch": 0.13, + "grad_norm": 1.4533980013337944, + "learning_rate": 9.75385811956085e-06, + "loss": 0.4215, + "step": 4389 + }, + { + "epoch": 0.13, + "grad_norm": 1.6827384033934487, + "learning_rate": 9.7537125397385e-06, + "loss": 0.4401, + "step": 4390 + }, + { + "epoch": 0.13, + "grad_norm": 1.8155491155264927, + "learning_rate": 9.753566917964584e-06, + "loss": 0.4555, + "step": 4391 + }, + { + "epoch": 0.13, + "grad_norm": 1.5236618137585558, + "learning_rate": 9.753421254240384e-06, + "loss": 0.4043, + "step": 4392 + }, + { + "epoch": 0.13, + "grad_norm": 1.7004862782733798, + "learning_rate": 9.753275548567192e-06, + "loss": 0.435, + "step": 4393 + }, + { + "epoch": 0.13, + "grad_norm": 1.8301566604661217, + "learning_rate": 9.75312980094629e-06, + "loss": 0.4317, + "step": 4394 + }, + { + "epoch": 0.13, + "grad_norm": 1.7759197429154614, + "learning_rate": 9.752984011378965e-06, + "loss": 0.4033, + "step": 4395 + }, + { + "epoch": 0.13, + "grad_norm": 1.5356182485925756, + "learning_rate": 9.752838179866503e-06, + "loss": 0.4144, + "step": 4396 + }, + { + "epoch": 0.13, + "grad_norm": 1.5189926858053773, + "learning_rate": 9.752692306410191e-06, + "loss": 0.3919, + "step": 4397 + }, + { + "epoch": 0.13, + "grad_norm": 1.5845391835002742, + "learning_rate": 9.752546391011318e-06, + "loss": 0.4336, + "step": 4398 + }, + { + "epoch": 0.13, + "grad_norm": 1.6409665110190068, + "learning_rate": 9.752400433671169e-06, + "loss": 0.3789, + "step": 4399 + }, + { + "epoch": 0.13, + "grad_norm": 1.0991786595932351, + "learning_rate": 9.752254434391032e-06, + "loss": 0.6369, + "step": 4400 + }, + { + "epoch": 0.13, + "grad_norm": 3.855744528637259, + "learning_rate": 9.752108393172199e-06, + "loss": 0.4174, + "step": 4401 + }, + { + "epoch": 0.13, + "grad_norm": 1.687611965839017, + "learning_rate": 9.751962310015956e-06, + "loss": 0.4074, + "step": 4402 + }, + { + "epoch": 0.13, + "grad_norm": 1.6569889868204062, + "learning_rate": 9.751816184923595e-06, + "loss": 0.4437, + "step": 4403 + }, + { + "epoch": 0.13, + "grad_norm": 1.5769253983224354, + "learning_rate": 9.751670017896402e-06, + "loss": 0.3808, + "step": 4404 + }, + { + "epoch": 0.13, + "grad_norm": 1.6498874017467207, + "learning_rate": 9.751523808935668e-06, + "loss": 0.4253, + "step": 4405 + }, + { + "epoch": 0.13, + "grad_norm": 2.241903913756371, + "learning_rate": 9.751377558042683e-06, + "loss": 0.4194, + "step": 4406 + }, + { + "epoch": 0.13, + "grad_norm": 2.372532784679725, + "learning_rate": 9.751231265218739e-06, + "loss": 0.4831, + "step": 4407 + }, + { + "epoch": 0.13, + "grad_norm": 2.7000506914824784, + "learning_rate": 9.751084930465127e-06, + "loss": 0.4105, + "step": 4408 + }, + { + "epoch": 0.13, + "grad_norm": 1.8562742852652971, + "learning_rate": 9.750938553783137e-06, + "loss": 0.4241, + "step": 4409 + }, + { + "epoch": 0.13, + "grad_norm": 1.7340143482766266, + "learning_rate": 9.750792135174063e-06, + "loss": 0.389, + "step": 4410 + }, + { + "epoch": 0.13, + "grad_norm": 1.5188361338562977, + "learning_rate": 9.750645674639194e-06, + "loss": 0.3955, + "step": 4411 + }, + { + "epoch": 0.13, + "grad_norm": 1.5928382741714713, + "learning_rate": 9.750499172179824e-06, + "loss": 0.4236, + "step": 4412 + }, + { + "epoch": 0.13, + "grad_norm": 1.4331851397099802, + "learning_rate": 9.750352627797247e-06, + "loss": 0.4075, + "step": 4413 + }, + { + "epoch": 0.13, + "grad_norm": 1.6482285707118558, + "learning_rate": 9.750206041492756e-06, + "loss": 0.4239, + "step": 4414 + }, + { + "epoch": 0.13, + "grad_norm": 1.5807259593985195, + "learning_rate": 9.750059413267642e-06, + "loss": 0.4258, + "step": 4415 + }, + { + "epoch": 0.13, + "grad_norm": 1.8309599718248897, + "learning_rate": 9.749912743123202e-06, + "loss": 0.3667, + "step": 4416 + }, + { + "epoch": 0.13, + "grad_norm": 1.5361265155387744, + "learning_rate": 9.74976603106073e-06, + "loss": 0.4029, + "step": 4417 + }, + { + "epoch": 0.13, + "grad_norm": 1.7432301054972432, + "learning_rate": 9.749619277081519e-06, + "loss": 0.4009, + "step": 4418 + }, + { + "epoch": 0.13, + "grad_norm": 1.9634656768894676, + "learning_rate": 9.749472481186866e-06, + "loss": 0.4102, + "step": 4419 + }, + { + "epoch": 0.13, + "grad_norm": 1.894063926652665, + "learning_rate": 9.749325643378066e-06, + "loss": 0.3998, + "step": 4420 + }, + { + "epoch": 0.13, + "grad_norm": 2.3058161974382085, + "learning_rate": 9.749178763656414e-06, + "loss": 0.3925, + "step": 4421 + }, + { + "epoch": 0.13, + "grad_norm": 1.8708263223132438, + "learning_rate": 9.749031842023207e-06, + "loss": 0.4194, + "step": 4422 + }, + { + "epoch": 0.13, + "grad_norm": 1.6936681632901849, + "learning_rate": 9.74888487847974e-06, + "loss": 0.4188, + "step": 4423 + }, + { + "epoch": 0.13, + "grad_norm": 1.0955702445677644, + "learning_rate": 9.748737873027313e-06, + "loss": 0.6105, + "step": 4424 + }, + { + "epoch": 0.13, + "grad_norm": 1.6287926639264783, + "learning_rate": 9.748590825667222e-06, + "loss": 0.4402, + "step": 4425 + }, + { + "epoch": 0.13, + "grad_norm": 1.6711018940916007, + "learning_rate": 9.748443736400762e-06, + "loss": 0.3947, + "step": 4426 + }, + { + "epoch": 0.13, + "grad_norm": 1.515966937861845, + "learning_rate": 9.748296605229236e-06, + "loss": 0.4086, + "step": 4427 + }, + { + "epoch": 0.13, + "grad_norm": 1.6986068601283493, + "learning_rate": 9.748149432153938e-06, + "loss": 0.4025, + "step": 4428 + }, + { + "epoch": 0.13, + "grad_norm": 1.6803700217592379, + "learning_rate": 9.748002217176167e-06, + "loss": 0.3887, + "step": 4429 + }, + { + "epoch": 0.13, + "grad_norm": 1.4350212095920083, + "learning_rate": 9.747854960297227e-06, + "loss": 0.4483, + "step": 4430 + }, + { + "epoch": 0.13, + "grad_norm": 1.4172257688282404, + "learning_rate": 9.747707661518413e-06, + "loss": 0.3948, + "step": 4431 + }, + { + "epoch": 0.13, + "grad_norm": 1.5056571131934513, + "learning_rate": 9.747560320841025e-06, + "loss": 0.4308, + "step": 4432 + }, + { + "epoch": 0.13, + "grad_norm": 1.667014268595427, + "learning_rate": 9.747412938266367e-06, + "loss": 0.4168, + "step": 4433 + }, + { + "epoch": 0.13, + "grad_norm": 1.839424721821603, + "learning_rate": 9.747265513795735e-06, + "loss": 0.4329, + "step": 4434 + }, + { + "epoch": 0.13, + "grad_norm": 1.761549538166481, + "learning_rate": 9.747118047430433e-06, + "loss": 0.3867, + "step": 4435 + }, + { + "epoch": 0.13, + "grad_norm": 2.2598050962222267, + "learning_rate": 9.746970539171761e-06, + "loss": 0.4383, + "step": 4436 + }, + { + "epoch": 0.13, + "grad_norm": 1.7508506357661702, + "learning_rate": 9.746822989021022e-06, + "loss": 0.4146, + "step": 4437 + }, + { + "epoch": 0.13, + "grad_norm": 1.6422954725419079, + "learning_rate": 9.746675396979519e-06, + "loss": 0.4024, + "step": 4438 + }, + { + "epoch": 0.13, + "grad_norm": 2.2545261370665326, + "learning_rate": 9.74652776304855e-06, + "loss": 0.419, + "step": 4439 + }, + { + "epoch": 0.13, + "grad_norm": 1.6492451338266818, + "learning_rate": 9.746380087229424e-06, + "loss": 0.4241, + "step": 4440 + }, + { + "epoch": 0.13, + "grad_norm": 2.1606371768456065, + "learning_rate": 9.746232369523438e-06, + "loss": 0.4495, + "step": 4441 + }, + { + "epoch": 0.13, + "grad_norm": 1.5976198471462482, + "learning_rate": 9.7460846099319e-06, + "loss": 0.4256, + "step": 4442 + }, + { + "epoch": 0.13, + "grad_norm": 1.8700501625928558, + "learning_rate": 9.745936808456114e-06, + "loss": 0.4084, + "step": 4443 + }, + { + "epoch": 0.13, + "grad_norm": 1.5957666317931367, + "learning_rate": 9.74578896509738e-06, + "loss": 0.4066, + "step": 4444 + }, + { + "epoch": 0.13, + "grad_norm": 2.920253476953895, + "learning_rate": 9.745641079857008e-06, + "loss": 0.3983, + "step": 4445 + }, + { + "epoch": 0.13, + "grad_norm": 1.5243319165877838, + "learning_rate": 9.745493152736301e-06, + "loss": 0.3839, + "step": 4446 + }, + { + "epoch": 0.13, + "grad_norm": 1.6811743502659038, + "learning_rate": 9.745345183736562e-06, + "loss": 0.4573, + "step": 4447 + }, + { + "epoch": 0.13, + "grad_norm": 1.5455432943935101, + "learning_rate": 9.7451971728591e-06, + "loss": 0.4405, + "step": 4448 + }, + { + "epoch": 0.13, + "grad_norm": 1.6687447033255696, + "learning_rate": 9.745049120105221e-06, + "loss": 0.4073, + "step": 4449 + }, + { + "epoch": 0.13, + "grad_norm": 1.624940648010449, + "learning_rate": 9.744901025476232e-06, + "loss": 0.4065, + "step": 4450 + }, + { + "epoch": 0.13, + "grad_norm": 1.5868293427276634, + "learning_rate": 9.744752888973437e-06, + "loss": 0.3994, + "step": 4451 + }, + { + "epoch": 0.13, + "grad_norm": 1.6213788333917412, + "learning_rate": 9.744604710598146e-06, + "loss": 0.415, + "step": 4452 + }, + { + "epoch": 0.13, + "grad_norm": 1.9955756449950046, + "learning_rate": 9.744456490351665e-06, + "loss": 0.4171, + "step": 4453 + }, + { + "epoch": 0.13, + "grad_norm": 1.9989473247367744, + "learning_rate": 9.744308228235304e-06, + "loss": 0.3872, + "step": 4454 + }, + { + "epoch": 0.13, + "grad_norm": 1.5848871337414845, + "learning_rate": 9.74415992425037e-06, + "loss": 0.395, + "step": 4455 + }, + { + "epoch": 0.13, + "grad_norm": 1.6237065152871235, + "learning_rate": 9.744011578398171e-06, + "loss": 0.4412, + "step": 4456 + }, + { + "epoch": 0.13, + "grad_norm": 1.6167687670774082, + "learning_rate": 9.743863190680019e-06, + "loss": 0.3986, + "step": 4457 + }, + { + "epoch": 0.13, + "grad_norm": 1.5966414663807498, + "learning_rate": 9.743714761097219e-06, + "loss": 0.4034, + "step": 4458 + }, + { + "epoch": 0.13, + "grad_norm": 2.918090972678188, + "learning_rate": 9.743566289651085e-06, + "loss": 0.4467, + "step": 4459 + }, + { + "epoch": 0.13, + "grad_norm": 1.6317338847471274, + "learning_rate": 9.743417776342927e-06, + "loss": 0.4512, + "step": 4460 + }, + { + "epoch": 0.13, + "grad_norm": 1.9899321580686626, + "learning_rate": 9.743269221174054e-06, + "loss": 0.4435, + "step": 4461 + }, + { + "epoch": 0.13, + "grad_norm": 1.6250252101555445, + "learning_rate": 9.743120624145776e-06, + "loss": 0.4077, + "step": 4462 + }, + { + "epoch": 0.13, + "grad_norm": 1.5123656713683902, + "learning_rate": 9.742971985259406e-06, + "loss": 0.3918, + "step": 4463 + }, + { + "epoch": 0.13, + "grad_norm": 1.2012305069746247, + "learning_rate": 9.742823304516258e-06, + "loss": 0.6554, + "step": 4464 + }, + { + "epoch": 0.13, + "grad_norm": 1.6107862656147325, + "learning_rate": 9.74267458191764e-06, + "loss": 0.4446, + "step": 4465 + }, + { + "epoch": 0.13, + "grad_norm": 1.6118588390608062, + "learning_rate": 9.742525817464866e-06, + "loss": 0.4124, + "step": 4466 + }, + { + "epoch": 0.13, + "grad_norm": 2.747186498039594, + "learning_rate": 9.74237701115925e-06, + "loss": 0.4137, + "step": 4467 + }, + { + "epoch": 0.13, + "grad_norm": 2.5953317528358717, + "learning_rate": 9.742228163002103e-06, + "loss": 0.4137, + "step": 4468 + }, + { + "epoch": 0.13, + "grad_norm": 1.5914919604678424, + "learning_rate": 9.742079272994742e-06, + "loss": 0.4123, + "step": 4469 + }, + { + "epoch": 0.13, + "grad_norm": 1.9176108294293726, + "learning_rate": 9.741930341138477e-06, + "loss": 0.3888, + "step": 4470 + }, + { + "epoch": 0.13, + "grad_norm": 1.7044083994733898, + "learning_rate": 9.741781367434624e-06, + "loss": 0.4502, + "step": 4471 + }, + { + "epoch": 0.13, + "grad_norm": 1.6805393247852924, + "learning_rate": 9.741632351884497e-06, + "loss": 0.4611, + "step": 4472 + }, + { + "epoch": 0.13, + "grad_norm": 2.118212606166327, + "learning_rate": 9.741483294489415e-06, + "loss": 0.3884, + "step": 4473 + }, + { + "epoch": 0.13, + "grad_norm": 1.3810760080661237, + "learning_rate": 9.741334195250689e-06, + "loss": 0.3777, + "step": 4474 + }, + { + "epoch": 0.13, + "grad_norm": 1.6948180117133975, + "learning_rate": 9.741185054169636e-06, + "loss": 0.4273, + "step": 4475 + }, + { + "epoch": 0.13, + "grad_norm": 1.0728386675864587, + "learning_rate": 9.74103587124757e-06, + "loss": 0.626, + "step": 4476 + }, + { + "epoch": 0.13, + "grad_norm": 1.7987097283688251, + "learning_rate": 9.740886646485813e-06, + "loss": 0.4041, + "step": 4477 + }, + { + "epoch": 0.13, + "grad_norm": 1.494894180318438, + "learning_rate": 9.740737379885677e-06, + "loss": 0.3989, + "step": 4478 + }, + { + "epoch": 0.13, + "grad_norm": 1.7811477349382254, + "learning_rate": 9.740588071448483e-06, + "loss": 0.4246, + "step": 4479 + }, + { + "epoch": 0.13, + "grad_norm": 1.9888059830339888, + "learning_rate": 9.740438721175545e-06, + "loss": 0.3879, + "step": 4480 + }, + { + "epoch": 0.13, + "grad_norm": 1.543840594794082, + "learning_rate": 9.740289329068182e-06, + "loss": 0.3828, + "step": 4481 + }, + { + "epoch": 0.13, + "grad_norm": 1.892107366111554, + "learning_rate": 9.740139895127715e-06, + "loss": 0.3823, + "step": 4482 + }, + { + "epoch": 0.13, + "grad_norm": 1.5797594606635668, + "learning_rate": 9.73999041935546e-06, + "loss": 0.3946, + "step": 4483 + }, + { + "epoch": 0.13, + "grad_norm": 1.604655813743733, + "learning_rate": 9.739840901752737e-06, + "loss": 0.4237, + "step": 4484 + }, + { + "epoch": 0.13, + "grad_norm": 1.7153874539138803, + "learning_rate": 9.739691342320866e-06, + "loss": 0.3725, + "step": 4485 + }, + { + "epoch": 0.13, + "grad_norm": 2.5298077213492958, + "learning_rate": 9.739541741061164e-06, + "loss": 0.4202, + "step": 4486 + }, + { + "epoch": 0.13, + "grad_norm": 1.5668937497744475, + "learning_rate": 9.739392097974956e-06, + "loss": 0.4107, + "step": 4487 + }, + { + "epoch": 0.13, + "grad_norm": 1.502642152669806, + "learning_rate": 9.739242413063558e-06, + "loss": 0.4378, + "step": 4488 + }, + { + "epoch": 0.13, + "grad_norm": 1.5069606303368248, + "learning_rate": 9.739092686328296e-06, + "loss": 0.4209, + "step": 4489 + }, + { + "epoch": 0.13, + "grad_norm": 1.7470445706045112, + "learning_rate": 9.738942917770487e-06, + "loss": 0.3552, + "step": 4490 + }, + { + "epoch": 0.13, + "grad_norm": 2.3418928677789586, + "learning_rate": 9.738793107391455e-06, + "loss": 0.4565, + "step": 4491 + }, + { + "epoch": 0.13, + "grad_norm": 1.5254245560826696, + "learning_rate": 9.738643255192519e-06, + "loss": 0.4298, + "step": 4492 + }, + { + "epoch": 0.13, + "grad_norm": 1.0076672427188424, + "learning_rate": 9.738493361175005e-06, + "loss": 0.6304, + "step": 4493 + }, + { + "epoch": 0.13, + "grad_norm": 1.6171773199173731, + "learning_rate": 9.738343425340236e-06, + "loss": 0.4066, + "step": 4494 + }, + { + "epoch": 0.13, + "grad_norm": 1.6256064296198434, + "learning_rate": 9.738193447689532e-06, + "loss": 0.4165, + "step": 4495 + }, + { + "epoch": 0.13, + "grad_norm": 1.5496972177877286, + "learning_rate": 9.738043428224218e-06, + "loss": 0.4112, + "step": 4496 + }, + { + "epoch": 0.13, + "grad_norm": 1.527889166981219, + "learning_rate": 9.73789336694562e-06, + "loss": 0.4156, + "step": 4497 + }, + { + "epoch": 0.13, + "grad_norm": 1.6567224348257497, + "learning_rate": 9.73774326385506e-06, + "loss": 0.4584, + "step": 4498 + }, + { + "epoch": 0.13, + "grad_norm": 1.5634516588422205, + "learning_rate": 9.73759311895386e-06, + "loss": 0.392, + "step": 4499 + }, + { + "epoch": 0.13, + "grad_norm": 1.4655159180213808, + "learning_rate": 9.737442932243353e-06, + "loss": 0.4141, + "step": 4500 + }, + { + "epoch": 0.13, + "grad_norm": 1.6384173410014393, + "learning_rate": 9.737292703724856e-06, + "loss": 0.3944, + "step": 4501 + }, + { + "epoch": 0.13, + "grad_norm": 1.6118077215498972, + "learning_rate": 9.7371424333997e-06, + "loss": 0.4317, + "step": 4502 + }, + { + "epoch": 0.13, + "grad_norm": 2.2031316323873247, + "learning_rate": 9.736992121269209e-06, + "loss": 0.4109, + "step": 4503 + }, + { + "epoch": 0.13, + "grad_norm": 1.6649005620535873, + "learning_rate": 9.736841767334711e-06, + "loss": 0.3886, + "step": 4504 + }, + { + "epoch": 0.13, + "grad_norm": 1.797074860749737, + "learning_rate": 9.73669137159753e-06, + "loss": 0.4139, + "step": 4505 + }, + { + "epoch": 0.13, + "grad_norm": 1.5102392571049559, + "learning_rate": 9.736540934058996e-06, + "loss": 0.4078, + "step": 4506 + }, + { + "epoch": 0.13, + "grad_norm": 1.5791559716616523, + "learning_rate": 9.736390454720435e-06, + "loss": 0.3923, + "step": 4507 + }, + { + "epoch": 0.13, + "grad_norm": 1.4859657163134181, + "learning_rate": 9.736239933583177e-06, + "loss": 0.3975, + "step": 4508 + }, + { + "epoch": 0.13, + "grad_norm": 1.4420395531454053, + "learning_rate": 9.736089370648548e-06, + "loss": 0.3659, + "step": 4509 + }, + { + "epoch": 0.13, + "grad_norm": 1.7483101833470025, + "learning_rate": 9.735938765917879e-06, + "loss": 0.4223, + "step": 4510 + }, + { + "epoch": 0.13, + "grad_norm": 1.7922971752927002, + "learning_rate": 9.735788119392497e-06, + "loss": 0.4051, + "step": 4511 + }, + { + "epoch": 0.13, + "grad_norm": 2.1091512255546077, + "learning_rate": 9.735637431073732e-06, + "loss": 0.4207, + "step": 4512 + }, + { + "epoch": 0.13, + "grad_norm": 1.8160316083316637, + "learning_rate": 9.735486700962915e-06, + "loss": 0.4066, + "step": 4513 + }, + { + "epoch": 0.13, + "grad_norm": 2.075780200483053, + "learning_rate": 9.735335929061373e-06, + "loss": 0.4099, + "step": 4514 + }, + { + "epoch": 0.13, + "grad_norm": 1.5287839309055251, + "learning_rate": 9.735185115370443e-06, + "loss": 0.394, + "step": 4515 + }, + { + "epoch": 0.13, + "grad_norm": 1.6881674020068695, + "learning_rate": 9.735034259891449e-06, + "loss": 0.4196, + "step": 4516 + }, + { + "epoch": 0.13, + "grad_norm": 1.6156486245967574, + "learning_rate": 9.734883362625724e-06, + "loss": 0.4021, + "step": 4517 + }, + { + "epoch": 0.13, + "grad_norm": 0.9768161575680866, + "learning_rate": 9.734732423574603e-06, + "loss": 0.5343, + "step": 4518 + }, + { + "epoch": 0.13, + "grad_norm": 1.667165529998519, + "learning_rate": 9.734581442739415e-06, + "loss": 0.404, + "step": 4519 + }, + { + "epoch": 0.13, + "grad_norm": 1.6489223513336226, + "learning_rate": 9.734430420121494e-06, + "loss": 0.4043, + "step": 4520 + }, + { + "epoch": 0.13, + "grad_norm": 2.017346075864371, + "learning_rate": 9.734279355722171e-06, + "loss": 0.4591, + "step": 4521 + }, + { + "epoch": 0.13, + "grad_norm": 1.677607776208376, + "learning_rate": 9.734128249542781e-06, + "loss": 0.4109, + "step": 4522 + }, + { + "epoch": 0.13, + "grad_norm": 1.5385756414951994, + "learning_rate": 9.733977101584657e-06, + "loss": 0.4191, + "step": 4523 + }, + { + "epoch": 0.13, + "grad_norm": 1.6066837806392722, + "learning_rate": 9.733825911849131e-06, + "loss": 0.4015, + "step": 4524 + }, + { + "epoch": 0.13, + "grad_norm": 1.5177054316440912, + "learning_rate": 9.733674680337541e-06, + "loss": 0.3985, + "step": 4525 + }, + { + "epoch": 0.13, + "grad_norm": 2.040731062262422, + "learning_rate": 9.733523407051217e-06, + "loss": 0.409, + "step": 4526 + }, + { + "epoch": 0.13, + "grad_norm": 4.566593794277507, + "learning_rate": 9.733372091991499e-06, + "loss": 0.4297, + "step": 4527 + }, + { + "epoch": 0.13, + "grad_norm": 1.5246606204437034, + "learning_rate": 9.733220735159719e-06, + "loss": 0.3923, + "step": 4528 + }, + { + "epoch": 0.13, + "grad_norm": 1.971165084004477, + "learning_rate": 9.733069336557212e-06, + "loss": 0.3877, + "step": 4529 + }, + { + "epoch": 0.13, + "grad_norm": 1.6500049318215857, + "learning_rate": 9.732917896185317e-06, + "loss": 0.3881, + "step": 4530 + }, + { + "epoch": 0.13, + "grad_norm": 1.4784180751526073, + "learning_rate": 9.732766414045368e-06, + "loss": 0.3909, + "step": 4531 + }, + { + "epoch": 0.13, + "grad_norm": 1.572444266517001, + "learning_rate": 9.732614890138704e-06, + "loss": 0.4269, + "step": 4532 + }, + { + "epoch": 0.13, + "grad_norm": 1.7364867037411595, + "learning_rate": 9.73246332446666e-06, + "loss": 0.4244, + "step": 4533 + }, + { + "epoch": 0.13, + "grad_norm": 1.6628378528046357, + "learning_rate": 9.732311717030576e-06, + "loss": 0.4324, + "step": 4534 + }, + { + "epoch": 0.13, + "grad_norm": 1.5479849443722842, + "learning_rate": 9.732160067831786e-06, + "loss": 0.4257, + "step": 4535 + }, + { + "epoch": 0.13, + "grad_norm": 1.7183122110796667, + "learning_rate": 9.732008376871633e-06, + "loss": 0.4413, + "step": 4536 + }, + { + "epoch": 0.13, + "grad_norm": 1.539713716600118, + "learning_rate": 9.731856644151454e-06, + "loss": 0.3934, + "step": 4537 + }, + { + "epoch": 0.13, + "grad_norm": 1.7843863286498227, + "learning_rate": 9.731704869672587e-06, + "loss": 0.4304, + "step": 4538 + }, + { + "epoch": 0.13, + "grad_norm": 1.521067789011093, + "learning_rate": 9.731553053436373e-06, + "loss": 0.4195, + "step": 4539 + }, + { + "epoch": 0.13, + "grad_norm": 1.768146209383897, + "learning_rate": 9.73140119544415e-06, + "loss": 0.4285, + "step": 4540 + }, + { + "epoch": 0.13, + "grad_norm": 1.5308530561852804, + "learning_rate": 9.731249295697259e-06, + "loss": 0.4539, + "step": 4541 + }, + { + "epoch": 0.13, + "grad_norm": 1.7297015608436686, + "learning_rate": 9.73109735419704e-06, + "loss": 0.4295, + "step": 4542 + }, + { + "epoch": 0.13, + "grad_norm": 1.4047156754859569, + "learning_rate": 9.730945370944833e-06, + "loss": 0.418, + "step": 4543 + }, + { + "epoch": 0.13, + "grad_norm": 1.5414139947755627, + "learning_rate": 9.730793345941982e-06, + "loss": 0.3756, + "step": 4544 + }, + { + "epoch": 0.13, + "grad_norm": 1.578812046757625, + "learning_rate": 9.730641279189827e-06, + "loss": 0.3952, + "step": 4545 + }, + { + "epoch": 0.13, + "grad_norm": 1.4480799668890096, + "learning_rate": 9.730489170689712e-06, + "loss": 0.412, + "step": 4546 + }, + { + "epoch": 0.13, + "grad_norm": 1.668179072834836, + "learning_rate": 9.730337020442976e-06, + "loss": 0.4371, + "step": 4547 + }, + { + "epoch": 0.13, + "grad_norm": 1.5708983031224053, + "learning_rate": 9.730184828450964e-06, + "loss": 0.4036, + "step": 4548 + }, + { + "epoch": 0.13, + "grad_norm": 1.6322447148253172, + "learning_rate": 9.730032594715018e-06, + "loss": 0.4215, + "step": 4549 + }, + { + "epoch": 0.13, + "grad_norm": 1.646054282388587, + "learning_rate": 9.729880319236482e-06, + "loss": 0.4049, + "step": 4550 + }, + { + "epoch": 0.13, + "grad_norm": 1.8617388877774466, + "learning_rate": 9.729728002016698e-06, + "loss": 0.411, + "step": 4551 + }, + { + "epoch": 0.13, + "grad_norm": 1.577058394175779, + "learning_rate": 9.729575643057014e-06, + "loss": 0.3976, + "step": 4552 + }, + { + "epoch": 0.13, + "grad_norm": 1.6993833037813089, + "learning_rate": 9.729423242358772e-06, + "loss": 0.419, + "step": 4553 + }, + { + "epoch": 0.13, + "grad_norm": 1.947020955000427, + "learning_rate": 9.729270799923319e-06, + "loss": 0.404, + "step": 4554 + }, + { + "epoch": 0.13, + "grad_norm": 2.043983700254477, + "learning_rate": 9.729118315751996e-06, + "loss": 0.4413, + "step": 4555 + }, + { + "epoch": 0.13, + "grad_norm": 1.6171839212069623, + "learning_rate": 9.728965789846152e-06, + "loss": 0.4541, + "step": 4556 + }, + { + "epoch": 0.13, + "grad_norm": 1.645419715489864, + "learning_rate": 9.728813222207133e-06, + "loss": 0.3984, + "step": 4557 + }, + { + "epoch": 0.13, + "grad_norm": 1.7995018903475672, + "learning_rate": 9.728660612836285e-06, + "loss": 0.409, + "step": 4558 + }, + { + "epoch": 0.13, + "grad_norm": 3.590712049895447, + "learning_rate": 9.728507961734955e-06, + "loss": 0.3867, + "step": 4559 + }, + { + "epoch": 0.13, + "grad_norm": 1.5722218612115528, + "learning_rate": 9.728355268904489e-06, + "loss": 0.3954, + "step": 4560 + }, + { + "epoch": 0.13, + "grad_norm": 1.0803539449369004, + "learning_rate": 9.728202534346235e-06, + "loss": 0.6112, + "step": 4561 + }, + { + "epoch": 0.13, + "grad_norm": 1.4413214173381588, + "learning_rate": 9.728049758061541e-06, + "loss": 0.3891, + "step": 4562 + }, + { + "epoch": 0.13, + "grad_norm": 1.490712980453709, + "learning_rate": 9.727896940051757e-06, + "loss": 0.4217, + "step": 4563 + }, + { + "epoch": 0.13, + "grad_norm": 1.733733508722957, + "learning_rate": 9.72774408031823e-06, + "loss": 0.3774, + "step": 4564 + }, + { + "epoch": 0.13, + "grad_norm": 2.3293980531391973, + "learning_rate": 9.727591178862308e-06, + "loss": 0.3974, + "step": 4565 + }, + { + "epoch": 0.13, + "grad_norm": 2.0663165989698156, + "learning_rate": 9.727438235685342e-06, + "loss": 0.4191, + "step": 4566 + }, + { + "epoch": 0.13, + "grad_norm": 1.4311663381211817, + "learning_rate": 9.727285250788681e-06, + "loss": 0.4005, + "step": 4567 + }, + { + "epoch": 0.13, + "grad_norm": 1.8587609423536067, + "learning_rate": 9.727132224173674e-06, + "loss": 0.4618, + "step": 4568 + }, + { + "epoch": 0.13, + "grad_norm": 1.6212080556388198, + "learning_rate": 9.726979155841675e-06, + "loss": 0.4104, + "step": 4569 + }, + { + "epoch": 0.13, + "grad_norm": 2.2103632318853763, + "learning_rate": 9.72682604579403e-06, + "loss": 0.4316, + "step": 4570 + }, + { + "epoch": 0.13, + "grad_norm": 1.717066342424692, + "learning_rate": 9.726672894032095e-06, + "loss": 0.4284, + "step": 4571 + }, + { + "epoch": 0.13, + "grad_norm": 1.5699385530305174, + "learning_rate": 9.726519700557218e-06, + "loss": 0.4102, + "step": 4572 + }, + { + "epoch": 0.13, + "grad_norm": 5.7407500670668465, + "learning_rate": 9.726366465370753e-06, + "loss": 0.3771, + "step": 4573 + }, + { + "epoch": 0.13, + "grad_norm": 3.839596796785492, + "learning_rate": 9.726213188474049e-06, + "loss": 0.4344, + "step": 4574 + }, + { + "epoch": 0.13, + "grad_norm": 1.4984068081022075, + "learning_rate": 9.726059869868465e-06, + "loss": 0.3944, + "step": 4575 + }, + { + "epoch": 0.13, + "grad_norm": 1.8081804083077802, + "learning_rate": 9.725906509555347e-06, + "loss": 0.4199, + "step": 4576 + }, + { + "epoch": 0.13, + "grad_norm": 1.6704015547810387, + "learning_rate": 9.725753107536053e-06, + "loss": 0.4424, + "step": 4577 + }, + { + "epoch": 0.13, + "grad_norm": 1.719681015975043, + "learning_rate": 9.725599663811936e-06, + "loss": 0.4067, + "step": 4578 + }, + { + "epoch": 0.13, + "grad_norm": 1.5992140810060222, + "learning_rate": 9.725446178384349e-06, + "loss": 0.4122, + "step": 4579 + }, + { + "epoch": 0.13, + "grad_norm": 1.5646815901096855, + "learning_rate": 9.725292651254647e-06, + "loss": 0.429, + "step": 4580 + }, + { + "epoch": 0.13, + "grad_norm": 1.4364982043945163, + "learning_rate": 9.725139082424184e-06, + "loss": 0.4005, + "step": 4581 + }, + { + "epoch": 0.13, + "grad_norm": 1.879658767064717, + "learning_rate": 9.724985471894316e-06, + "loss": 0.4266, + "step": 4582 + }, + { + "epoch": 0.13, + "grad_norm": 1.8852016372669254, + "learning_rate": 9.724831819666402e-06, + "loss": 0.4282, + "step": 4583 + }, + { + "epoch": 0.13, + "grad_norm": 1.5393022071691915, + "learning_rate": 9.724678125741792e-06, + "loss": 0.3939, + "step": 4584 + }, + { + "epoch": 0.13, + "grad_norm": 1.6894951296860092, + "learning_rate": 9.724524390121846e-06, + "loss": 0.4155, + "step": 4585 + }, + { + "epoch": 0.13, + "grad_norm": 1.6081624678267306, + "learning_rate": 9.72437061280792e-06, + "loss": 0.3921, + "step": 4586 + }, + { + "epoch": 0.13, + "grad_norm": 1.066637215490094, + "learning_rate": 9.724216793801371e-06, + "loss": 0.6226, + "step": 4587 + }, + { + "epoch": 0.13, + "grad_norm": 2.0526450548932593, + "learning_rate": 9.724062933103555e-06, + "loss": 0.4314, + "step": 4588 + }, + { + "epoch": 0.13, + "grad_norm": 1.6939401776789687, + "learning_rate": 9.723909030715832e-06, + "loss": 0.4121, + "step": 4589 + }, + { + "epoch": 0.13, + "grad_norm": 2.197120677580238, + "learning_rate": 9.72375508663956e-06, + "loss": 0.4004, + "step": 4590 + }, + { + "epoch": 0.13, + "grad_norm": 1.8536068794886909, + "learning_rate": 9.723601100876097e-06, + "loss": 0.4592, + "step": 4591 + }, + { + "epoch": 0.13, + "grad_norm": 2.007438282260308, + "learning_rate": 9.7234470734268e-06, + "loss": 0.4063, + "step": 4592 + }, + { + "epoch": 0.13, + "grad_norm": 1.4611037811957728, + "learning_rate": 9.723293004293032e-06, + "loss": 0.4124, + "step": 4593 + }, + { + "epoch": 0.13, + "grad_norm": 2.022748423575175, + "learning_rate": 9.72313889347615e-06, + "loss": 0.4412, + "step": 4594 + }, + { + "epoch": 0.13, + "grad_norm": 2.334519378419276, + "learning_rate": 9.722984740977515e-06, + "loss": 0.4155, + "step": 4595 + }, + { + "epoch": 0.13, + "grad_norm": 1.7628853926664654, + "learning_rate": 9.722830546798489e-06, + "loss": 0.3977, + "step": 4596 + }, + { + "epoch": 0.13, + "grad_norm": 1.8041875045427713, + "learning_rate": 9.722676310940428e-06, + "loss": 0.3954, + "step": 4597 + }, + { + "epoch": 0.13, + "grad_norm": 1.640351700336822, + "learning_rate": 9.722522033404698e-06, + "loss": 0.457, + "step": 4598 + }, + { + "epoch": 0.13, + "grad_norm": 1.5671226443233626, + "learning_rate": 9.72236771419266e-06, + "loss": 0.3908, + "step": 4599 + }, + { + "epoch": 0.13, + "grad_norm": 1.689816574933849, + "learning_rate": 9.722213353305672e-06, + "loss": 0.4314, + "step": 4600 + }, + { + "epoch": 0.13, + "grad_norm": 1.8193992925288698, + "learning_rate": 9.7220589507451e-06, + "loss": 0.3936, + "step": 4601 + }, + { + "epoch": 0.13, + "grad_norm": 1.9762801543629915, + "learning_rate": 9.721904506512306e-06, + "loss": 0.4153, + "step": 4602 + }, + { + "epoch": 0.13, + "grad_norm": 1.9160174239615237, + "learning_rate": 9.721750020608651e-06, + "loss": 0.3796, + "step": 4603 + }, + { + "epoch": 0.13, + "grad_norm": 1.7289790338717137, + "learning_rate": 9.7215954930355e-06, + "loss": 0.3898, + "step": 4604 + }, + { + "epoch": 0.13, + "grad_norm": 2.224926415283138, + "learning_rate": 9.721440923794216e-06, + "loss": 0.3933, + "step": 4605 + }, + { + "epoch": 0.13, + "grad_norm": 1.5974422338749066, + "learning_rate": 9.721286312886164e-06, + "loss": 0.4039, + "step": 4606 + }, + { + "epoch": 0.13, + "grad_norm": 3.2655648173316316, + "learning_rate": 9.721131660312709e-06, + "loss": 0.4522, + "step": 4607 + }, + { + "epoch": 0.13, + "grad_norm": 1.798865157046175, + "learning_rate": 9.720976966075213e-06, + "loss": 0.3991, + "step": 4608 + }, + { + "epoch": 0.13, + "grad_norm": 1.6858795111130571, + "learning_rate": 9.720822230175042e-06, + "loss": 0.404, + "step": 4609 + }, + { + "epoch": 0.13, + "grad_norm": 1.9770609142137692, + "learning_rate": 9.720667452613564e-06, + "loss": 0.4171, + "step": 4610 + }, + { + "epoch": 0.13, + "grad_norm": 1.8150138417828128, + "learning_rate": 9.720512633392145e-06, + "loss": 0.3849, + "step": 4611 + }, + { + "epoch": 0.13, + "grad_norm": 1.7930836924610545, + "learning_rate": 9.720357772512147e-06, + "loss": 0.4274, + "step": 4612 + }, + { + "epoch": 0.13, + "grad_norm": 1.8127256125990583, + "learning_rate": 9.72020286997494e-06, + "loss": 0.4444, + "step": 4613 + }, + { + "epoch": 0.13, + "grad_norm": 1.8437209324137274, + "learning_rate": 9.720047925781892e-06, + "loss": 0.4363, + "step": 4614 + }, + { + "epoch": 0.13, + "grad_norm": 2.6003318981973482, + "learning_rate": 9.719892939934365e-06, + "loss": 0.4488, + "step": 4615 + }, + { + "epoch": 0.13, + "grad_norm": 2.1272254120363687, + "learning_rate": 9.719737912433734e-06, + "loss": 0.4321, + "step": 4616 + }, + { + "epoch": 0.13, + "grad_norm": 2.551514844627666, + "learning_rate": 9.719582843281362e-06, + "loss": 0.4021, + "step": 4617 + }, + { + "epoch": 0.13, + "grad_norm": 1.7208513419775964, + "learning_rate": 9.719427732478618e-06, + "loss": 0.4177, + "step": 4618 + }, + { + "epoch": 0.13, + "grad_norm": 1.9036626371905816, + "learning_rate": 9.719272580026875e-06, + "loss": 0.3784, + "step": 4619 + }, + { + "epoch": 0.13, + "grad_norm": 1.7286861439010752, + "learning_rate": 9.719117385927497e-06, + "loss": 0.4027, + "step": 4620 + }, + { + "epoch": 0.13, + "grad_norm": 1.8401902237279382, + "learning_rate": 9.718962150181856e-06, + "loss": 0.4124, + "step": 4621 + }, + { + "epoch": 0.13, + "grad_norm": 1.7242549770536497, + "learning_rate": 9.718806872791321e-06, + "loss": 0.3868, + "step": 4622 + }, + { + "epoch": 0.13, + "grad_norm": 1.6146804640686805, + "learning_rate": 9.718651553757266e-06, + "loss": 0.3884, + "step": 4623 + }, + { + "epoch": 0.13, + "grad_norm": 1.7854351819750789, + "learning_rate": 9.718496193081055e-06, + "loss": 0.4007, + "step": 4624 + }, + { + "epoch": 0.13, + "grad_norm": 1.1791193550342431, + "learning_rate": 9.718340790764065e-06, + "loss": 0.621, + "step": 4625 + }, + { + "epoch": 0.13, + "grad_norm": 2.2619280521355263, + "learning_rate": 9.718185346807664e-06, + "loss": 0.4109, + "step": 4626 + }, + { + "epoch": 0.13, + "grad_norm": 1.8251178752745612, + "learning_rate": 9.718029861213226e-06, + "loss": 0.4095, + "step": 4627 + }, + { + "epoch": 0.13, + "grad_norm": 1.5565157907040428, + "learning_rate": 9.717874333982121e-06, + "loss": 0.3697, + "step": 4628 + }, + { + "epoch": 0.13, + "grad_norm": 2.057620338882761, + "learning_rate": 9.717718765115723e-06, + "loss": 0.4701, + "step": 4629 + }, + { + "epoch": 0.13, + "grad_norm": 3.525021883225925, + "learning_rate": 9.717563154615404e-06, + "loss": 0.4601, + "step": 4630 + }, + { + "epoch": 0.13, + "grad_norm": 1.5146155333253652, + "learning_rate": 9.717407502482538e-06, + "loss": 0.3901, + "step": 4631 + }, + { + "epoch": 0.13, + "grad_norm": 1.6368541829720102, + "learning_rate": 9.7172518087185e-06, + "loss": 0.4091, + "step": 4632 + }, + { + "epoch": 0.13, + "grad_norm": 2.0463412625134314, + "learning_rate": 9.71709607332466e-06, + "loss": 0.4217, + "step": 4633 + }, + { + "epoch": 0.13, + "grad_norm": 1.6208624740074684, + "learning_rate": 9.716940296302397e-06, + "loss": 0.3866, + "step": 4634 + }, + { + "epoch": 0.13, + "grad_norm": 1.586232877601452, + "learning_rate": 9.716784477653085e-06, + "loss": 0.3913, + "step": 4635 + }, + { + "epoch": 0.13, + "grad_norm": 1.910342967240493, + "learning_rate": 9.716628617378095e-06, + "loss": 0.4007, + "step": 4636 + }, + { + "epoch": 0.13, + "grad_norm": 1.559101752282601, + "learning_rate": 9.716472715478806e-06, + "loss": 0.4084, + "step": 4637 + }, + { + "epoch": 0.13, + "grad_norm": 1.742691078303129, + "learning_rate": 9.716316771956591e-06, + "loss": 0.3841, + "step": 4638 + }, + { + "epoch": 0.13, + "grad_norm": 1.8057040772114386, + "learning_rate": 9.71616078681283e-06, + "loss": 0.4096, + "step": 4639 + }, + { + "epoch": 0.13, + "grad_norm": 2.3018708520853277, + "learning_rate": 9.716004760048899e-06, + "loss": 0.5385, + "step": 4640 + }, + { + "epoch": 0.13, + "grad_norm": 1.6766355770013182, + "learning_rate": 9.71584869166617e-06, + "loss": 0.3937, + "step": 4641 + }, + { + "epoch": 0.13, + "grad_norm": 1.7427008351617417, + "learning_rate": 9.715692581666028e-06, + "loss": 0.4106, + "step": 4642 + }, + { + "epoch": 0.13, + "grad_norm": 1.8110424909551905, + "learning_rate": 9.715536430049843e-06, + "loss": 0.3961, + "step": 4643 + }, + { + "epoch": 0.13, + "grad_norm": 1.6039584204631614, + "learning_rate": 9.715380236818998e-06, + "loss": 0.3955, + "step": 4644 + }, + { + "epoch": 0.13, + "grad_norm": 1.6812570707767496, + "learning_rate": 9.715224001974873e-06, + "loss": 0.4451, + "step": 4645 + }, + { + "epoch": 0.13, + "grad_norm": 1.7382243332051706, + "learning_rate": 9.715067725518842e-06, + "loss": 0.4637, + "step": 4646 + }, + { + "epoch": 0.13, + "grad_norm": 1.5393120749396612, + "learning_rate": 9.714911407452285e-06, + "loss": 0.3775, + "step": 4647 + }, + { + "epoch": 0.13, + "grad_norm": 1.612207079199025, + "learning_rate": 9.714755047776582e-06, + "loss": 0.4298, + "step": 4648 + }, + { + "epoch": 0.13, + "grad_norm": 2.314699249626796, + "learning_rate": 9.714598646493114e-06, + "loss": 0.4495, + "step": 4649 + }, + { + "epoch": 0.13, + "grad_norm": 1.7645074805554732, + "learning_rate": 9.71444220360326e-06, + "loss": 0.4111, + "step": 4650 + }, + { + "epoch": 0.13, + "grad_norm": 1.5914024902377917, + "learning_rate": 9.714285719108404e-06, + "loss": 0.4093, + "step": 4651 + }, + { + "epoch": 0.13, + "grad_norm": 1.7527619826195286, + "learning_rate": 9.714129193009923e-06, + "loss": 0.4206, + "step": 4652 + }, + { + "epoch": 0.13, + "grad_norm": 1.747843155354243, + "learning_rate": 9.713972625309199e-06, + "loss": 0.4324, + "step": 4653 + }, + { + "epoch": 0.13, + "grad_norm": 1.8448667229488982, + "learning_rate": 9.713816016007614e-06, + "loss": 0.4107, + "step": 4654 + }, + { + "epoch": 0.14, + "grad_norm": 1.6623439520961065, + "learning_rate": 9.713659365106551e-06, + "loss": 0.4561, + "step": 4655 + }, + { + "epoch": 0.14, + "grad_norm": 1.616430309741022, + "learning_rate": 9.713502672607391e-06, + "loss": 0.3764, + "step": 4656 + }, + { + "epoch": 0.14, + "grad_norm": 1.8383692011617785, + "learning_rate": 9.713345938511518e-06, + "loss": 0.408, + "step": 4657 + }, + { + "epoch": 0.14, + "grad_norm": 1.556907976018011, + "learning_rate": 9.713189162820316e-06, + "loss": 0.415, + "step": 4658 + }, + { + "epoch": 0.14, + "grad_norm": 1.0592187386766545, + "learning_rate": 9.713032345535166e-06, + "loss": 0.596, + "step": 4659 + }, + { + "epoch": 0.14, + "grad_norm": 1.4817453657178477, + "learning_rate": 9.712875486657453e-06, + "loss": 0.3734, + "step": 4660 + }, + { + "epoch": 0.14, + "grad_norm": 0.9643584718710592, + "learning_rate": 9.712718586188564e-06, + "loss": 0.608, + "step": 4661 + }, + { + "epoch": 0.14, + "grad_norm": 1.5904093646187007, + "learning_rate": 9.712561644129878e-06, + "loss": 0.3829, + "step": 4662 + }, + { + "epoch": 0.14, + "grad_norm": 1.767237593994972, + "learning_rate": 9.712404660482785e-06, + "loss": 0.4066, + "step": 4663 + }, + { + "epoch": 0.14, + "grad_norm": 1.1145446457177006, + "learning_rate": 9.712247635248668e-06, + "loss": 0.649, + "step": 4664 + }, + { + "epoch": 0.14, + "grad_norm": 1.680771376762497, + "learning_rate": 9.712090568428914e-06, + "loss": 0.3776, + "step": 4665 + }, + { + "epoch": 0.14, + "grad_norm": 2.283379549469378, + "learning_rate": 9.711933460024908e-06, + "loss": 0.4652, + "step": 4666 + }, + { + "epoch": 0.14, + "grad_norm": 1.5595180487826155, + "learning_rate": 9.711776310038037e-06, + "loss": 0.4036, + "step": 4667 + }, + { + "epoch": 0.14, + "grad_norm": 1.5959131217050724, + "learning_rate": 9.711619118469688e-06, + "loss": 0.401, + "step": 4668 + }, + { + "epoch": 0.14, + "grad_norm": 1.5804670209362564, + "learning_rate": 9.711461885321247e-06, + "loss": 0.3899, + "step": 4669 + }, + { + "epoch": 0.14, + "grad_norm": 1.1326417284605892, + "learning_rate": 9.711304610594104e-06, + "loss": 0.6105, + "step": 4670 + }, + { + "epoch": 0.14, + "grad_norm": 2.0990458510903323, + "learning_rate": 9.711147294289645e-06, + "loss": 0.4474, + "step": 4671 + }, + { + "epoch": 0.14, + "grad_norm": 1.823370185771148, + "learning_rate": 9.710989936409259e-06, + "loss": 0.3726, + "step": 4672 + }, + { + "epoch": 0.14, + "grad_norm": 1.624802539540672, + "learning_rate": 9.710832536954334e-06, + "loss": 0.4348, + "step": 4673 + }, + { + "epoch": 0.14, + "grad_norm": 5.25404514946986, + "learning_rate": 9.71067509592626e-06, + "loss": 0.4114, + "step": 4674 + }, + { + "epoch": 0.14, + "grad_norm": 1.7202481685514044, + "learning_rate": 9.710517613326424e-06, + "loss": 0.4214, + "step": 4675 + }, + { + "epoch": 0.14, + "grad_norm": 1.6145741367709576, + "learning_rate": 9.710360089156221e-06, + "loss": 0.3794, + "step": 4676 + }, + { + "epoch": 0.14, + "grad_norm": 1.6285966657512005, + "learning_rate": 9.710202523417036e-06, + "loss": 0.4702, + "step": 4677 + }, + { + "epoch": 0.14, + "grad_norm": 1.5279054613027783, + "learning_rate": 9.710044916110263e-06, + "loss": 0.4061, + "step": 4678 + }, + { + "epoch": 0.14, + "grad_norm": 1.6252357928043673, + "learning_rate": 9.70988726723729e-06, + "loss": 0.4558, + "step": 4679 + }, + { + "epoch": 0.14, + "grad_norm": 1.9578035195630554, + "learning_rate": 9.70972957679951e-06, + "loss": 0.4258, + "step": 4680 + }, + { + "epoch": 0.14, + "grad_norm": 1.5733266992733517, + "learning_rate": 9.709571844798315e-06, + "loss": 0.4133, + "step": 4681 + }, + { + "epoch": 0.14, + "grad_norm": 1.3984138464717184, + "learning_rate": 9.709414071235095e-06, + "loss": 0.3694, + "step": 4682 + }, + { + "epoch": 0.14, + "grad_norm": 1.5281912540468914, + "learning_rate": 9.709256256111245e-06, + "loss": 0.3799, + "step": 4683 + }, + { + "epoch": 0.14, + "grad_norm": 1.7507454857490599, + "learning_rate": 9.709098399428155e-06, + "loss": 0.3989, + "step": 4684 + }, + { + "epoch": 0.14, + "grad_norm": 1.9569757166574324, + "learning_rate": 9.70894050118722e-06, + "loss": 0.465, + "step": 4685 + }, + { + "epoch": 0.14, + "grad_norm": 1.733905641610906, + "learning_rate": 9.708782561389834e-06, + "loss": 0.3774, + "step": 4686 + }, + { + "epoch": 0.14, + "grad_norm": 1.6079031831548478, + "learning_rate": 9.708624580037388e-06, + "loss": 0.4071, + "step": 4687 + }, + { + "epoch": 0.14, + "grad_norm": 1.6873863877450805, + "learning_rate": 9.708466557131278e-06, + "loss": 0.361, + "step": 4688 + }, + { + "epoch": 0.14, + "grad_norm": 1.7687974585001713, + "learning_rate": 9.708308492672897e-06, + "loss": 0.4492, + "step": 4689 + }, + { + "epoch": 0.14, + "grad_norm": 1.5228807412504004, + "learning_rate": 9.708150386663643e-06, + "loss": 0.4276, + "step": 4690 + }, + { + "epoch": 0.14, + "grad_norm": 1.6473981720016764, + "learning_rate": 9.707992239104911e-06, + "loss": 0.4141, + "step": 4691 + }, + { + "epoch": 0.14, + "grad_norm": 1.469265978276985, + "learning_rate": 9.707834049998093e-06, + "loss": 0.3754, + "step": 4692 + }, + { + "epoch": 0.14, + "grad_norm": 1.7487324447929138, + "learning_rate": 9.707675819344589e-06, + "loss": 0.4158, + "step": 4693 + }, + { + "epoch": 0.14, + "grad_norm": 1.7382351651826757, + "learning_rate": 9.707517547145792e-06, + "loss": 0.4195, + "step": 4694 + }, + { + "epoch": 0.14, + "grad_norm": 1.0115799159617376, + "learning_rate": 9.707359233403102e-06, + "loss": 0.6022, + "step": 4695 + }, + { + "epoch": 0.14, + "grad_norm": 1.91488575375818, + "learning_rate": 9.707200878117912e-06, + "loss": 0.453, + "step": 4696 + }, + { + "epoch": 0.14, + "grad_norm": 1.7075963515861077, + "learning_rate": 9.707042481291624e-06, + "loss": 0.4124, + "step": 4697 + }, + { + "epoch": 0.14, + "grad_norm": 1.5784746543960468, + "learning_rate": 9.706884042925636e-06, + "loss": 0.4861, + "step": 4698 + }, + { + "epoch": 0.14, + "grad_norm": 1.630233049359054, + "learning_rate": 9.70672556302134e-06, + "loss": 0.425, + "step": 4699 + }, + { + "epoch": 0.14, + "grad_norm": 1.45650977317694, + "learning_rate": 9.70656704158014e-06, + "loss": 0.4071, + "step": 4700 + }, + { + "epoch": 0.14, + "grad_norm": 1.503678588434156, + "learning_rate": 9.706408478603435e-06, + "loss": 0.4127, + "step": 4701 + }, + { + "epoch": 0.14, + "grad_norm": 1.509564114922726, + "learning_rate": 9.706249874092622e-06, + "loss": 0.4367, + "step": 4702 + }, + { + "epoch": 0.14, + "grad_norm": 1.650506183110988, + "learning_rate": 9.706091228049102e-06, + "loss": 0.4194, + "step": 4703 + }, + { + "epoch": 0.14, + "grad_norm": 1.497514882522731, + "learning_rate": 9.705932540474274e-06, + "loss": 0.4175, + "step": 4704 + }, + { + "epoch": 0.14, + "grad_norm": 1.546259206064244, + "learning_rate": 9.705773811369539e-06, + "loss": 0.4237, + "step": 4705 + }, + { + "epoch": 0.14, + "grad_norm": 1.7720970150879136, + "learning_rate": 9.705615040736298e-06, + "loss": 0.3969, + "step": 4706 + }, + { + "epoch": 0.14, + "grad_norm": 1.6408182879913538, + "learning_rate": 9.705456228575954e-06, + "loss": 0.4747, + "step": 4707 + }, + { + "epoch": 0.14, + "grad_norm": 1.427205347309043, + "learning_rate": 9.705297374889905e-06, + "loss": 0.3906, + "step": 4708 + }, + { + "epoch": 0.14, + "grad_norm": 1.5994870336760423, + "learning_rate": 9.705138479679554e-06, + "loss": 0.3814, + "step": 4709 + }, + { + "epoch": 0.14, + "grad_norm": 1.4191182690845205, + "learning_rate": 9.704979542946305e-06, + "loss": 0.4119, + "step": 4710 + }, + { + "epoch": 0.14, + "grad_norm": 1.7497872334135025, + "learning_rate": 9.704820564691559e-06, + "loss": 0.3861, + "step": 4711 + }, + { + "epoch": 0.14, + "grad_norm": 1.6075461453799136, + "learning_rate": 9.704661544916717e-06, + "loss": 0.4302, + "step": 4712 + }, + { + "epoch": 0.14, + "grad_norm": 1.6339654043524967, + "learning_rate": 9.704502483623189e-06, + "loss": 0.3891, + "step": 4713 + }, + { + "epoch": 0.14, + "grad_norm": 2.1008828007911338, + "learning_rate": 9.704343380812371e-06, + "loss": 0.4177, + "step": 4714 + }, + { + "epoch": 0.14, + "grad_norm": 1.5076252420888387, + "learning_rate": 9.704184236485672e-06, + "loss": 0.3908, + "step": 4715 + }, + { + "epoch": 0.14, + "grad_norm": 1.5928367835344412, + "learning_rate": 9.704025050644494e-06, + "loss": 0.401, + "step": 4716 + }, + { + "epoch": 0.14, + "grad_norm": 1.8714422447274632, + "learning_rate": 9.703865823290242e-06, + "loss": 0.3925, + "step": 4717 + }, + { + "epoch": 0.14, + "grad_norm": 2.09458380500543, + "learning_rate": 9.703706554424324e-06, + "loss": 0.4195, + "step": 4718 + }, + { + "epoch": 0.14, + "grad_norm": 2.6432210917639916, + "learning_rate": 9.703547244048142e-06, + "loss": 0.3812, + "step": 4719 + }, + { + "epoch": 0.14, + "grad_norm": 1.5758252147004304, + "learning_rate": 9.703387892163104e-06, + "loss": 0.408, + "step": 4720 + }, + { + "epoch": 0.14, + "grad_norm": 1.5193138349199176, + "learning_rate": 9.703228498770617e-06, + "loss": 0.4057, + "step": 4721 + }, + { + "epoch": 0.14, + "grad_norm": 1.568150409387961, + "learning_rate": 9.703069063872084e-06, + "loss": 0.3996, + "step": 4722 + }, + { + "epoch": 0.14, + "grad_norm": 1.5609544370978587, + "learning_rate": 9.702909587468916e-06, + "loss": 0.38, + "step": 4723 + }, + { + "epoch": 0.14, + "grad_norm": 1.4850064839705792, + "learning_rate": 9.702750069562518e-06, + "loss": 0.3923, + "step": 4724 + }, + { + "epoch": 0.14, + "grad_norm": 1.747035110944103, + "learning_rate": 9.702590510154297e-06, + "loss": 0.4204, + "step": 4725 + }, + { + "epoch": 0.14, + "grad_norm": 1.614699913704587, + "learning_rate": 9.702430909245665e-06, + "loss": 0.4055, + "step": 4726 + }, + { + "epoch": 0.14, + "grad_norm": 1.503283179174868, + "learning_rate": 9.702271266838027e-06, + "loss": 0.4337, + "step": 4727 + }, + { + "epoch": 0.14, + "grad_norm": 1.5965497388968126, + "learning_rate": 9.702111582932794e-06, + "loss": 0.4036, + "step": 4728 + }, + { + "epoch": 0.14, + "grad_norm": 1.5442605903874818, + "learning_rate": 9.701951857531372e-06, + "loss": 0.427, + "step": 4729 + }, + { + "epoch": 0.14, + "grad_norm": 1.8565011666453932, + "learning_rate": 9.701792090635174e-06, + "loss": 0.4929, + "step": 4730 + }, + { + "epoch": 0.14, + "grad_norm": 1.499070337506586, + "learning_rate": 9.70163228224561e-06, + "loss": 0.3967, + "step": 4731 + }, + { + "epoch": 0.14, + "grad_norm": 3.088110431635222, + "learning_rate": 9.701472432364087e-06, + "loss": 0.4062, + "step": 4732 + }, + { + "epoch": 0.14, + "grad_norm": 1.5423753356742897, + "learning_rate": 9.701312540992018e-06, + "loss": 0.3937, + "step": 4733 + }, + { + "epoch": 0.14, + "grad_norm": 2.28123377467236, + "learning_rate": 9.701152608130813e-06, + "loss": 0.4121, + "step": 4734 + }, + { + "epoch": 0.14, + "grad_norm": 2.1563765546464886, + "learning_rate": 9.700992633781885e-06, + "loss": 0.3975, + "step": 4735 + }, + { + "epoch": 0.14, + "grad_norm": 1.4909810948251294, + "learning_rate": 9.700832617946645e-06, + "loss": 0.4318, + "step": 4736 + }, + { + "epoch": 0.14, + "grad_norm": 1.525563923899845, + "learning_rate": 9.700672560626503e-06, + "loss": 0.409, + "step": 4737 + }, + { + "epoch": 0.14, + "grad_norm": 1.6388488552997298, + "learning_rate": 9.700512461822875e-06, + "loss": 0.4173, + "step": 4738 + }, + { + "epoch": 0.14, + "grad_norm": 2.0180277570785434, + "learning_rate": 9.700352321537174e-06, + "loss": 0.4325, + "step": 4739 + }, + { + "epoch": 0.14, + "grad_norm": 1.6102623201689672, + "learning_rate": 9.70019213977081e-06, + "loss": 0.4203, + "step": 4740 + }, + { + "epoch": 0.14, + "grad_norm": 1.508939061307203, + "learning_rate": 9.700031916525199e-06, + "loss": 0.4096, + "step": 4741 + }, + { + "epoch": 0.14, + "grad_norm": 1.7261090166264281, + "learning_rate": 9.699871651801754e-06, + "loss": 0.3939, + "step": 4742 + }, + { + "epoch": 0.14, + "grad_norm": 1.7538518043331262, + "learning_rate": 9.699711345601887e-06, + "loss": 0.4147, + "step": 4743 + }, + { + "epoch": 0.14, + "grad_norm": 1.5998093019377722, + "learning_rate": 9.69955099792702e-06, + "loss": 0.4143, + "step": 4744 + }, + { + "epoch": 0.14, + "grad_norm": 1.4394503000029504, + "learning_rate": 9.69939060877856e-06, + "loss": 0.4143, + "step": 4745 + }, + { + "epoch": 0.14, + "grad_norm": 1.7747604479018302, + "learning_rate": 9.699230178157926e-06, + "loss": 0.3813, + "step": 4746 + }, + { + "epoch": 0.14, + "grad_norm": 1.6577267638118798, + "learning_rate": 9.699069706066534e-06, + "loss": 0.4174, + "step": 4747 + }, + { + "epoch": 0.14, + "grad_norm": 1.4517477869391482, + "learning_rate": 9.698909192505799e-06, + "loss": 0.4033, + "step": 4748 + }, + { + "epoch": 0.14, + "grad_norm": 1.5298363027262263, + "learning_rate": 9.69874863747714e-06, + "loss": 0.3868, + "step": 4749 + }, + { + "epoch": 0.14, + "grad_norm": 1.6362871565505281, + "learning_rate": 9.69858804098197e-06, + "loss": 0.4015, + "step": 4750 + }, + { + "epoch": 0.14, + "grad_norm": 1.6051088025823728, + "learning_rate": 9.698427403021711e-06, + "loss": 0.4393, + "step": 4751 + }, + { + "epoch": 0.14, + "grad_norm": 1.6264932733854844, + "learning_rate": 9.698266723597775e-06, + "loss": 0.4135, + "step": 4752 + }, + { + "epoch": 0.14, + "grad_norm": 1.7617134325632875, + "learning_rate": 9.698106002711586e-06, + "loss": 0.4238, + "step": 4753 + }, + { + "epoch": 0.14, + "grad_norm": 1.6772657890437537, + "learning_rate": 9.697945240364558e-06, + "loss": 0.3843, + "step": 4754 + }, + { + "epoch": 0.14, + "grad_norm": 1.4947920172205797, + "learning_rate": 9.697784436558111e-06, + "loss": 0.405, + "step": 4755 + }, + { + "epoch": 0.14, + "grad_norm": 1.618949817847978, + "learning_rate": 9.697623591293663e-06, + "loss": 0.4733, + "step": 4756 + }, + { + "epoch": 0.14, + "grad_norm": 1.5560064278283656, + "learning_rate": 9.697462704572638e-06, + "loss": 0.3889, + "step": 4757 + }, + { + "epoch": 0.14, + "grad_norm": 2.0446972362247937, + "learning_rate": 9.69730177639645e-06, + "loss": 0.486, + "step": 4758 + }, + { + "epoch": 0.14, + "grad_norm": 1.599794510246416, + "learning_rate": 9.697140806766522e-06, + "loss": 0.3995, + "step": 4759 + }, + { + "epoch": 0.14, + "grad_norm": 1.6958075104281602, + "learning_rate": 9.696979795684273e-06, + "loss": 0.4168, + "step": 4760 + }, + { + "epoch": 0.14, + "grad_norm": 1.5584332269195305, + "learning_rate": 9.696818743151128e-06, + "loss": 0.4024, + "step": 4761 + }, + { + "epoch": 0.14, + "grad_norm": 1.6838490759715719, + "learning_rate": 9.696657649168504e-06, + "loss": 0.4141, + "step": 4762 + }, + { + "epoch": 0.14, + "grad_norm": 1.6673034378315652, + "learning_rate": 9.696496513737826e-06, + "loss": 0.4259, + "step": 4763 + }, + { + "epoch": 0.14, + "grad_norm": 1.8538589273084718, + "learning_rate": 9.696335336860512e-06, + "loss": 0.4309, + "step": 4764 + }, + { + "epoch": 0.14, + "grad_norm": 1.6027924598648218, + "learning_rate": 9.696174118537986e-06, + "loss": 0.3995, + "step": 4765 + }, + { + "epoch": 0.14, + "grad_norm": 1.595533535155205, + "learning_rate": 9.696012858771673e-06, + "loss": 0.3902, + "step": 4766 + }, + { + "epoch": 0.14, + "grad_norm": 1.6533753584691098, + "learning_rate": 9.695851557562994e-06, + "loss": 0.396, + "step": 4767 + }, + { + "epoch": 0.14, + "grad_norm": 1.6333599750322287, + "learning_rate": 9.695690214913372e-06, + "loss": 0.3828, + "step": 4768 + }, + { + "epoch": 0.14, + "grad_norm": 1.4230345378747304, + "learning_rate": 9.695528830824232e-06, + "loss": 0.3701, + "step": 4769 + }, + { + "epoch": 0.14, + "grad_norm": 1.5245760232053227, + "learning_rate": 9.695367405297e-06, + "loss": 0.4646, + "step": 4770 + }, + { + "epoch": 0.14, + "grad_norm": 1.5532000324865467, + "learning_rate": 9.695205938333096e-06, + "loss": 0.3964, + "step": 4771 + }, + { + "epoch": 0.14, + "grad_norm": 1.3714593439141782, + "learning_rate": 9.69504442993395e-06, + "loss": 0.3955, + "step": 4772 + }, + { + "epoch": 0.14, + "grad_norm": 1.5999407425599672, + "learning_rate": 9.694882880100981e-06, + "loss": 0.3839, + "step": 4773 + }, + { + "epoch": 0.14, + "grad_norm": 1.4495082588592865, + "learning_rate": 9.694721288835622e-06, + "loss": 0.399, + "step": 4774 + }, + { + "epoch": 0.14, + "grad_norm": 1.9233590424442617, + "learning_rate": 9.694559656139295e-06, + "loss": 0.4313, + "step": 4775 + }, + { + "epoch": 0.14, + "grad_norm": 1.7430072862144834, + "learning_rate": 9.694397982013426e-06, + "loss": 0.395, + "step": 4776 + }, + { + "epoch": 0.14, + "grad_norm": 1.6042088044987912, + "learning_rate": 9.694236266459443e-06, + "loss": 0.4263, + "step": 4777 + }, + { + "epoch": 0.14, + "grad_norm": 1.5340804708755968, + "learning_rate": 9.694074509478774e-06, + "loss": 0.4033, + "step": 4778 + }, + { + "epoch": 0.14, + "grad_norm": 1.4821068567975115, + "learning_rate": 9.693912711072845e-06, + "loss": 0.3776, + "step": 4779 + }, + { + "epoch": 0.14, + "grad_norm": 1.1268083682161179, + "learning_rate": 9.693750871243082e-06, + "loss": 0.653, + "step": 4780 + }, + { + "epoch": 0.14, + "grad_norm": 1.6706982609322247, + "learning_rate": 9.693588989990918e-06, + "loss": 0.4151, + "step": 4781 + }, + { + "epoch": 0.14, + "grad_norm": 1.5498563189150871, + "learning_rate": 9.693427067317777e-06, + "loss": 0.4562, + "step": 4782 + }, + { + "epoch": 0.14, + "grad_norm": 1.5849214816187167, + "learning_rate": 9.693265103225093e-06, + "loss": 0.3912, + "step": 4783 + }, + { + "epoch": 0.14, + "grad_norm": 1.4314594629108626, + "learning_rate": 9.69310309771429e-06, + "loss": 0.3884, + "step": 4784 + }, + { + "epoch": 0.14, + "grad_norm": 1.400024067078867, + "learning_rate": 9.692941050786802e-06, + "loss": 0.3899, + "step": 4785 + }, + { + "epoch": 0.14, + "grad_norm": 1.8052525654436535, + "learning_rate": 9.692778962444056e-06, + "loss": 0.4191, + "step": 4786 + }, + { + "epoch": 0.14, + "grad_norm": 1.5193789284025894, + "learning_rate": 9.692616832687482e-06, + "loss": 0.4253, + "step": 4787 + }, + { + "epoch": 0.14, + "grad_norm": 1.4773954271775438, + "learning_rate": 9.692454661518515e-06, + "loss": 0.4159, + "step": 4788 + }, + { + "epoch": 0.14, + "grad_norm": 1.5167077972269674, + "learning_rate": 9.692292448938582e-06, + "loss": 0.41, + "step": 4789 + }, + { + "epoch": 0.14, + "grad_norm": 1.4192944847506688, + "learning_rate": 9.692130194949116e-06, + "loss": 0.4053, + "step": 4790 + }, + { + "epoch": 0.14, + "grad_norm": 1.5009875687635408, + "learning_rate": 9.69196789955155e-06, + "loss": 0.4126, + "step": 4791 + }, + { + "epoch": 0.14, + "grad_norm": 1.5180497356618394, + "learning_rate": 9.691805562747316e-06, + "loss": 0.4304, + "step": 4792 + }, + { + "epoch": 0.14, + "grad_norm": 1.71924147147255, + "learning_rate": 9.691643184537845e-06, + "loss": 0.3931, + "step": 4793 + }, + { + "epoch": 0.14, + "grad_norm": 1.8473847241346848, + "learning_rate": 9.691480764924569e-06, + "loss": 0.3962, + "step": 4794 + }, + { + "epoch": 0.14, + "grad_norm": 1.4391419810659414, + "learning_rate": 9.691318303908926e-06, + "loss": 0.4215, + "step": 4795 + }, + { + "epoch": 0.14, + "grad_norm": 1.5331054289051675, + "learning_rate": 9.691155801492344e-06, + "loss": 0.4311, + "step": 4796 + }, + { + "epoch": 0.14, + "grad_norm": 1.4808346253007467, + "learning_rate": 9.690993257676262e-06, + "loss": 0.4061, + "step": 4797 + }, + { + "epoch": 0.14, + "grad_norm": 1.449834152885452, + "learning_rate": 9.690830672462112e-06, + "loss": 0.3922, + "step": 4798 + }, + { + "epoch": 0.14, + "grad_norm": 1.8174523866158292, + "learning_rate": 9.690668045851331e-06, + "loss": 0.3858, + "step": 4799 + }, + { + "epoch": 0.14, + "grad_norm": 1.3801507084127904, + "learning_rate": 9.69050537784535e-06, + "loss": 0.4038, + "step": 4800 + }, + { + "epoch": 0.14, + "grad_norm": 1.5165716921653178, + "learning_rate": 9.690342668445607e-06, + "loss": 0.4298, + "step": 4801 + }, + { + "epoch": 0.14, + "grad_norm": 1.5185703500309666, + "learning_rate": 9.69017991765354e-06, + "loss": 0.3904, + "step": 4802 + }, + { + "epoch": 0.14, + "grad_norm": 1.183857768757179, + "learning_rate": 9.69001712547058e-06, + "loss": 0.5913, + "step": 4803 + }, + { + "epoch": 0.14, + "grad_norm": 4.787767808281908, + "learning_rate": 9.689854291898171e-06, + "loss": 0.3768, + "step": 4804 + }, + { + "epoch": 0.14, + "grad_norm": 1.571831451648885, + "learning_rate": 9.689691416937743e-06, + "loss": 0.4074, + "step": 4805 + }, + { + "epoch": 0.14, + "grad_norm": 1.5740395229426656, + "learning_rate": 9.689528500590737e-06, + "loss": 0.4297, + "step": 4806 + }, + { + "epoch": 0.14, + "grad_norm": 1.6448375791083063, + "learning_rate": 9.68936554285859e-06, + "loss": 0.3971, + "step": 4807 + }, + { + "epoch": 0.14, + "grad_norm": 3.418098796282251, + "learning_rate": 9.68920254374274e-06, + "loss": 0.4635, + "step": 4808 + }, + { + "epoch": 0.14, + "grad_norm": 3.1722923908373253, + "learning_rate": 9.689039503244625e-06, + "loss": 0.4129, + "step": 4809 + }, + { + "epoch": 0.14, + "grad_norm": 1.619491185936546, + "learning_rate": 9.688876421365685e-06, + "loss": 0.43, + "step": 4810 + }, + { + "epoch": 0.14, + "grad_norm": 1.005512860852687, + "learning_rate": 9.688713298107359e-06, + "loss": 0.5783, + "step": 4811 + }, + { + "epoch": 0.14, + "grad_norm": 1.7308689309267415, + "learning_rate": 9.688550133471083e-06, + "loss": 0.4422, + "step": 4812 + }, + { + "epoch": 0.14, + "grad_norm": 1.729349598928337, + "learning_rate": 9.688386927458304e-06, + "loss": 0.42, + "step": 4813 + }, + { + "epoch": 0.14, + "grad_norm": 1.8643311067673787, + "learning_rate": 9.688223680070457e-06, + "loss": 0.4449, + "step": 4814 + }, + { + "epoch": 0.14, + "grad_norm": 1.8526501583517816, + "learning_rate": 9.688060391308985e-06, + "loss": 0.4251, + "step": 4815 + }, + { + "epoch": 0.14, + "grad_norm": 1.6042531887241274, + "learning_rate": 9.687897061175326e-06, + "loss": 0.3931, + "step": 4816 + }, + { + "epoch": 0.14, + "grad_norm": 1.9885044926203195, + "learning_rate": 9.687733689670925e-06, + "loss": 0.3881, + "step": 4817 + }, + { + "epoch": 0.14, + "grad_norm": 1.4480517460248186, + "learning_rate": 9.687570276797221e-06, + "loss": 0.3846, + "step": 4818 + }, + { + "epoch": 0.14, + "grad_norm": 1.6666869070179127, + "learning_rate": 9.687406822555658e-06, + "loss": 0.3851, + "step": 4819 + }, + { + "epoch": 0.14, + "grad_norm": 1.4949769671237965, + "learning_rate": 9.687243326947677e-06, + "loss": 0.3826, + "step": 4820 + }, + { + "epoch": 0.14, + "grad_norm": 1.9068076308442852, + "learning_rate": 9.687079789974723e-06, + "loss": 0.4004, + "step": 4821 + }, + { + "epoch": 0.14, + "grad_norm": 1.8751737605786385, + "learning_rate": 9.686916211638237e-06, + "loss": 0.469, + "step": 4822 + }, + { + "epoch": 0.14, + "grad_norm": 1.579118564574448, + "learning_rate": 9.686752591939663e-06, + "loss": 0.4031, + "step": 4823 + }, + { + "epoch": 0.14, + "grad_norm": 1.5236234819704706, + "learning_rate": 9.686588930880448e-06, + "loss": 0.4251, + "step": 4824 + }, + { + "epoch": 0.14, + "grad_norm": 1.9422863112246684, + "learning_rate": 9.68642522846203e-06, + "loss": 0.4308, + "step": 4825 + }, + { + "epoch": 0.14, + "grad_norm": 1.6276590173922916, + "learning_rate": 9.68626148468586e-06, + "loss": 0.3743, + "step": 4826 + }, + { + "epoch": 0.14, + "grad_norm": 2.188298691902715, + "learning_rate": 9.686097699553378e-06, + "loss": 0.4202, + "step": 4827 + }, + { + "epoch": 0.14, + "grad_norm": 1.8196198348942216, + "learning_rate": 9.685933873066034e-06, + "loss": 0.4126, + "step": 4828 + }, + { + "epoch": 0.14, + "grad_norm": 1.4852215623598028, + "learning_rate": 9.685770005225271e-06, + "loss": 0.3846, + "step": 4829 + }, + { + "epoch": 0.14, + "grad_norm": 2.423914120284326, + "learning_rate": 9.685606096032536e-06, + "loss": 0.4171, + "step": 4830 + }, + { + "epoch": 0.14, + "grad_norm": 1.4683594455246827, + "learning_rate": 9.685442145489275e-06, + "loss": 0.3683, + "step": 4831 + }, + { + "epoch": 0.14, + "grad_norm": 1.5258843345099538, + "learning_rate": 9.685278153596935e-06, + "loss": 0.4336, + "step": 4832 + }, + { + "epoch": 0.14, + "grad_norm": 1.5438537351955506, + "learning_rate": 9.685114120356963e-06, + "loss": 0.3846, + "step": 4833 + }, + { + "epoch": 0.14, + "grad_norm": 1.578078292459915, + "learning_rate": 9.684950045770807e-06, + "loss": 0.4095, + "step": 4834 + }, + { + "epoch": 0.14, + "grad_norm": 1.1507794798717734, + "learning_rate": 9.684785929839915e-06, + "loss": 0.6082, + "step": 4835 + }, + { + "epoch": 0.14, + "grad_norm": 1.8390904407254587, + "learning_rate": 9.684621772565735e-06, + "loss": 0.3898, + "step": 4836 + }, + { + "epoch": 0.14, + "grad_norm": 1.5955438810480096, + "learning_rate": 9.684457573949717e-06, + "loss": 0.4077, + "step": 4837 + }, + { + "epoch": 0.14, + "grad_norm": 1.5371825866624313, + "learning_rate": 9.684293333993308e-06, + "loss": 0.3864, + "step": 4838 + }, + { + "epoch": 0.14, + "grad_norm": 1.4461638523999185, + "learning_rate": 9.684129052697959e-06, + "loss": 0.392, + "step": 4839 + }, + { + "epoch": 0.14, + "grad_norm": 1.5077257096706371, + "learning_rate": 9.683964730065118e-06, + "loss": 0.4162, + "step": 4840 + }, + { + "epoch": 0.14, + "grad_norm": 1.4776035270974994, + "learning_rate": 9.683800366096236e-06, + "loss": 0.3819, + "step": 4841 + }, + { + "epoch": 0.14, + "grad_norm": 1.4077812460188914, + "learning_rate": 9.683635960792766e-06, + "loss": 0.4139, + "step": 4842 + }, + { + "epoch": 0.14, + "grad_norm": 1.6046107046281117, + "learning_rate": 9.683471514156155e-06, + "loss": 0.4268, + "step": 4843 + }, + { + "epoch": 0.14, + "grad_norm": 1.605121569219075, + "learning_rate": 9.683307026187857e-06, + "loss": 0.4173, + "step": 4844 + }, + { + "epoch": 0.14, + "grad_norm": 1.4351202012221707, + "learning_rate": 9.683142496889322e-06, + "loss": 0.4039, + "step": 4845 + }, + { + "epoch": 0.14, + "grad_norm": 1.4055542157629701, + "learning_rate": 9.682977926262001e-06, + "loss": 0.3646, + "step": 4846 + }, + { + "epoch": 0.14, + "grad_norm": 1.4286942312553168, + "learning_rate": 9.682813314307349e-06, + "loss": 0.3731, + "step": 4847 + }, + { + "epoch": 0.14, + "grad_norm": 2.035634283070366, + "learning_rate": 9.682648661026818e-06, + "loss": 0.4734, + "step": 4848 + }, + { + "epoch": 0.14, + "grad_norm": 1.6820631807236395, + "learning_rate": 9.682483966421861e-06, + "loss": 0.4092, + "step": 4849 + }, + { + "epoch": 0.14, + "grad_norm": 1.5326513414609981, + "learning_rate": 9.68231923049393e-06, + "loss": 0.4147, + "step": 4850 + }, + { + "epoch": 0.14, + "grad_norm": 1.8587898061856047, + "learning_rate": 9.68215445324448e-06, + "loss": 0.3859, + "step": 4851 + }, + { + "epoch": 0.14, + "grad_norm": 1.6045706430930726, + "learning_rate": 9.681989634674967e-06, + "loss": 0.4066, + "step": 4852 + }, + { + "epoch": 0.14, + "grad_norm": 1.555212528109795, + "learning_rate": 9.68182477478684e-06, + "loss": 0.3884, + "step": 4853 + }, + { + "epoch": 0.14, + "grad_norm": 1.7108739500410086, + "learning_rate": 9.681659873581559e-06, + "loss": 0.3869, + "step": 4854 + }, + { + "epoch": 0.14, + "grad_norm": 1.5948185157353036, + "learning_rate": 9.681494931060578e-06, + "loss": 0.4126, + "step": 4855 + }, + { + "epoch": 0.14, + "grad_norm": 1.8825075694224198, + "learning_rate": 9.681329947225353e-06, + "loss": 0.4152, + "step": 4856 + }, + { + "epoch": 0.14, + "grad_norm": 1.4751976635898305, + "learning_rate": 9.681164922077339e-06, + "loss": 0.402, + "step": 4857 + }, + { + "epoch": 0.14, + "grad_norm": 1.4402430538838553, + "learning_rate": 9.680999855617992e-06, + "loss": 0.3909, + "step": 4858 + }, + { + "epoch": 0.14, + "grad_norm": 1.4096379384931093, + "learning_rate": 9.680834747848769e-06, + "loss": 0.3901, + "step": 4859 + }, + { + "epoch": 0.14, + "grad_norm": 1.976146850511432, + "learning_rate": 9.680669598771128e-06, + "loss": 0.4238, + "step": 4860 + }, + { + "epoch": 0.14, + "grad_norm": 1.695415449546633, + "learning_rate": 9.680504408386526e-06, + "loss": 0.4521, + "step": 4861 + }, + { + "epoch": 0.14, + "grad_norm": 1.5629643687006955, + "learning_rate": 9.68033917669642e-06, + "loss": 0.3947, + "step": 4862 + }, + { + "epoch": 0.14, + "grad_norm": 1.5059036008458326, + "learning_rate": 9.68017390370227e-06, + "loss": 0.3895, + "step": 4863 + }, + { + "epoch": 0.14, + "grad_norm": 1.4361807725097204, + "learning_rate": 9.680008589405532e-06, + "loss": 0.4001, + "step": 4864 + }, + { + "epoch": 0.14, + "grad_norm": 1.8330313542715146, + "learning_rate": 9.679843233807666e-06, + "loss": 0.4391, + "step": 4865 + }, + { + "epoch": 0.14, + "grad_norm": 1.5031960263571984, + "learning_rate": 9.679677836910133e-06, + "loss": 0.3885, + "step": 4866 + }, + { + "epoch": 0.14, + "grad_norm": 1.4919745692031032, + "learning_rate": 9.67951239871439e-06, + "loss": 0.4308, + "step": 4867 + }, + { + "epoch": 0.14, + "grad_norm": 1.6409111315782605, + "learning_rate": 9.679346919221901e-06, + "loss": 0.3943, + "step": 4868 + }, + { + "epoch": 0.14, + "grad_norm": 1.598692345440682, + "learning_rate": 9.67918139843412e-06, + "loss": 0.4066, + "step": 4869 + }, + { + "epoch": 0.14, + "grad_norm": 1.4683428336863162, + "learning_rate": 9.679015836352513e-06, + "loss": 0.3895, + "step": 4870 + }, + { + "epoch": 0.14, + "grad_norm": 1.593628835192591, + "learning_rate": 9.67885023297854e-06, + "loss": 0.3892, + "step": 4871 + }, + { + "epoch": 0.14, + "grad_norm": 1.4005077771380672, + "learning_rate": 9.678684588313661e-06, + "loss": 0.3934, + "step": 4872 + }, + { + "epoch": 0.14, + "grad_norm": 1.7286930322728404, + "learning_rate": 9.678518902359337e-06, + "loss": 0.4112, + "step": 4873 + }, + { + "epoch": 0.14, + "grad_norm": 1.5209406120277347, + "learning_rate": 9.678353175117033e-06, + "loss": 0.3744, + "step": 4874 + }, + { + "epoch": 0.14, + "grad_norm": 1.903530520296267, + "learning_rate": 9.678187406588211e-06, + "loss": 0.4208, + "step": 4875 + }, + { + "epoch": 0.14, + "grad_norm": 1.6718201380197466, + "learning_rate": 9.678021596774332e-06, + "loss": 0.4276, + "step": 4876 + }, + { + "epoch": 0.14, + "grad_norm": 1.644204126837206, + "learning_rate": 9.677855745676862e-06, + "loss": 0.3962, + "step": 4877 + }, + { + "epoch": 0.14, + "grad_norm": 1.445939910976658, + "learning_rate": 9.677689853297263e-06, + "loss": 0.3794, + "step": 4878 + }, + { + "epoch": 0.14, + "grad_norm": 1.8304511077785601, + "learning_rate": 9.677523919636998e-06, + "loss": 0.3997, + "step": 4879 + }, + { + "epoch": 0.14, + "grad_norm": 1.7026717247465262, + "learning_rate": 9.677357944697532e-06, + "loss": 0.4489, + "step": 4880 + }, + { + "epoch": 0.14, + "grad_norm": 1.6173925278880537, + "learning_rate": 9.677191928480332e-06, + "loss": 0.4085, + "step": 4881 + }, + { + "epoch": 0.14, + "grad_norm": 2.028611324111357, + "learning_rate": 9.67702587098686e-06, + "loss": 0.4166, + "step": 4882 + }, + { + "epoch": 0.14, + "grad_norm": 1.7412065120118891, + "learning_rate": 9.676859772218584e-06, + "loss": 0.3936, + "step": 4883 + }, + { + "epoch": 0.14, + "grad_norm": 1.5880689539720754, + "learning_rate": 9.676693632176968e-06, + "loss": 0.4521, + "step": 4884 + }, + { + "epoch": 0.14, + "grad_norm": 1.4928049076264114, + "learning_rate": 9.67652745086348e-06, + "loss": 0.4072, + "step": 4885 + }, + { + "epoch": 0.14, + "grad_norm": 1.3755143397696705, + "learning_rate": 9.676361228279583e-06, + "loss": 0.4291, + "step": 4886 + }, + { + "epoch": 0.14, + "grad_norm": 1.9818578673583491, + "learning_rate": 9.676194964426748e-06, + "loss": 0.4061, + "step": 4887 + }, + { + "epoch": 0.14, + "grad_norm": 1.5982223861921596, + "learning_rate": 9.67602865930644e-06, + "loss": 0.446, + "step": 4888 + }, + { + "epoch": 0.14, + "grad_norm": 1.5834921309181744, + "learning_rate": 9.675862312920128e-06, + "loss": 0.4351, + "step": 4889 + }, + { + "epoch": 0.14, + "grad_norm": 1.6219809189657106, + "learning_rate": 9.675695925269277e-06, + "loss": 0.4126, + "step": 4890 + }, + { + "epoch": 0.14, + "grad_norm": 2.6696251404594915, + "learning_rate": 9.675529496355361e-06, + "loss": 0.399, + "step": 4891 + }, + { + "epoch": 0.14, + "grad_norm": 1.9473182379961875, + "learning_rate": 9.675363026179843e-06, + "loss": 0.3988, + "step": 4892 + }, + { + "epoch": 0.14, + "grad_norm": 1.6129981638803992, + "learning_rate": 9.675196514744195e-06, + "loss": 0.413, + "step": 4893 + }, + { + "epoch": 0.14, + "grad_norm": 1.9049243795413742, + "learning_rate": 9.675029962049885e-06, + "loss": 0.4071, + "step": 4894 + }, + { + "epoch": 0.14, + "grad_norm": 1.5985309073029275, + "learning_rate": 9.674863368098386e-06, + "loss": 0.4011, + "step": 4895 + }, + { + "epoch": 0.14, + "grad_norm": 1.644195530573912, + "learning_rate": 9.674696732891163e-06, + "loss": 0.3861, + "step": 4896 + }, + { + "epoch": 0.14, + "grad_norm": 1.7155611615700115, + "learning_rate": 9.674530056429691e-06, + "loss": 0.3886, + "step": 4897 + }, + { + "epoch": 0.14, + "grad_norm": 1.0768675931585703, + "learning_rate": 9.67436333871544e-06, + "loss": 0.6256, + "step": 4898 + }, + { + "epoch": 0.14, + "grad_norm": 1.8433352633350244, + "learning_rate": 9.67419657974988e-06, + "loss": 0.4153, + "step": 4899 + }, + { + "epoch": 0.14, + "grad_norm": 1.8090078388648554, + "learning_rate": 9.674029779534483e-06, + "loss": 0.4217, + "step": 4900 + }, + { + "epoch": 0.14, + "grad_norm": 2.01551587757959, + "learning_rate": 9.673862938070722e-06, + "loss": 0.4378, + "step": 4901 + }, + { + "epoch": 0.14, + "grad_norm": 1.5654149597918914, + "learning_rate": 9.673696055360069e-06, + "loss": 0.4148, + "step": 4902 + }, + { + "epoch": 0.14, + "grad_norm": 1.9231178793907462, + "learning_rate": 9.673529131403995e-06, + "loss": 0.4264, + "step": 4903 + }, + { + "epoch": 0.14, + "grad_norm": 1.8834185403012538, + "learning_rate": 9.673362166203974e-06, + "loss": 0.407, + "step": 4904 + }, + { + "epoch": 0.14, + "grad_norm": 1.688824461545638, + "learning_rate": 9.673195159761482e-06, + "loss": 0.4035, + "step": 4905 + }, + { + "epoch": 0.14, + "grad_norm": 1.9450392277444732, + "learning_rate": 9.67302811207799e-06, + "loss": 0.3866, + "step": 4906 + }, + { + "epoch": 0.14, + "grad_norm": 1.696273806778349, + "learning_rate": 9.672861023154974e-06, + "loss": 0.3937, + "step": 4907 + }, + { + "epoch": 0.14, + "grad_norm": 1.4296400382925776, + "learning_rate": 9.672693892993908e-06, + "loss": 0.4032, + "step": 4908 + }, + { + "epoch": 0.14, + "grad_norm": 1.7093865611030898, + "learning_rate": 9.672526721596265e-06, + "loss": 0.3962, + "step": 4909 + }, + { + "epoch": 0.14, + "grad_norm": 1.623569443579484, + "learning_rate": 9.672359508963523e-06, + "loss": 0.3817, + "step": 4910 + }, + { + "epoch": 0.14, + "grad_norm": 1.6182776888020627, + "learning_rate": 9.672192255097155e-06, + "loss": 0.4629, + "step": 4911 + }, + { + "epoch": 0.14, + "grad_norm": 1.7933707384813362, + "learning_rate": 9.672024959998639e-06, + "loss": 0.4313, + "step": 4912 + }, + { + "epoch": 0.14, + "grad_norm": 1.5754002309240533, + "learning_rate": 9.671857623669451e-06, + "loss": 0.4689, + "step": 4913 + }, + { + "epoch": 0.14, + "grad_norm": 1.6058941982554713, + "learning_rate": 9.671690246111068e-06, + "loss": 0.4427, + "step": 4914 + }, + { + "epoch": 0.14, + "grad_norm": 1.5677963283043428, + "learning_rate": 9.671522827324968e-06, + "loss": 0.3757, + "step": 4915 + }, + { + "epoch": 0.14, + "grad_norm": 1.5671120185906189, + "learning_rate": 9.671355367312624e-06, + "loss": 0.3825, + "step": 4916 + }, + { + "epoch": 0.14, + "grad_norm": 1.5699577782814114, + "learning_rate": 9.67118786607552e-06, + "loss": 0.4071, + "step": 4917 + }, + { + "epoch": 0.14, + "grad_norm": 1.5516450314247832, + "learning_rate": 9.67102032361513e-06, + "loss": 0.4247, + "step": 4918 + }, + { + "epoch": 0.14, + "grad_norm": 1.5329175428995059, + "learning_rate": 9.670852739932934e-06, + "loss": 0.3823, + "step": 4919 + }, + { + "epoch": 0.14, + "grad_norm": 1.5525692932612478, + "learning_rate": 9.670685115030411e-06, + "loss": 0.4253, + "step": 4920 + }, + { + "epoch": 0.14, + "grad_norm": 1.541570344617491, + "learning_rate": 9.67051744890904e-06, + "loss": 0.3844, + "step": 4921 + }, + { + "epoch": 0.14, + "grad_norm": 1.6789527231733796, + "learning_rate": 9.670349741570302e-06, + "loss": 0.3693, + "step": 4922 + }, + { + "epoch": 0.14, + "grad_norm": 1.7744777433112409, + "learning_rate": 9.670181993015673e-06, + "loss": 0.4321, + "step": 4923 + }, + { + "epoch": 0.14, + "grad_norm": 1.509422132879482, + "learning_rate": 9.670014203246639e-06, + "loss": 0.3563, + "step": 4924 + }, + { + "epoch": 0.14, + "grad_norm": 1.4619473755689123, + "learning_rate": 9.669846372264674e-06, + "loss": 0.4394, + "step": 4925 + }, + { + "epoch": 0.14, + "grad_norm": 1.6494857832130776, + "learning_rate": 9.669678500071267e-06, + "loss": 0.4075, + "step": 4926 + }, + { + "epoch": 0.14, + "grad_norm": 1.5292802870736302, + "learning_rate": 9.669510586667894e-06, + "loss": 0.3928, + "step": 4927 + }, + { + "epoch": 0.14, + "grad_norm": 1.0923427873275713, + "learning_rate": 9.669342632056038e-06, + "loss": 0.5694, + "step": 4928 + }, + { + "epoch": 0.14, + "grad_norm": 2.0654293635540926, + "learning_rate": 9.669174636237182e-06, + "loss": 0.4046, + "step": 4929 + }, + { + "epoch": 0.14, + "grad_norm": 1.9691956599794502, + "learning_rate": 9.669006599212806e-06, + "loss": 0.3791, + "step": 4930 + }, + { + "epoch": 0.14, + "grad_norm": 5.204088996197555, + "learning_rate": 9.668838520984396e-06, + "loss": 0.4302, + "step": 4931 + }, + { + "epoch": 0.14, + "grad_norm": 1.633512157530646, + "learning_rate": 9.668670401553435e-06, + "loss": 0.4032, + "step": 4932 + }, + { + "epoch": 0.14, + "grad_norm": 2.0715352123646347, + "learning_rate": 9.668502240921406e-06, + "loss": 0.4213, + "step": 4933 + }, + { + "epoch": 0.14, + "grad_norm": 1.4795222510477986, + "learning_rate": 9.668334039089793e-06, + "loss": 0.3928, + "step": 4934 + }, + { + "epoch": 0.14, + "grad_norm": 1.421096029652864, + "learning_rate": 9.66816579606008e-06, + "loss": 0.3792, + "step": 4935 + }, + { + "epoch": 0.14, + "grad_norm": 1.611517613360111, + "learning_rate": 9.667997511833751e-06, + "loss": 0.4131, + "step": 4936 + }, + { + "epoch": 0.14, + "grad_norm": 1.5587214925497697, + "learning_rate": 9.667829186412292e-06, + "loss": 0.4238, + "step": 4937 + }, + { + "epoch": 0.14, + "grad_norm": 1.5457425603789237, + "learning_rate": 9.667660819797192e-06, + "loss": 0.4128, + "step": 4938 + }, + { + "epoch": 0.14, + "grad_norm": 1.474567249247365, + "learning_rate": 9.667492411989929e-06, + "loss": 0.3932, + "step": 4939 + }, + { + "epoch": 0.14, + "grad_norm": 1.4410906299242507, + "learning_rate": 9.667323962991998e-06, + "loss": 0.3947, + "step": 4940 + }, + { + "epoch": 0.14, + "grad_norm": 1.7546930842818032, + "learning_rate": 9.667155472804878e-06, + "loss": 0.3896, + "step": 4941 + }, + { + "epoch": 0.14, + "grad_norm": 2.6499142214955334, + "learning_rate": 9.66698694143006e-06, + "loss": 0.402, + "step": 4942 + }, + { + "epoch": 0.14, + "grad_norm": 1.659605619865466, + "learning_rate": 9.666818368869031e-06, + "loss": 0.4135, + "step": 4943 + }, + { + "epoch": 0.14, + "grad_norm": 1.5921884656212055, + "learning_rate": 9.666649755123278e-06, + "loss": 0.4469, + "step": 4944 + }, + { + "epoch": 0.14, + "grad_norm": 1.4560228375993745, + "learning_rate": 9.66648110019429e-06, + "loss": 0.3816, + "step": 4945 + }, + { + "epoch": 0.14, + "grad_norm": 1.1428389979918692, + "learning_rate": 9.666312404083553e-06, + "loss": 0.5923, + "step": 4946 + }, + { + "epoch": 0.14, + "grad_norm": 1.8684153026226746, + "learning_rate": 9.666143666792557e-06, + "loss": 0.3935, + "step": 4947 + }, + { + "epoch": 0.14, + "grad_norm": 1.6493873152065808, + "learning_rate": 9.665974888322794e-06, + "loss": 0.408, + "step": 4948 + }, + { + "epoch": 0.14, + "grad_norm": 1.4705727226857983, + "learning_rate": 9.665806068675749e-06, + "loss": 0.4164, + "step": 4949 + }, + { + "epoch": 0.14, + "grad_norm": 1.6873137629839305, + "learning_rate": 9.665637207852914e-06, + "loss": 0.4327, + "step": 4950 + }, + { + "epoch": 0.14, + "grad_norm": 2.549895728827968, + "learning_rate": 9.665468305855779e-06, + "loss": 0.4466, + "step": 4951 + }, + { + "epoch": 0.14, + "grad_norm": 1.686239799409687, + "learning_rate": 9.665299362685833e-06, + "loss": 0.4528, + "step": 4952 + }, + { + "epoch": 0.14, + "grad_norm": 2.486293858403903, + "learning_rate": 9.66513037834457e-06, + "loss": 0.3898, + "step": 4953 + }, + { + "epoch": 0.14, + "grad_norm": 1.5899689771348653, + "learning_rate": 9.664961352833478e-06, + "loss": 0.389, + "step": 4954 + }, + { + "epoch": 0.14, + "grad_norm": 1.483111141018463, + "learning_rate": 9.664792286154053e-06, + "loss": 0.3601, + "step": 4955 + }, + { + "epoch": 0.14, + "grad_norm": 1.6079270507481078, + "learning_rate": 9.664623178307781e-06, + "loss": 0.4, + "step": 4956 + }, + { + "epoch": 0.14, + "grad_norm": 1.4449776134588346, + "learning_rate": 9.66445402929616e-06, + "loss": 0.3973, + "step": 4957 + }, + { + "epoch": 0.14, + "grad_norm": 1.4653899639013273, + "learning_rate": 9.66428483912068e-06, + "loss": 0.3982, + "step": 4958 + }, + { + "epoch": 0.14, + "grad_norm": 1.5212626832702771, + "learning_rate": 9.664115607782835e-06, + "loss": 0.3835, + "step": 4959 + }, + { + "epoch": 0.14, + "grad_norm": 1.0759147530017736, + "learning_rate": 9.663946335284116e-06, + "loss": 0.6178, + "step": 4960 + }, + { + "epoch": 0.14, + "grad_norm": 1.5780714637506086, + "learning_rate": 9.663777021626023e-06, + "loss": 0.4212, + "step": 4961 + }, + { + "epoch": 0.14, + "grad_norm": 1.7285534995496319, + "learning_rate": 9.663607666810043e-06, + "loss": 0.395, + "step": 4962 + }, + { + "epoch": 0.14, + "grad_norm": 1.5918816305617132, + "learning_rate": 9.663438270837672e-06, + "loss": 0.4367, + "step": 4963 + }, + { + "epoch": 0.14, + "grad_norm": 1.7751335893385238, + "learning_rate": 9.663268833710408e-06, + "loss": 0.403, + "step": 4964 + }, + { + "epoch": 0.14, + "grad_norm": 1.6137242409672927, + "learning_rate": 9.663099355429746e-06, + "loss": 0.3998, + "step": 4965 + }, + { + "epoch": 0.14, + "grad_norm": 1.485882620754746, + "learning_rate": 9.66292983599718e-06, + "loss": 0.4258, + "step": 4966 + }, + { + "epoch": 0.14, + "grad_norm": 1.7043386017600792, + "learning_rate": 9.662760275414207e-06, + "loss": 0.4234, + "step": 4967 + }, + { + "epoch": 0.14, + "grad_norm": 2.44883959880772, + "learning_rate": 9.662590673682322e-06, + "loss": 0.3977, + "step": 4968 + }, + { + "epoch": 0.14, + "grad_norm": 1.9286512680673946, + "learning_rate": 9.662421030803022e-06, + "loss": 0.429, + "step": 4969 + }, + { + "epoch": 0.14, + "grad_norm": 1.8725375548212744, + "learning_rate": 9.662251346777806e-06, + "loss": 0.37, + "step": 4970 + }, + { + "epoch": 0.14, + "grad_norm": 1.5330610586249798, + "learning_rate": 9.66208162160817e-06, + "loss": 0.3763, + "step": 4971 + }, + { + "epoch": 0.14, + "grad_norm": 1.7704717637719702, + "learning_rate": 9.661911855295611e-06, + "loss": 0.3843, + "step": 4972 + }, + { + "epoch": 0.14, + "grad_norm": 1.6955800360835842, + "learning_rate": 9.66174204784163e-06, + "loss": 0.4214, + "step": 4973 + }, + { + "epoch": 0.14, + "grad_norm": 1.6732088117264847, + "learning_rate": 9.66157219924772e-06, + "loss": 0.4088, + "step": 4974 + }, + { + "epoch": 0.14, + "grad_norm": 2.0421715744518107, + "learning_rate": 9.661402309515386e-06, + "loss": 0.4009, + "step": 4975 + }, + { + "epoch": 0.14, + "grad_norm": 3.037469173344513, + "learning_rate": 9.661232378646125e-06, + "loss": 0.3743, + "step": 4976 + }, + { + "epoch": 0.14, + "grad_norm": 1.5428003515130897, + "learning_rate": 9.661062406641437e-06, + "loss": 0.4155, + "step": 4977 + }, + { + "epoch": 0.14, + "grad_norm": 2.3993202304813526, + "learning_rate": 9.66089239350282e-06, + "loss": 0.4031, + "step": 4978 + }, + { + "epoch": 0.14, + "grad_norm": 1.6782709711504002, + "learning_rate": 9.660722339231778e-06, + "loss": 0.4135, + "step": 4979 + }, + { + "epoch": 0.14, + "grad_norm": 1.73707381274698, + "learning_rate": 9.660552243829809e-06, + "loss": 0.43, + "step": 4980 + }, + { + "epoch": 0.14, + "grad_norm": 1.0332632312120806, + "learning_rate": 9.660382107298414e-06, + "loss": 0.609, + "step": 4981 + }, + { + "epoch": 0.14, + "grad_norm": 1.7905094912987976, + "learning_rate": 9.660211929639096e-06, + "loss": 0.4144, + "step": 4982 + }, + { + "epoch": 0.14, + "grad_norm": 1.652908768264028, + "learning_rate": 9.660041710853356e-06, + "loss": 0.3894, + "step": 4983 + }, + { + "epoch": 0.14, + "grad_norm": 2.0732214794881503, + "learning_rate": 9.659871450942695e-06, + "loss": 0.4079, + "step": 4984 + }, + { + "epoch": 0.14, + "grad_norm": 2.8334064391623586, + "learning_rate": 9.659701149908617e-06, + "loss": 0.4192, + "step": 4985 + }, + { + "epoch": 0.14, + "grad_norm": 1.9313852746034526, + "learning_rate": 9.659530807752626e-06, + "loss": 0.4088, + "step": 4986 + }, + { + "epoch": 0.14, + "grad_norm": 7.036734369217626, + "learning_rate": 9.659360424476223e-06, + "loss": 0.3967, + "step": 4987 + }, + { + "epoch": 0.14, + "grad_norm": 2.0295103745438934, + "learning_rate": 9.659190000080912e-06, + "loss": 0.4548, + "step": 4988 + }, + { + "epoch": 0.14, + "grad_norm": 1.7899692551105966, + "learning_rate": 9.659019534568199e-06, + "loss": 0.4025, + "step": 4989 + }, + { + "epoch": 0.14, + "grad_norm": 1.5733668012436564, + "learning_rate": 9.658849027939586e-06, + "loss": 0.4123, + "step": 4990 + }, + { + "epoch": 0.14, + "grad_norm": 1.7323572904740085, + "learning_rate": 9.658678480196579e-06, + "loss": 0.3955, + "step": 4991 + }, + { + "epoch": 0.14, + "grad_norm": 1.4466408516016531, + "learning_rate": 9.658507891340683e-06, + "loss": 0.3934, + "step": 4992 + }, + { + "epoch": 0.14, + "grad_norm": 2.1815229601365465, + "learning_rate": 9.658337261373402e-06, + "loss": 0.3875, + "step": 4993 + }, + { + "epoch": 0.14, + "grad_norm": 2.0892805867601876, + "learning_rate": 9.658166590296243e-06, + "loss": 0.4299, + "step": 4994 + }, + { + "epoch": 0.14, + "grad_norm": 1.486419380065112, + "learning_rate": 9.657995878110712e-06, + "loss": 0.3718, + "step": 4995 + }, + { + "epoch": 0.14, + "grad_norm": 1.6253695792989937, + "learning_rate": 9.657825124818317e-06, + "loss": 0.3691, + "step": 4996 + }, + { + "epoch": 0.14, + "grad_norm": 1.7407465517433305, + "learning_rate": 9.657654330420565e-06, + "loss": 0.409, + "step": 4997 + }, + { + "epoch": 0.14, + "grad_norm": 1.5083136673773971, + "learning_rate": 9.65748349491896e-06, + "loss": 0.3885, + "step": 4998 + }, + { + "epoch": 0.14, + "grad_norm": 2.07353541729687, + "learning_rate": 9.657312618315011e-06, + "loss": 0.3742, + "step": 4999 + }, + { + "epoch": 0.15, + "grad_norm": 1.5214879481219494, + "learning_rate": 9.657141700610226e-06, + "loss": 0.3741, + "step": 5000 + }, + { + "epoch": 0.15, + "grad_norm": 1.5618204306862173, + "learning_rate": 9.656970741806116e-06, + "loss": 0.4037, + "step": 5001 + }, + { + "epoch": 0.15, + "grad_norm": 1.7268171550929072, + "learning_rate": 9.656799741904186e-06, + "loss": 0.3964, + "step": 5002 + }, + { + "epoch": 0.15, + "grad_norm": 1.660434497201134, + "learning_rate": 9.656628700905947e-06, + "loss": 0.3904, + "step": 5003 + }, + { + "epoch": 0.15, + "grad_norm": 1.7165343339565462, + "learning_rate": 9.656457618812909e-06, + "loss": 0.4278, + "step": 5004 + }, + { + "epoch": 0.15, + "grad_norm": 1.7743570647040547, + "learning_rate": 9.656286495626578e-06, + "loss": 0.4295, + "step": 5005 + }, + { + "epoch": 0.15, + "grad_norm": 1.7413338900023916, + "learning_rate": 9.65611533134847e-06, + "loss": 0.4147, + "step": 5006 + }, + { + "epoch": 0.15, + "grad_norm": 1.6774522740818598, + "learning_rate": 9.655944125980092e-06, + "loss": 0.423, + "step": 5007 + }, + { + "epoch": 0.15, + "grad_norm": 1.9527157831550197, + "learning_rate": 9.655772879522953e-06, + "loss": 0.4244, + "step": 5008 + }, + { + "epoch": 0.15, + "grad_norm": 1.9840030877379762, + "learning_rate": 9.65560159197857e-06, + "loss": 0.4153, + "step": 5009 + }, + { + "epoch": 0.15, + "grad_norm": 1.541070941274277, + "learning_rate": 9.655430263348448e-06, + "loss": 0.3994, + "step": 5010 + }, + { + "epoch": 0.15, + "grad_norm": 1.6248338586591307, + "learning_rate": 9.655258893634105e-06, + "loss": 0.4304, + "step": 5011 + }, + { + "epoch": 0.15, + "grad_norm": 1.5866823626003717, + "learning_rate": 9.655087482837052e-06, + "loss": 0.4138, + "step": 5012 + }, + { + "epoch": 0.15, + "grad_norm": 1.7923940296178411, + "learning_rate": 9.654916030958797e-06, + "loss": 0.4145, + "step": 5013 + }, + { + "epoch": 0.15, + "grad_norm": 1.7837872580563114, + "learning_rate": 9.654744538000857e-06, + "loss": 0.3856, + "step": 5014 + }, + { + "epoch": 0.15, + "grad_norm": 1.9277295696487022, + "learning_rate": 9.654573003964745e-06, + "loss": 0.3951, + "step": 5015 + }, + { + "epoch": 0.15, + "grad_norm": 1.9310953060159017, + "learning_rate": 9.654401428851976e-06, + "loss": 0.4381, + "step": 5016 + }, + { + "epoch": 0.15, + "grad_norm": 1.8738417370877651, + "learning_rate": 9.654229812664061e-06, + "loss": 0.3855, + "step": 5017 + }, + { + "epoch": 0.15, + "grad_norm": 1.85704477820048, + "learning_rate": 9.654058155402517e-06, + "loss": 0.4016, + "step": 5018 + }, + { + "epoch": 0.15, + "grad_norm": 1.6068116632270273, + "learning_rate": 9.653886457068857e-06, + "loss": 0.3973, + "step": 5019 + }, + { + "epoch": 0.15, + "grad_norm": 1.4596304255112615, + "learning_rate": 9.6537147176646e-06, + "loss": 0.3987, + "step": 5020 + }, + { + "epoch": 0.15, + "grad_norm": 1.6075593037117297, + "learning_rate": 9.653542937191258e-06, + "loss": 0.4143, + "step": 5021 + }, + { + "epoch": 0.15, + "grad_norm": 1.6487421052719902, + "learning_rate": 9.653371115650347e-06, + "loss": 0.404, + "step": 5022 + }, + { + "epoch": 0.15, + "grad_norm": 1.5921931420312392, + "learning_rate": 9.653199253043383e-06, + "loss": 0.396, + "step": 5023 + }, + { + "epoch": 0.15, + "grad_norm": 2.213458603397581, + "learning_rate": 9.653027349371886e-06, + "loss": 0.3834, + "step": 5024 + }, + { + "epoch": 0.15, + "grad_norm": 1.980345919038325, + "learning_rate": 9.652855404637372e-06, + "loss": 0.3862, + "step": 5025 + }, + { + "epoch": 0.15, + "grad_norm": 1.6403662757630855, + "learning_rate": 9.652683418841355e-06, + "loss": 0.4163, + "step": 5026 + }, + { + "epoch": 0.15, + "grad_norm": 1.942740541842368, + "learning_rate": 9.652511391985357e-06, + "loss": 0.4032, + "step": 5027 + }, + { + "epoch": 0.15, + "grad_norm": 1.5033139451282773, + "learning_rate": 9.652339324070895e-06, + "loss": 0.422, + "step": 5028 + }, + { + "epoch": 0.15, + "grad_norm": 1.5462958411148346, + "learning_rate": 9.652167215099484e-06, + "loss": 0.4045, + "step": 5029 + }, + { + "epoch": 0.15, + "grad_norm": 1.564823797301706, + "learning_rate": 9.651995065072649e-06, + "loss": 0.3767, + "step": 5030 + }, + { + "epoch": 0.15, + "grad_norm": 1.826742970344862, + "learning_rate": 9.651822873991903e-06, + "loss": 0.4168, + "step": 5031 + }, + { + "epoch": 0.15, + "grad_norm": 1.5327913786480765, + "learning_rate": 9.65165064185877e-06, + "loss": 0.4058, + "step": 5032 + }, + { + "epoch": 0.15, + "grad_norm": 1.7272656791979428, + "learning_rate": 9.651478368674769e-06, + "loss": 0.3922, + "step": 5033 + }, + { + "epoch": 0.15, + "grad_norm": 0.9629864157164155, + "learning_rate": 9.651306054441418e-06, + "loss": 0.5577, + "step": 5034 + }, + { + "epoch": 0.15, + "grad_norm": 1.6393786765905605, + "learning_rate": 9.65113369916024e-06, + "loss": 0.4046, + "step": 5035 + }, + { + "epoch": 0.15, + "grad_norm": 1.8133589068849987, + "learning_rate": 9.650961302832757e-06, + "loss": 0.383, + "step": 5036 + }, + { + "epoch": 0.15, + "grad_norm": 2.0891308153884003, + "learning_rate": 9.650788865460487e-06, + "loss": 0.3965, + "step": 5037 + }, + { + "epoch": 0.15, + "grad_norm": 1.51135482554802, + "learning_rate": 9.650616387044955e-06, + "loss": 0.4081, + "step": 5038 + }, + { + "epoch": 0.15, + "grad_norm": 0.9850113109812497, + "learning_rate": 9.650443867587682e-06, + "loss": 0.5981, + "step": 5039 + }, + { + "epoch": 0.15, + "grad_norm": 1.9092936038023953, + "learning_rate": 9.65027130709019e-06, + "loss": 0.3831, + "step": 5040 + }, + { + "epoch": 0.15, + "grad_norm": 1.7582279668998992, + "learning_rate": 9.650098705554e-06, + "loss": 0.4013, + "step": 5041 + }, + { + "epoch": 0.15, + "grad_norm": 2.436042871179641, + "learning_rate": 9.64992606298064e-06, + "loss": 0.3919, + "step": 5042 + }, + { + "epoch": 0.15, + "grad_norm": 1.5177045364080666, + "learning_rate": 9.64975337937163e-06, + "loss": 0.3971, + "step": 5043 + }, + { + "epoch": 0.15, + "grad_norm": 1.5099562274843519, + "learning_rate": 9.649580654728496e-06, + "loss": 0.3728, + "step": 5044 + }, + { + "epoch": 0.15, + "grad_norm": 1.9983169272103742, + "learning_rate": 9.64940788905276e-06, + "loss": 0.3748, + "step": 5045 + }, + { + "epoch": 0.15, + "grad_norm": 1.515192384506624, + "learning_rate": 9.649235082345948e-06, + "loss": 0.3939, + "step": 5046 + }, + { + "epoch": 0.15, + "grad_norm": 1.969266768940617, + "learning_rate": 9.649062234609585e-06, + "loss": 0.3624, + "step": 5047 + }, + { + "epoch": 0.15, + "grad_norm": 1.7214350169250239, + "learning_rate": 9.648889345845196e-06, + "loss": 0.4138, + "step": 5048 + }, + { + "epoch": 0.15, + "grad_norm": 1.479006714014402, + "learning_rate": 9.648716416054305e-06, + "loss": 0.4152, + "step": 5049 + }, + { + "epoch": 0.15, + "grad_norm": 2.0281781196862427, + "learning_rate": 9.648543445238442e-06, + "loss": 0.4169, + "step": 5050 + }, + { + "epoch": 0.15, + "grad_norm": 1.7064014487357106, + "learning_rate": 9.648370433399133e-06, + "loss": 0.3875, + "step": 5051 + }, + { + "epoch": 0.15, + "grad_norm": 1.8295016727813804, + "learning_rate": 9.6481973805379e-06, + "loss": 0.3956, + "step": 5052 + }, + { + "epoch": 0.15, + "grad_norm": 1.623259300543148, + "learning_rate": 9.648024286656277e-06, + "loss": 0.4055, + "step": 5053 + }, + { + "epoch": 0.15, + "grad_norm": 1.9537721595809459, + "learning_rate": 9.647851151755785e-06, + "loss": 0.4191, + "step": 5054 + }, + { + "epoch": 0.15, + "grad_norm": 1.9489636804946533, + "learning_rate": 9.647677975837957e-06, + "loss": 0.4133, + "step": 5055 + }, + { + "epoch": 0.15, + "grad_norm": 2.2935000681609568, + "learning_rate": 9.647504758904319e-06, + "loss": 0.3847, + "step": 5056 + }, + { + "epoch": 0.15, + "grad_norm": 1.8029860325867078, + "learning_rate": 9.6473315009564e-06, + "loss": 0.3891, + "step": 5057 + }, + { + "epoch": 0.15, + "grad_norm": 1.5395235060886896, + "learning_rate": 9.647158201995728e-06, + "loss": 0.3939, + "step": 5058 + }, + { + "epoch": 0.15, + "grad_norm": 1.5067841839722946, + "learning_rate": 9.646984862023833e-06, + "loss": 0.383, + "step": 5059 + }, + { + "epoch": 0.15, + "grad_norm": 1.5621867452809146, + "learning_rate": 9.646811481042246e-06, + "loss": 0.4204, + "step": 5060 + }, + { + "epoch": 0.15, + "grad_norm": 1.5615858404631588, + "learning_rate": 9.646638059052498e-06, + "loss": 0.3857, + "step": 5061 + }, + { + "epoch": 0.15, + "grad_norm": 1.6272712235049578, + "learning_rate": 9.646464596056116e-06, + "loss": 0.3905, + "step": 5062 + }, + { + "epoch": 0.15, + "grad_norm": 1.7825633532583007, + "learning_rate": 9.64629109205463e-06, + "loss": 0.3898, + "step": 5063 + }, + { + "epoch": 0.15, + "grad_norm": 2.9036775141994946, + "learning_rate": 9.646117547049575e-06, + "loss": 0.408, + "step": 5064 + }, + { + "epoch": 0.15, + "grad_norm": 1.4895984929381194, + "learning_rate": 9.645943961042484e-06, + "loss": 0.3758, + "step": 5065 + }, + { + "epoch": 0.15, + "grad_norm": 1.6208767252553056, + "learning_rate": 9.645770334034883e-06, + "loss": 0.4013, + "step": 5066 + }, + { + "epoch": 0.15, + "grad_norm": 1.6849818307561912, + "learning_rate": 9.645596666028309e-06, + "loss": 0.4125, + "step": 5067 + }, + { + "epoch": 0.15, + "grad_norm": 1.5699185226020187, + "learning_rate": 9.645422957024291e-06, + "loss": 0.3739, + "step": 5068 + }, + { + "epoch": 0.15, + "grad_norm": 1.528728252818462, + "learning_rate": 9.645249207024366e-06, + "loss": 0.3761, + "step": 5069 + }, + { + "epoch": 0.15, + "grad_norm": 1.796330920459863, + "learning_rate": 9.645075416030063e-06, + "loss": 0.3696, + "step": 5070 + }, + { + "epoch": 0.15, + "grad_norm": 1.680133564193219, + "learning_rate": 9.64490158404292e-06, + "loss": 0.3958, + "step": 5071 + }, + { + "epoch": 0.15, + "grad_norm": 1.6647674208344014, + "learning_rate": 9.644727711064467e-06, + "loss": 0.4082, + "step": 5072 + }, + { + "epoch": 0.15, + "grad_norm": 1.8661974324772326, + "learning_rate": 9.64455379709624e-06, + "loss": 0.3865, + "step": 5073 + }, + { + "epoch": 0.15, + "grad_norm": 1.8065026313532797, + "learning_rate": 9.644379842139776e-06, + "loss": 0.4103, + "step": 5074 + }, + { + "epoch": 0.15, + "grad_norm": 1.525580608529059, + "learning_rate": 9.64420584619661e-06, + "loss": 0.3761, + "step": 5075 + }, + { + "epoch": 0.15, + "grad_norm": 2.284194942037514, + "learning_rate": 9.644031809268272e-06, + "loss": 0.4021, + "step": 5076 + }, + { + "epoch": 0.15, + "grad_norm": 1.752019811958089, + "learning_rate": 9.643857731356305e-06, + "loss": 0.4134, + "step": 5077 + }, + { + "epoch": 0.15, + "grad_norm": 1.783124246592801, + "learning_rate": 9.64368361246224e-06, + "loss": 0.4212, + "step": 5078 + }, + { + "epoch": 0.15, + "grad_norm": 1.6327045148597774, + "learning_rate": 9.643509452587616e-06, + "loss": 0.4234, + "step": 5079 + }, + { + "epoch": 0.15, + "grad_norm": 1.8176468432487773, + "learning_rate": 9.643335251733969e-06, + "loss": 0.3834, + "step": 5080 + }, + { + "epoch": 0.15, + "grad_norm": 1.8614315454474557, + "learning_rate": 9.64316100990284e-06, + "loss": 0.409, + "step": 5081 + }, + { + "epoch": 0.15, + "grad_norm": 1.6994090156339856, + "learning_rate": 9.64298672709576e-06, + "loss": 0.4019, + "step": 5082 + }, + { + "epoch": 0.15, + "grad_norm": 1.564053220180158, + "learning_rate": 9.642812403314272e-06, + "loss": 0.3887, + "step": 5083 + }, + { + "epoch": 0.15, + "grad_norm": 1.572577767336064, + "learning_rate": 9.642638038559913e-06, + "loss": 0.42, + "step": 5084 + }, + { + "epoch": 0.15, + "grad_norm": 1.7238874871315306, + "learning_rate": 9.642463632834222e-06, + "loss": 0.4003, + "step": 5085 + }, + { + "epoch": 0.15, + "grad_norm": 1.56347224140687, + "learning_rate": 9.642289186138738e-06, + "loss": 0.3941, + "step": 5086 + }, + { + "epoch": 0.15, + "grad_norm": 1.527340802166062, + "learning_rate": 9.642114698475e-06, + "loss": 0.3818, + "step": 5087 + }, + { + "epoch": 0.15, + "grad_norm": 1.6167781982507403, + "learning_rate": 9.641940169844549e-06, + "loss": 0.4239, + "step": 5088 + }, + { + "epoch": 0.15, + "grad_norm": 1.2633208055801064, + "learning_rate": 9.641765600248923e-06, + "loss": 0.5938, + "step": 5089 + }, + { + "epoch": 0.15, + "grad_norm": 1.7448368556367417, + "learning_rate": 9.641590989689664e-06, + "loss": 0.4059, + "step": 5090 + }, + { + "epoch": 0.15, + "grad_norm": 1.6445522236612593, + "learning_rate": 9.641416338168315e-06, + "loss": 0.4119, + "step": 5091 + }, + { + "epoch": 0.15, + "grad_norm": 2.179409101104154, + "learning_rate": 9.641241645686415e-06, + "loss": 0.3846, + "step": 5092 + }, + { + "epoch": 0.15, + "grad_norm": 1.642850425569389, + "learning_rate": 9.641066912245504e-06, + "loss": 0.3853, + "step": 5093 + }, + { + "epoch": 0.15, + "grad_norm": 3.76742067192924, + "learning_rate": 9.640892137847127e-06, + "loss": 0.3934, + "step": 5094 + }, + { + "epoch": 0.15, + "grad_norm": 1.558473855494101, + "learning_rate": 9.640717322492828e-06, + "loss": 0.4085, + "step": 5095 + }, + { + "epoch": 0.15, + "grad_norm": 1.5790574759306906, + "learning_rate": 9.640542466184144e-06, + "loss": 0.3883, + "step": 5096 + }, + { + "epoch": 0.15, + "grad_norm": 1.8873021564642511, + "learning_rate": 9.640367568922621e-06, + "loss": 0.4002, + "step": 5097 + }, + { + "epoch": 0.15, + "grad_norm": 1.9385614022687947, + "learning_rate": 9.640192630709805e-06, + "loss": 0.402, + "step": 5098 + }, + { + "epoch": 0.15, + "grad_norm": 1.6293372670973552, + "learning_rate": 9.640017651547236e-06, + "loss": 0.3839, + "step": 5099 + }, + { + "epoch": 0.15, + "grad_norm": 1.542709732291663, + "learning_rate": 9.63984263143646e-06, + "loss": 0.4001, + "step": 5100 + }, + { + "epoch": 0.15, + "grad_norm": 1.747380640448963, + "learning_rate": 9.63966757037902e-06, + "loss": 0.4183, + "step": 5101 + }, + { + "epoch": 0.15, + "grad_norm": 1.790826035592836, + "learning_rate": 9.639492468376466e-06, + "loss": 0.3832, + "step": 5102 + }, + { + "epoch": 0.15, + "grad_norm": 1.6224173631039978, + "learning_rate": 9.639317325430337e-06, + "loss": 0.3575, + "step": 5103 + }, + { + "epoch": 0.15, + "grad_norm": 1.5813463647319084, + "learning_rate": 9.63914214154218e-06, + "loss": 0.3768, + "step": 5104 + }, + { + "epoch": 0.15, + "grad_norm": 1.6474018789352527, + "learning_rate": 9.638966916713544e-06, + "loss": 0.4295, + "step": 5105 + }, + { + "epoch": 0.15, + "grad_norm": 1.5153572465005969, + "learning_rate": 9.638791650945974e-06, + "loss": 0.3751, + "step": 5106 + }, + { + "epoch": 0.15, + "grad_norm": 1.6195869549987616, + "learning_rate": 9.638616344241013e-06, + "loss": 0.4005, + "step": 5107 + }, + { + "epoch": 0.15, + "grad_norm": 1.6376229694650892, + "learning_rate": 9.638440996600216e-06, + "loss": 0.4288, + "step": 5108 + }, + { + "epoch": 0.15, + "grad_norm": 1.8779740375057168, + "learning_rate": 9.638265608025123e-06, + "loss": 0.3666, + "step": 5109 + }, + { + "epoch": 0.15, + "grad_norm": 1.4966063261958487, + "learning_rate": 9.638090178517284e-06, + "loss": 0.3838, + "step": 5110 + }, + { + "epoch": 0.15, + "grad_norm": 1.8102397591007524, + "learning_rate": 9.637914708078249e-06, + "loss": 0.4657, + "step": 5111 + }, + { + "epoch": 0.15, + "grad_norm": 2.042635634598862, + "learning_rate": 9.637739196709564e-06, + "loss": 0.419, + "step": 5112 + }, + { + "epoch": 0.15, + "grad_norm": 1.7276400408283263, + "learning_rate": 9.637563644412781e-06, + "loss": 0.3908, + "step": 5113 + }, + { + "epoch": 0.15, + "grad_norm": 2.656592671761413, + "learning_rate": 9.637388051189446e-06, + "loss": 0.3783, + "step": 5114 + }, + { + "epoch": 0.15, + "grad_norm": 1.6393789623396944, + "learning_rate": 9.637212417041111e-06, + "loss": 0.3898, + "step": 5115 + }, + { + "epoch": 0.15, + "grad_norm": 1.6432503389333202, + "learning_rate": 9.637036741969323e-06, + "loss": 0.371, + "step": 5116 + }, + { + "epoch": 0.15, + "grad_norm": 1.6595166084989064, + "learning_rate": 9.636861025975637e-06, + "loss": 0.4053, + "step": 5117 + }, + { + "epoch": 0.15, + "grad_norm": 1.518387390490746, + "learning_rate": 9.636685269061599e-06, + "loss": 0.4258, + "step": 5118 + }, + { + "epoch": 0.15, + "grad_norm": 1.8239168063645153, + "learning_rate": 9.636509471228763e-06, + "loss": 0.4034, + "step": 5119 + }, + { + "epoch": 0.15, + "grad_norm": 1.6089667883869383, + "learning_rate": 9.636333632478678e-06, + "loss": 0.4097, + "step": 5120 + }, + { + "epoch": 0.15, + "grad_norm": 1.5605196009280908, + "learning_rate": 9.636157752812899e-06, + "loss": 0.4218, + "step": 5121 + }, + { + "epoch": 0.15, + "grad_norm": 1.6440284050205627, + "learning_rate": 9.635981832232976e-06, + "loss": 0.4072, + "step": 5122 + }, + { + "epoch": 0.15, + "grad_norm": 2.002649440438631, + "learning_rate": 9.635805870740461e-06, + "loss": 0.363, + "step": 5123 + }, + { + "epoch": 0.15, + "grad_norm": 1.5635105502374291, + "learning_rate": 9.635629868336909e-06, + "loss": 0.3775, + "step": 5124 + }, + { + "epoch": 0.15, + "grad_norm": 1.4485404082933877, + "learning_rate": 9.635453825023871e-06, + "loss": 0.3761, + "step": 5125 + }, + { + "epoch": 0.15, + "grad_norm": 1.4813392500712481, + "learning_rate": 9.635277740802903e-06, + "loss": 0.4001, + "step": 5126 + }, + { + "epoch": 0.15, + "grad_norm": 1.8961320978501262, + "learning_rate": 9.635101615675557e-06, + "loss": 0.3935, + "step": 5127 + }, + { + "epoch": 0.15, + "grad_norm": 1.6398684836868753, + "learning_rate": 9.634925449643386e-06, + "loss": 0.4246, + "step": 5128 + }, + { + "epoch": 0.15, + "grad_norm": 1.6958884469038913, + "learning_rate": 9.634749242707948e-06, + "loss": 0.3775, + "step": 5129 + }, + { + "epoch": 0.15, + "grad_norm": 1.5450268143028207, + "learning_rate": 9.634572994870797e-06, + "loss": 0.4062, + "step": 5130 + }, + { + "epoch": 0.15, + "grad_norm": 2.347874928838777, + "learning_rate": 9.634396706133487e-06, + "loss": 0.3896, + "step": 5131 + }, + { + "epoch": 0.15, + "grad_norm": 1.6990080272314063, + "learning_rate": 9.634220376497574e-06, + "loss": 0.401, + "step": 5132 + }, + { + "epoch": 0.15, + "grad_norm": 1.5739576471173249, + "learning_rate": 9.634044005964616e-06, + "loss": 0.4067, + "step": 5133 + }, + { + "epoch": 0.15, + "grad_norm": 1.6281383843866137, + "learning_rate": 9.633867594536169e-06, + "loss": 0.4346, + "step": 5134 + }, + { + "epoch": 0.15, + "grad_norm": 1.6790660838463294, + "learning_rate": 9.633691142213789e-06, + "loss": 0.3892, + "step": 5135 + }, + { + "epoch": 0.15, + "grad_norm": 1.5844694907082835, + "learning_rate": 9.633514648999031e-06, + "loss": 0.3751, + "step": 5136 + }, + { + "epoch": 0.15, + "grad_norm": 1.5933101222597557, + "learning_rate": 9.633338114893455e-06, + "loss": 0.4159, + "step": 5137 + }, + { + "epoch": 0.15, + "grad_norm": 2.1243345607968136, + "learning_rate": 9.63316153989862e-06, + "loss": 0.398, + "step": 5138 + }, + { + "epoch": 0.15, + "grad_norm": 1.4633380188638503, + "learning_rate": 9.632984924016084e-06, + "loss": 0.396, + "step": 5139 + }, + { + "epoch": 0.15, + "grad_norm": 1.9394251586495557, + "learning_rate": 9.632808267247403e-06, + "loss": 0.3959, + "step": 5140 + }, + { + "epoch": 0.15, + "grad_norm": 1.4669467414828208, + "learning_rate": 9.632631569594139e-06, + "loss": 0.3983, + "step": 5141 + }, + { + "epoch": 0.15, + "grad_norm": 1.8556363298554461, + "learning_rate": 9.632454831057849e-06, + "loss": 0.3997, + "step": 5142 + }, + { + "epoch": 0.15, + "grad_norm": 1.5190869999428818, + "learning_rate": 9.632278051640095e-06, + "loss": 0.398, + "step": 5143 + }, + { + "epoch": 0.15, + "grad_norm": 1.7402888230015723, + "learning_rate": 9.632101231342433e-06, + "loss": 0.4744, + "step": 5144 + }, + { + "epoch": 0.15, + "grad_norm": 1.7566539925624365, + "learning_rate": 9.631924370166428e-06, + "loss": 0.3803, + "step": 5145 + }, + { + "epoch": 0.15, + "grad_norm": 1.8069410347829877, + "learning_rate": 9.631747468113639e-06, + "loss": 0.4547, + "step": 5146 + }, + { + "epoch": 0.15, + "grad_norm": 1.7489146877067585, + "learning_rate": 9.631570525185627e-06, + "loss": 0.3897, + "step": 5147 + }, + { + "epoch": 0.15, + "grad_norm": 1.6494923153340235, + "learning_rate": 9.631393541383956e-06, + "loss": 0.409, + "step": 5148 + }, + { + "epoch": 0.15, + "grad_norm": 1.783202906876353, + "learning_rate": 9.631216516710182e-06, + "loss": 0.4243, + "step": 5149 + }, + { + "epoch": 0.15, + "grad_norm": 1.4135833615916669, + "learning_rate": 9.631039451165873e-06, + "loss": 0.3796, + "step": 5150 + }, + { + "epoch": 0.15, + "grad_norm": 1.6090735282521076, + "learning_rate": 9.630862344752588e-06, + "loss": 0.4312, + "step": 5151 + }, + { + "epoch": 0.15, + "grad_norm": 1.7358454167715334, + "learning_rate": 9.630685197471893e-06, + "loss": 0.4013, + "step": 5152 + }, + { + "epoch": 0.15, + "grad_norm": 1.5395316485913708, + "learning_rate": 9.630508009325349e-06, + "loss": 0.4236, + "step": 5153 + }, + { + "epoch": 0.15, + "grad_norm": 1.6657367089060298, + "learning_rate": 9.63033078031452e-06, + "loss": 0.4091, + "step": 5154 + }, + { + "epoch": 0.15, + "grad_norm": 9.326199504100645, + "learning_rate": 9.630153510440971e-06, + "loss": 0.3912, + "step": 5155 + }, + { + "epoch": 0.15, + "grad_norm": 1.5560694915983915, + "learning_rate": 9.629976199706266e-06, + "loss": 0.4045, + "step": 5156 + }, + { + "epoch": 0.15, + "grad_norm": 1.6709039185023755, + "learning_rate": 9.62979884811197e-06, + "loss": 0.3851, + "step": 5157 + }, + { + "epoch": 0.15, + "grad_norm": 1.741611350129713, + "learning_rate": 9.629621455659649e-06, + "loss": 0.3989, + "step": 5158 + }, + { + "epoch": 0.15, + "grad_norm": 1.48712759019711, + "learning_rate": 9.629444022350865e-06, + "loss": 0.4317, + "step": 5159 + }, + { + "epoch": 0.15, + "grad_norm": 1.6797943277893368, + "learning_rate": 9.629266548187185e-06, + "loss": 0.3955, + "step": 5160 + }, + { + "epoch": 0.15, + "grad_norm": 1.8433657362314304, + "learning_rate": 9.62908903317018e-06, + "loss": 0.3958, + "step": 5161 + }, + { + "epoch": 0.15, + "grad_norm": 1.516752826198928, + "learning_rate": 9.628911477301411e-06, + "loss": 0.3906, + "step": 5162 + }, + { + "epoch": 0.15, + "grad_norm": 1.6014000405013544, + "learning_rate": 9.628733880582445e-06, + "loss": 0.4108, + "step": 5163 + }, + { + "epoch": 0.15, + "grad_norm": 0.9827453203851614, + "learning_rate": 9.628556243014855e-06, + "loss": 0.5627, + "step": 5164 + }, + { + "epoch": 0.15, + "grad_norm": 1.7068548891822537, + "learning_rate": 9.628378564600203e-06, + "loss": 0.4138, + "step": 5165 + }, + { + "epoch": 0.15, + "grad_norm": 1.8081287819748486, + "learning_rate": 9.628200845340059e-06, + "loss": 0.4034, + "step": 5166 + }, + { + "epoch": 0.15, + "grad_norm": 1.6955161634640221, + "learning_rate": 9.628023085235992e-06, + "loss": 0.404, + "step": 5167 + }, + { + "epoch": 0.15, + "grad_norm": 1.569621002081221, + "learning_rate": 9.627845284289567e-06, + "loss": 0.4132, + "step": 5168 + }, + { + "epoch": 0.15, + "grad_norm": 1.5312378723291578, + "learning_rate": 9.627667442502359e-06, + "loss": 0.4003, + "step": 5169 + }, + { + "epoch": 0.15, + "grad_norm": 1.5190333445432935, + "learning_rate": 9.627489559875933e-06, + "loss": 0.3801, + "step": 5170 + }, + { + "epoch": 0.15, + "grad_norm": 1.5161739343080975, + "learning_rate": 9.627311636411861e-06, + "loss": 0.4237, + "step": 5171 + }, + { + "epoch": 0.15, + "grad_norm": 2.0152343471321226, + "learning_rate": 9.627133672111713e-06, + "loss": 0.4124, + "step": 5172 + }, + { + "epoch": 0.15, + "grad_norm": 2.645101060877031, + "learning_rate": 9.626955666977058e-06, + "loss": 0.4656, + "step": 5173 + }, + { + "epoch": 0.15, + "grad_norm": 1.6888010907669337, + "learning_rate": 9.626777621009467e-06, + "loss": 0.3813, + "step": 5174 + }, + { + "epoch": 0.15, + "grad_norm": 1.75289873599844, + "learning_rate": 9.626599534210514e-06, + "loss": 0.4053, + "step": 5175 + }, + { + "epoch": 0.15, + "grad_norm": 1.5927533988004035, + "learning_rate": 9.626421406581767e-06, + "loss": 0.4035, + "step": 5176 + }, + { + "epoch": 0.15, + "grad_norm": 1.7607112039135122, + "learning_rate": 9.626243238124802e-06, + "loss": 0.4457, + "step": 5177 + }, + { + "epoch": 0.15, + "grad_norm": 2.208339624742893, + "learning_rate": 9.626065028841187e-06, + "loss": 0.402, + "step": 5178 + }, + { + "epoch": 0.15, + "grad_norm": 1.5203402085385371, + "learning_rate": 9.625886778732499e-06, + "loss": 0.3953, + "step": 5179 + }, + { + "epoch": 0.15, + "grad_norm": 2.8528765240563225, + "learning_rate": 9.625708487800306e-06, + "loss": 0.3888, + "step": 5180 + }, + { + "epoch": 0.15, + "grad_norm": 1.441450028254547, + "learning_rate": 9.625530156046188e-06, + "loss": 0.3974, + "step": 5181 + }, + { + "epoch": 0.15, + "grad_norm": 1.5356584462736633, + "learning_rate": 9.625351783471712e-06, + "loss": 0.4202, + "step": 5182 + }, + { + "epoch": 0.15, + "grad_norm": 1.1622615648299954, + "learning_rate": 9.625173370078456e-06, + "loss": 0.678, + "step": 5183 + }, + { + "epoch": 0.15, + "grad_norm": 1.5279355676613084, + "learning_rate": 9.624994915867993e-06, + "loss": 0.3794, + "step": 5184 + }, + { + "epoch": 0.15, + "grad_norm": 2.52466730979496, + "learning_rate": 9.624816420841901e-06, + "loss": 0.4165, + "step": 5185 + }, + { + "epoch": 0.15, + "grad_norm": 1.6676646389947707, + "learning_rate": 9.62463788500175e-06, + "loss": 0.3789, + "step": 5186 + }, + { + "epoch": 0.15, + "grad_norm": 4.677922095655357, + "learning_rate": 9.62445930834912e-06, + "loss": 0.4107, + "step": 5187 + }, + { + "epoch": 0.15, + "grad_norm": 1.756598818101383, + "learning_rate": 9.624280690885585e-06, + "loss": 0.4364, + "step": 5188 + }, + { + "epoch": 0.15, + "grad_norm": 1.519304873681441, + "learning_rate": 9.624102032612722e-06, + "loss": 0.4151, + "step": 5189 + }, + { + "epoch": 0.15, + "grad_norm": 2.0257849227053093, + "learning_rate": 9.623923333532105e-06, + "loss": 0.3852, + "step": 5190 + }, + { + "epoch": 0.15, + "grad_norm": 1.8530382290908722, + "learning_rate": 9.623744593645317e-06, + "loss": 0.4172, + "step": 5191 + }, + { + "epoch": 0.15, + "grad_norm": 1.6464653322161704, + "learning_rate": 9.62356581295393e-06, + "loss": 0.437, + "step": 5192 + }, + { + "epoch": 0.15, + "grad_norm": 2.0309625730130545, + "learning_rate": 9.623386991459522e-06, + "loss": 0.3973, + "step": 5193 + }, + { + "epoch": 0.15, + "grad_norm": 1.6798187918803817, + "learning_rate": 9.623208129163674e-06, + "loss": 0.4165, + "step": 5194 + }, + { + "epoch": 0.15, + "grad_norm": 1.5436307970222927, + "learning_rate": 9.623029226067962e-06, + "loss": 0.3929, + "step": 5195 + }, + { + "epoch": 0.15, + "grad_norm": 1.4906860035813476, + "learning_rate": 9.622850282173968e-06, + "loss": 0.3767, + "step": 5196 + }, + { + "epoch": 0.15, + "grad_norm": 1.552760942552491, + "learning_rate": 9.622671297483266e-06, + "loss": 0.4426, + "step": 5197 + }, + { + "epoch": 0.15, + "grad_norm": 2.2638932606931634, + "learning_rate": 9.62249227199744e-06, + "loss": 0.3847, + "step": 5198 + }, + { + "epoch": 0.15, + "grad_norm": 1.9875352878785069, + "learning_rate": 9.622313205718068e-06, + "loss": 0.3738, + "step": 5199 + }, + { + "epoch": 0.15, + "grad_norm": 1.5582152374760647, + "learning_rate": 9.622134098646731e-06, + "loss": 0.4088, + "step": 5200 + }, + { + "epoch": 0.15, + "grad_norm": 1.5959771165612664, + "learning_rate": 9.621954950785011e-06, + "loss": 0.3966, + "step": 5201 + }, + { + "epoch": 0.15, + "grad_norm": 1.540594419493125, + "learning_rate": 9.621775762134484e-06, + "loss": 0.4027, + "step": 5202 + }, + { + "epoch": 0.15, + "grad_norm": 2.5401758668248133, + "learning_rate": 9.621596532696737e-06, + "loss": 0.4074, + "step": 5203 + }, + { + "epoch": 0.15, + "grad_norm": 1.8043434074357962, + "learning_rate": 9.621417262473347e-06, + "loss": 0.3834, + "step": 5204 + }, + { + "epoch": 0.15, + "grad_norm": 1.5625387612102093, + "learning_rate": 9.6212379514659e-06, + "loss": 0.4132, + "step": 5205 + }, + { + "epoch": 0.15, + "grad_norm": 1.8828127726011712, + "learning_rate": 9.62105859967598e-06, + "loss": 0.3736, + "step": 5206 + }, + { + "epoch": 0.15, + "grad_norm": 1.5303765158953466, + "learning_rate": 9.620879207105162e-06, + "loss": 0.4397, + "step": 5207 + }, + { + "epoch": 0.15, + "grad_norm": 1.6059227148957718, + "learning_rate": 9.620699773755034e-06, + "loss": 0.44, + "step": 5208 + }, + { + "epoch": 0.15, + "grad_norm": 1.50782997252738, + "learning_rate": 9.62052029962718e-06, + "loss": 0.3919, + "step": 5209 + }, + { + "epoch": 0.15, + "grad_norm": 1.5949964622974966, + "learning_rate": 9.620340784723184e-06, + "loss": 0.3992, + "step": 5210 + }, + { + "epoch": 0.15, + "grad_norm": 1.5028641252138928, + "learning_rate": 9.620161229044629e-06, + "loss": 0.4013, + "step": 5211 + }, + { + "epoch": 0.15, + "grad_norm": 1.5172003192480263, + "learning_rate": 9.6199816325931e-06, + "loss": 0.3669, + "step": 5212 + }, + { + "epoch": 0.15, + "grad_norm": 1.4081999284155302, + "learning_rate": 9.619801995370182e-06, + "loss": 0.3807, + "step": 5213 + }, + { + "epoch": 0.15, + "grad_norm": 1.5262685573364947, + "learning_rate": 9.619622317377459e-06, + "loss": 0.394, + "step": 5214 + }, + { + "epoch": 0.15, + "grad_norm": 1.390462514647703, + "learning_rate": 9.619442598616517e-06, + "loss": 0.367, + "step": 5215 + }, + { + "epoch": 0.15, + "grad_norm": 1.4495410153769104, + "learning_rate": 9.619262839088946e-06, + "loss": 0.3908, + "step": 5216 + }, + { + "epoch": 0.15, + "grad_norm": 2.335766840611722, + "learning_rate": 9.619083038796326e-06, + "loss": 0.4127, + "step": 5217 + }, + { + "epoch": 0.15, + "grad_norm": 2.0294022985787654, + "learning_rate": 9.618903197740248e-06, + "loss": 0.3989, + "step": 5218 + }, + { + "epoch": 0.15, + "grad_norm": 1.6462974275219189, + "learning_rate": 9.618723315922301e-06, + "loss": 0.4284, + "step": 5219 + }, + { + "epoch": 0.15, + "grad_norm": 1.460855410111419, + "learning_rate": 9.618543393344066e-06, + "loss": 0.3817, + "step": 5220 + }, + { + "epoch": 0.15, + "grad_norm": 1.5076492921536397, + "learning_rate": 9.618363430007134e-06, + "loss": 0.4095, + "step": 5221 + }, + { + "epoch": 0.15, + "grad_norm": 2.1280847261017803, + "learning_rate": 9.618183425913095e-06, + "loss": 0.3908, + "step": 5222 + }, + { + "epoch": 0.15, + "grad_norm": 1.654309392088653, + "learning_rate": 9.618003381063536e-06, + "loss": 0.3799, + "step": 5223 + }, + { + "epoch": 0.15, + "grad_norm": 1.8970646821994246, + "learning_rate": 9.617823295460046e-06, + "loss": 0.4178, + "step": 5224 + }, + { + "epoch": 0.15, + "grad_norm": 1.6767606795838157, + "learning_rate": 9.617643169104214e-06, + "loss": 0.4091, + "step": 5225 + }, + { + "epoch": 0.15, + "grad_norm": 1.6565317594100604, + "learning_rate": 9.617463001997628e-06, + "loss": 0.4218, + "step": 5226 + }, + { + "epoch": 0.15, + "grad_norm": 1.6300064339883173, + "learning_rate": 9.617282794141882e-06, + "loss": 0.3848, + "step": 5227 + }, + { + "epoch": 0.15, + "grad_norm": 1.4495516613768884, + "learning_rate": 9.617102545538565e-06, + "loss": 0.4022, + "step": 5228 + }, + { + "epoch": 0.15, + "grad_norm": 1.757898071369046, + "learning_rate": 9.616922256189265e-06, + "loss": 0.39, + "step": 5229 + }, + { + "epoch": 0.15, + "grad_norm": 1.8047270389158143, + "learning_rate": 9.616741926095575e-06, + "loss": 0.4061, + "step": 5230 + }, + { + "epoch": 0.15, + "grad_norm": 1.6752652360254963, + "learning_rate": 9.616561555259088e-06, + "loss": 0.4013, + "step": 5231 + }, + { + "epoch": 0.15, + "grad_norm": 1.6430783486309373, + "learning_rate": 9.616381143681392e-06, + "loss": 0.4278, + "step": 5232 + }, + { + "epoch": 0.15, + "grad_norm": 1.3713962016496748, + "learning_rate": 9.616200691364081e-06, + "loss": 0.3679, + "step": 5233 + }, + { + "epoch": 0.15, + "grad_norm": 1.5198681772062135, + "learning_rate": 9.616020198308748e-06, + "loss": 0.4056, + "step": 5234 + }, + { + "epoch": 0.15, + "grad_norm": 1.7570494741758549, + "learning_rate": 9.615839664516987e-06, + "loss": 0.4108, + "step": 5235 + }, + { + "epoch": 0.15, + "grad_norm": 2.2778765809752777, + "learning_rate": 9.615659089990388e-06, + "loss": 0.4304, + "step": 5236 + }, + { + "epoch": 0.15, + "grad_norm": 1.8690640868333046, + "learning_rate": 9.615478474730548e-06, + "loss": 0.3995, + "step": 5237 + }, + { + "epoch": 0.15, + "grad_norm": 1.6621026191164037, + "learning_rate": 9.615297818739058e-06, + "loss": 0.4197, + "step": 5238 + }, + { + "epoch": 0.15, + "grad_norm": 1.4390634854868973, + "learning_rate": 9.615117122017514e-06, + "loss": 0.3756, + "step": 5239 + }, + { + "epoch": 0.15, + "grad_norm": 1.6389501889057196, + "learning_rate": 9.61493638456751e-06, + "loss": 0.3933, + "step": 5240 + }, + { + "epoch": 0.15, + "grad_norm": 1.5849914378996608, + "learning_rate": 9.614755606390641e-06, + "loss": 0.4317, + "step": 5241 + }, + { + "epoch": 0.15, + "grad_norm": 1.5800380255860051, + "learning_rate": 9.614574787488505e-06, + "loss": 0.3744, + "step": 5242 + }, + { + "epoch": 0.15, + "grad_norm": 1.9779651969275311, + "learning_rate": 9.614393927862692e-06, + "loss": 0.3851, + "step": 5243 + }, + { + "epoch": 0.15, + "grad_norm": 1.711714715151348, + "learning_rate": 9.614213027514802e-06, + "loss": 0.4071, + "step": 5244 + }, + { + "epoch": 0.15, + "grad_norm": 1.6337857441334267, + "learning_rate": 9.614032086446432e-06, + "loss": 0.4567, + "step": 5245 + }, + { + "epoch": 0.15, + "grad_norm": 1.5559618279126222, + "learning_rate": 9.613851104659178e-06, + "loss": 0.3686, + "step": 5246 + }, + { + "epoch": 0.15, + "grad_norm": 1.8845428079816422, + "learning_rate": 9.613670082154636e-06, + "loss": 0.4333, + "step": 5247 + }, + { + "epoch": 0.15, + "grad_norm": 2.6629618796385106, + "learning_rate": 9.613489018934404e-06, + "loss": 0.4282, + "step": 5248 + }, + { + "epoch": 0.15, + "grad_norm": 2.8644734397081506, + "learning_rate": 9.613307915000081e-06, + "loss": 0.3824, + "step": 5249 + }, + { + "epoch": 0.15, + "grad_norm": 1.5723182293084828, + "learning_rate": 9.613126770353263e-06, + "loss": 0.4181, + "step": 5250 + }, + { + "epoch": 0.15, + "grad_norm": 1.3572772442950773, + "learning_rate": 9.612945584995552e-06, + "loss": 0.3537, + "step": 5251 + }, + { + "epoch": 0.15, + "grad_norm": 1.4954258411637886, + "learning_rate": 9.612764358928543e-06, + "loss": 0.3736, + "step": 5252 + }, + { + "epoch": 0.15, + "grad_norm": 1.4924094710562115, + "learning_rate": 9.612583092153839e-06, + "loss": 0.3784, + "step": 5253 + }, + { + "epoch": 0.15, + "grad_norm": 1.6102850018865729, + "learning_rate": 9.612401784673037e-06, + "loss": 0.4111, + "step": 5254 + }, + { + "epoch": 0.15, + "grad_norm": 1.671982798233989, + "learning_rate": 9.612220436487739e-06, + "loss": 0.3788, + "step": 5255 + }, + { + "epoch": 0.15, + "grad_norm": 1.6574036198298852, + "learning_rate": 9.612039047599544e-06, + "loss": 0.4063, + "step": 5256 + }, + { + "epoch": 0.15, + "grad_norm": 1.5658611470847241, + "learning_rate": 9.611857618010054e-06, + "loss": 0.3907, + "step": 5257 + }, + { + "epoch": 0.15, + "grad_norm": 1.92066744978981, + "learning_rate": 9.611676147720868e-06, + "loss": 0.3937, + "step": 5258 + }, + { + "epoch": 0.15, + "grad_norm": 1.60345388705223, + "learning_rate": 9.611494636733589e-06, + "loss": 0.3912, + "step": 5259 + }, + { + "epoch": 0.15, + "grad_norm": 1.6857416027382197, + "learning_rate": 9.61131308504982e-06, + "loss": 0.4493, + "step": 5260 + }, + { + "epoch": 0.15, + "grad_norm": 1.6508478967812028, + "learning_rate": 9.611131492671162e-06, + "loss": 0.3766, + "step": 5261 + }, + { + "epoch": 0.15, + "grad_norm": 1.766634576890156, + "learning_rate": 9.610949859599215e-06, + "loss": 0.3591, + "step": 5262 + }, + { + "epoch": 0.15, + "grad_norm": 1.713943417995294, + "learning_rate": 9.610768185835587e-06, + "loss": 0.3951, + "step": 5263 + }, + { + "epoch": 0.15, + "grad_norm": 1.5540408996028316, + "learning_rate": 9.610586471381878e-06, + "loss": 0.3978, + "step": 5264 + }, + { + "epoch": 0.15, + "grad_norm": 1.5187132716614642, + "learning_rate": 9.610404716239691e-06, + "loss": 0.3695, + "step": 5265 + }, + { + "epoch": 0.15, + "grad_norm": 1.6800294898800923, + "learning_rate": 9.61022292041063e-06, + "loss": 0.384, + "step": 5266 + }, + { + "epoch": 0.15, + "grad_norm": 1.6951365082610388, + "learning_rate": 9.610041083896304e-06, + "loss": 0.3727, + "step": 5267 + }, + { + "epoch": 0.15, + "grad_norm": 1.7038626890443713, + "learning_rate": 9.609859206698313e-06, + "loss": 0.4794, + "step": 5268 + }, + { + "epoch": 0.15, + "grad_norm": 1.5211465015683676, + "learning_rate": 9.609677288818263e-06, + "loss": 0.3909, + "step": 5269 + }, + { + "epoch": 0.15, + "grad_norm": 3.3877151968584127, + "learning_rate": 9.60949533025776e-06, + "loss": 0.4156, + "step": 5270 + }, + { + "epoch": 0.15, + "grad_norm": 4.783415801724212, + "learning_rate": 9.60931333101841e-06, + "loss": 0.3939, + "step": 5271 + }, + { + "epoch": 0.15, + "grad_norm": 2.611220382154971, + "learning_rate": 9.609131291101818e-06, + "loss": 0.444, + "step": 5272 + }, + { + "epoch": 0.15, + "grad_norm": 1.6732316955045852, + "learning_rate": 9.608949210509591e-06, + "loss": 0.3964, + "step": 5273 + }, + { + "epoch": 0.15, + "grad_norm": 1.8441270356132171, + "learning_rate": 9.608767089243335e-06, + "loss": 0.3812, + "step": 5274 + }, + { + "epoch": 0.15, + "grad_norm": 1.7453193395104227, + "learning_rate": 9.608584927304662e-06, + "loss": 0.4134, + "step": 5275 + }, + { + "epoch": 0.15, + "grad_norm": 2.4884183608458015, + "learning_rate": 9.608402724695174e-06, + "loss": 0.3954, + "step": 5276 + }, + { + "epoch": 0.15, + "grad_norm": 1.5779823701058453, + "learning_rate": 9.60822048141648e-06, + "loss": 0.3742, + "step": 5277 + }, + { + "epoch": 0.15, + "grad_norm": 1.5533380071217877, + "learning_rate": 9.60803819747019e-06, + "loss": 0.3812, + "step": 5278 + }, + { + "epoch": 0.15, + "grad_norm": 1.589369505151196, + "learning_rate": 9.607855872857911e-06, + "loss": 0.3784, + "step": 5279 + }, + { + "epoch": 0.15, + "grad_norm": 1.8119507713002245, + "learning_rate": 9.607673507581254e-06, + "loss": 0.3978, + "step": 5280 + }, + { + "epoch": 0.15, + "grad_norm": 1.5456858650946852, + "learning_rate": 9.607491101641825e-06, + "loss": 0.3829, + "step": 5281 + }, + { + "epoch": 0.15, + "grad_norm": 2.106285783620372, + "learning_rate": 9.607308655041238e-06, + "loss": 0.3926, + "step": 5282 + }, + { + "epoch": 0.15, + "grad_norm": 1.9850562561995628, + "learning_rate": 9.6071261677811e-06, + "loss": 0.4737, + "step": 5283 + }, + { + "epoch": 0.15, + "grad_norm": 1.7897191579395224, + "learning_rate": 9.606943639863024e-06, + "loss": 0.3747, + "step": 5284 + }, + { + "epoch": 0.15, + "grad_norm": 1.6093376079669266, + "learning_rate": 9.606761071288618e-06, + "loss": 0.3998, + "step": 5285 + }, + { + "epoch": 0.15, + "grad_norm": 1.588128408968805, + "learning_rate": 9.606578462059495e-06, + "loss": 0.3987, + "step": 5286 + }, + { + "epoch": 0.15, + "grad_norm": 1.6450212003389937, + "learning_rate": 9.606395812177265e-06, + "loss": 0.4009, + "step": 5287 + }, + { + "epoch": 0.15, + "grad_norm": 1.7307043261758983, + "learning_rate": 9.606213121643542e-06, + "loss": 0.3739, + "step": 5288 + }, + { + "epoch": 0.15, + "grad_norm": 1.7526512897924853, + "learning_rate": 9.606030390459937e-06, + "loss": 0.4147, + "step": 5289 + }, + { + "epoch": 0.15, + "grad_norm": 1.7374715379097094, + "learning_rate": 9.60584761862806e-06, + "loss": 0.4112, + "step": 5290 + }, + { + "epoch": 0.15, + "grad_norm": 1.6538722008173983, + "learning_rate": 9.60566480614953e-06, + "loss": 0.3646, + "step": 5291 + }, + { + "epoch": 0.15, + "grad_norm": 1.8462381360352353, + "learning_rate": 9.605481953025957e-06, + "loss": 0.3965, + "step": 5292 + }, + { + "epoch": 0.15, + "grad_norm": 1.1959336083901035, + "learning_rate": 9.605299059258952e-06, + "loss": 0.657, + "step": 5293 + }, + { + "epoch": 0.15, + "grad_norm": 1.599175696235294, + "learning_rate": 9.605116124850133e-06, + "loss": 0.3732, + "step": 5294 + }, + { + "epoch": 0.15, + "grad_norm": 2.10229599178819, + "learning_rate": 9.604933149801114e-06, + "loss": 0.4193, + "step": 5295 + }, + { + "epoch": 0.15, + "grad_norm": 1.7113171999609582, + "learning_rate": 9.604750134113508e-06, + "loss": 0.3978, + "step": 5296 + }, + { + "epoch": 0.15, + "grad_norm": 1.799847421625085, + "learning_rate": 9.604567077788932e-06, + "loss": 0.3856, + "step": 5297 + }, + { + "epoch": 0.15, + "grad_norm": 1.5442516961370378, + "learning_rate": 9.604383980829e-06, + "loss": 0.4365, + "step": 5298 + }, + { + "epoch": 0.15, + "grad_norm": 1.6573451973461237, + "learning_rate": 9.604200843235328e-06, + "loss": 0.3995, + "step": 5299 + }, + { + "epoch": 0.15, + "grad_norm": 1.5160318060949134, + "learning_rate": 9.604017665009532e-06, + "loss": 0.4034, + "step": 5300 + }, + { + "epoch": 0.15, + "grad_norm": 1.8775621592997964, + "learning_rate": 9.60383444615323e-06, + "loss": 0.4024, + "step": 5301 + }, + { + "epoch": 0.15, + "grad_norm": 1.9930641778970097, + "learning_rate": 9.603651186668037e-06, + "loss": 0.4117, + "step": 5302 + }, + { + "epoch": 0.15, + "grad_norm": 1.8446782469474012, + "learning_rate": 9.60346788655557e-06, + "loss": 0.3937, + "step": 5303 + }, + { + "epoch": 0.15, + "grad_norm": 17.24850436210466, + "learning_rate": 9.603284545817451e-06, + "loss": 0.3908, + "step": 5304 + }, + { + "epoch": 0.15, + "grad_norm": 1.5940062374913209, + "learning_rate": 9.603101164455293e-06, + "loss": 0.4052, + "step": 5305 + }, + { + "epoch": 0.15, + "grad_norm": 3.1062896648448057, + "learning_rate": 9.602917742470717e-06, + "loss": 0.4192, + "step": 5306 + }, + { + "epoch": 0.15, + "grad_norm": 1.751953584592825, + "learning_rate": 9.60273427986534e-06, + "loss": 0.3949, + "step": 5307 + }, + { + "epoch": 0.15, + "grad_norm": 1.692387068095404, + "learning_rate": 9.602550776640782e-06, + "loss": 0.3893, + "step": 5308 + }, + { + "epoch": 0.15, + "grad_norm": 1.8105308714646184, + "learning_rate": 9.602367232798663e-06, + "loss": 0.3887, + "step": 5309 + }, + { + "epoch": 0.15, + "grad_norm": 1.3862955591641641, + "learning_rate": 9.602183648340601e-06, + "loss": 0.3869, + "step": 5310 + }, + { + "epoch": 0.15, + "grad_norm": 1.4277130313052775, + "learning_rate": 9.60200002326822e-06, + "loss": 0.3965, + "step": 5311 + }, + { + "epoch": 0.15, + "grad_norm": 1.6514827937206886, + "learning_rate": 9.601816357583135e-06, + "loss": 0.4043, + "step": 5312 + }, + { + "epoch": 0.15, + "grad_norm": 1.4915241895303937, + "learning_rate": 9.60163265128697e-06, + "loss": 0.38, + "step": 5313 + }, + { + "epoch": 0.15, + "grad_norm": 1.5124013580047844, + "learning_rate": 9.601448904381346e-06, + "loss": 0.3937, + "step": 5314 + }, + { + "epoch": 0.15, + "grad_norm": 1.6233269329268647, + "learning_rate": 9.601265116867883e-06, + "loss": 0.4459, + "step": 5315 + }, + { + "epoch": 0.15, + "grad_norm": 2.217587861294195, + "learning_rate": 9.601081288748206e-06, + "loss": 0.429, + "step": 5316 + }, + { + "epoch": 0.15, + "grad_norm": 1.42248044211, + "learning_rate": 9.600897420023937e-06, + "loss": 0.3876, + "step": 5317 + }, + { + "epoch": 0.15, + "grad_norm": 1.4157640068409243, + "learning_rate": 9.600713510696696e-06, + "loss": 0.3745, + "step": 5318 + }, + { + "epoch": 0.15, + "grad_norm": 1.5526775881304773, + "learning_rate": 9.600529560768107e-06, + "loss": 0.3845, + "step": 5319 + }, + { + "epoch": 0.15, + "grad_norm": 1.5941464475134146, + "learning_rate": 9.600345570239793e-06, + "loss": 0.3846, + "step": 5320 + }, + { + "epoch": 0.15, + "grad_norm": 1.8992520229090248, + "learning_rate": 9.600161539113377e-06, + "loss": 0.4206, + "step": 5321 + }, + { + "epoch": 0.15, + "grad_norm": 1.4046700747538625, + "learning_rate": 9.599977467390487e-06, + "loss": 0.4065, + "step": 5322 + }, + { + "epoch": 0.15, + "grad_norm": 1.5663844273410477, + "learning_rate": 9.599793355072744e-06, + "loss": 0.4176, + "step": 5323 + }, + { + "epoch": 0.15, + "grad_norm": 1.5878257979815351, + "learning_rate": 9.599609202161774e-06, + "loss": 0.3773, + "step": 5324 + }, + { + "epoch": 0.15, + "grad_norm": 2.0199992598826695, + "learning_rate": 9.599425008659201e-06, + "loss": 0.3967, + "step": 5325 + }, + { + "epoch": 0.15, + "grad_norm": 1.9603425262283023, + "learning_rate": 9.599240774566652e-06, + "loss": 0.4074, + "step": 5326 + }, + { + "epoch": 0.15, + "grad_norm": 1.5870164118255867, + "learning_rate": 9.59905649988575e-06, + "loss": 0.3915, + "step": 5327 + }, + { + "epoch": 0.15, + "grad_norm": 1.8267101811086905, + "learning_rate": 9.598872184618126e-06, + "loss": 0.4066, + "step": 5328 + }, + { + "epoch": 0.15, + "grad_norm": 1.5738555764851523, + "learning_rate": 9.598687828765402e-06, + "loss": 0.393, + "step": 5329 + }, + { + "epoch": 0.15, + "grad_norm": 1.610117733553802, + "learning_rate": 9.598503432329209e-06, + "loss": 0.3729, + "step": 5330 + }, + { + "epoch": 0.15, + "grad_norm": 1.5775462551748136, + "learning_rate": 9.59831899531117e-06, + "loss": 0.4072, + "step": 5331 + }, + { + "epoch": 0.15, + "grad_norm": 1.8701868078504786, + "learning_rate": 9.598134517712917e-06, + "loss": 0.3959, + "step": 5332 + }, + { + "epoch": 0.15, + "grad_norm": 1.5283755794129454, + "learning_rate": 9.597949999536075e-06, + "loss": 0.4271, + "step": 5333 + }, + { + "epoch": 0.15, + "grad_norm": 3.214456312975149, + "learning_rate": 9.597765440782274e-06, + "loss": 0.3844, + "step": 5334 + }, + { + "epoch": 0.15, + "grad_norm": 1.3231052217489934, + "learning_rate": 9.59758084145314e-06, + "loss": 0.3606, + "step": 5335 + }, + { + "epoch": 0.15, + "grad_norm": 1.7609256130315696, + "learning_rate": 9.597396201550307e-06, + "loss": 0.4089, + "step": 5336 + }, + { + "epoch": 0.15, + "grad_norm": 1.7581994280249604, + "learning_rate": 9.597211521075398e-06, + "loss": 0.4078, + "step": 5337 + }, + { + "epoch": 0.15, + "grad_norm": 1.4931569085106213, + "learning_rate": 9.597026800030048e-06, + "loss": 0.4081, + "step": 5338 + }, + { + "epoch": 0.15, + "grad_norm": 1.2992594897606002, + "learning_rate": 9.596842038415886e-06, + "loss": 0.3757, + "step": 5339 + }, + { + "epoch": 0.15, + "grad_norm": 1.4952081184935513, + "learning_rate": 9.596657236234541e-06, + "loss": 0.3739, + "step": 5340 + }, + { + "epoch": 0.15, + "grad_norm": 1.4844222408647356, + "learning_rate": 9.596472393487645e-06, + "loss": 0.4141, + "step": 5341 + }, + { + "epoch": 0.15, + "grad_norm": 1.5902263327933255, + "learning_rate": 9.59628751017683e-06, + "loss": 0.4376, + "step": 5342 + }, + { + "epoch": 0.15, + "grad_norm": 1.1752312241218867, + "learning_rate": 9.596102586303726e-06, + "loss": 0.5724, + "step": 5343 + }, + { + "epoch": 0.16, + "grad_norm": 3.8717583489449034, + "learning_rate": 9.595917621869967e-06, + "loss": 0.4278, + "step": 5344 + }, + { + "epoch": 0.16, + "grad_norm": 1.7579843057737834, + "learning_rate": 9.595732616877183e-06, + "loss": 0.3768, + "step": 5345 + }, + { + "epoch": 0.16, + "grad_norm": 1.5927910632700388, + "learning_rate": 9.595547571327008e-06, + "loss": 0.371, + "step": 5346 + }, + { + "epoch": 0.16, + "grad_norm": 1.643951159336712, + "learning_rate": 9.595362485221073e-06, + "loss": 0.3804, + "step": 5347 + }, + { + "epoch": 0.16, + "grad_norm": 11.660428344497875, + "learning_rate": 9.595177358561015e-06, + "loss": 0.3722, + "step": 5348 + }, + { + "epoch": 0.16, + "grad_norm": 2.1339413725566043, + "learning_rate": 9.594992191348465e-06, + "loss": 0.4027, + "step": 5349 + }, + { + "epoch": 0.16, + "grad_norm": 1.7231174850893844, + "learning_rate": 9.594806983585058e-06, + "loss": 0.4453, + "step": 5350 + }, + { + "epoch": 0.16, + "grad_norm": 1.5200006128140204, + "learning_rate": 9.594621735272429e-06, + "loss": 0.3989, + "step": 5351 + }, + { + "epoch": 0.16, + "grad_norm": 1.7500981708174268, + "learning_rate": 9.594436446412212e-06, + "loss": 0.4005, + "step": 5352 + }, + { + "epoch": 0.16, + "grad_norm": 1.9525442019861785, + "learning_rate": 9.594251117006041e-06, + "loss": 0.4044, + "step": 5353 + }, + { + "epoch": 0.16, + "grad_norm": 1.9751671593228917, + "learning_rate": 9.594065747055556e-06, + "loss": 0.4028, + "step": 5354 + }, + { + "epoch": 0.16, + "grad_norm": 1.652177906455859, + "learning_rate": 9.593880336562387e-06, + "loss": 0.4411, + "step": 5355 + }, + { + "epoch": 0.16, + "grad_norm": 1.4560818603175232, + "learning_rate": 9.593694885528173e-06, + "loss": 0.3803, + "step": 5356 + }, + { + "epoch": 0.16, + "grad_norm": 1.0469390068527502, + "learning_rate": 9.593509393954553e-06, + "loss": 0.607, + "step": 5357 + }, + { + "epoch": 0.16, + "grad_norm": 1.3901192311402277, + "learning_rate": 9.59332386184316e-06, + "loss": 0.3944, + "step": 5358 + }, + { + "epoch": 0.16, + "grad_norm": 1.6350981762308574, + "learning_rate": 9.593138289195634e-06, + "loss": 0.3786, + "step": 5359 + }, + { + "epoch": 0.16, + "grad_norm": 1.5633571145492455, + "learning_rate": 9.592952676013612e-06, + "loss": 0.3995, + "step": 5360 + }, + { + "epoch": 0.16, + "grad_norm": 1.7182702097821714, + "learning_rate": 9.59276702229873e-06, + "loss": 0.3815, + "step": 5361 + }, + { + "epoch": 0.16, + "grad_norm": 1.480683251252779, + "learning_rate": 9.592581328052628e-06, + "loss": 0.3791, + "step": 5362 + }, + { + "epoch": 0.16, + "grad_norm": 3.4062121138432775, + "learning_rate": 9.592395593276946e-06, + "loss": 0.3768, + "step": 5363 + }, + { + "epoch": 0.16, + "grad_norm": 2.309590112820692, + "learning_rate": 9.592209817973321e-06, + "loss": 0.3886, + "step": 5364 + }, + { + "epoch": 0.16, + "grad_norm": 1.551890414297406, + "learning_rate": 9.592024002143394e-06, + "loss": 0.3965, + "step": 5365 + }, + { + "epoch": 0.16, + "grad_norm": 1.8045358475551696, + "learning_rate": 9.591838145788803e-06, + "loss": 0.4132, + "step": 5366 + }, + { + "epoch": 0.16, + "grad_norm": 1.4645490011067581, + "learning_rate": 9.591652248911191e-06, + "loss": 0.3734, + "step": 5367 + }, + { + "epoch": 0.16, + "grad_norm": 1.5358625811776105, + "learning_rate": 9.591466311512197e-06, + "loss": 0.3892, + "step": 5368 + }, + { + "epoch": 0.16, + "grad_norm": 1.6568489642763362, + "learning_rate": 9.591280333593463e-06, + "loss": 0.3726, + "step": 5369 + }, + { + "epoch": 0.16, + "grad_norm": 1.390536442093382, + "learning_rate": 9.591094315156627e-06, + "loss": 0.3803, + "step": 5370 + }, + { + "epoch": 0.16, + "grad_norm": 1.603506859797188, + "learning_rate": 9.590908256203333e-06, + "loss": 0.3909, + "step": 5371 + }, + { + "epoch": 0.16, + "grad_norm": 1.4855510850940823, + "learning_rate": 9.590722156735222e-06, + "loss": 0.41, + "step": 5372 + }, + { + "epoch": 0.16, + "grad_norm": 1.9923309608515853, + "learning_rate": 9.590536016753938e-06, + "loss": 0.3854, + "step": 5373 + }, + { + "epoch": 0.16, + "grad_norm": 1.5204074616113485, + "learning_rate": 9.590349836261124e-06, + "loss": 0.3995, + "step": 5374 + }, + { + "epoch": 0.16, + "grad_norm": 1.878538502631975, + "learning_rate": 9.59016361525842e-06, + "loss": 0.4307, + "step": 5375 + }, + { + "epoch": 0.16, + "grad_norm": 1.764630135511233, + "learning_rate": 9.589977353747471e-06, + "loss": 0.3997, + "step": 5376 + }, + { + "epoch": 0.16, + "grad_norm": 2.328420903997987, + "learning_rate": 9.589791051729921e-06, + "loss": 0.3926, + "step": 5377 + }, + { + "epoch": 0.16, + "grad_norm": 1.7180703006075595, + "learning_rate": 9.589604709207417e-06, + "loss": 0.3628, + "step": 5378 + }, + { + "epoch": 0.16, + "grad_norm": 1.6452334641536266, + "learning_rate": 9.589418326181596e-06, + "loss": 0.4244, + "step": 5379 + }, + { + "epoch": 0.16, + "grad_norm": 1.6214326803484895, + "learning_rate": 9.58923190265411e-06, + "loss": 0.4098, + "step": 5380 + }, + { + "epoch": 0.16, + "grad_norm": 1.7859364927363426, + "learning_rate": 9.589045438626602e-06, + "loss": 0.3843, + "step": 5381 + }, + { + "epoch": 0.16, + "grad_norm": 1.6407340767199188, + "learning_rate": 9.588858934100715e-06, + "loss": 0.3852, + "step": 5382 + }, + { + "epoch": 0.16, + "grad_norm": 1.622820518940391, + "learning_rate": 9.5886723890781e-06, + "loss": 0.405, + "step": 5383 + }, + { + "epoch": 0.16, + "grad_norm": 1.4783174549908706, + "learning_rate": 9.588485803560396e-06, + "loss": 0.3897, + "step": 5384 + }, + { + "epoch": 0.16, + "grad_norm": 1.5441362454994163, + "learning_rate": 9.588299177549258e-06, + "loss": 0.4058, + "step": 5385 + }, + { + "epoch": 0.16, + "grad_norm": 1.430512317412693, + "learning_rate": 9.588112511046328e-06, + "loss": 0.389, + "step": 5386 + }, + { + "epoch": 0.16, + "grad_norm": 1.9887265169811335, + "learning_rate": 9.587925804053252e-06, + "loss": 0.4023, + "step": 5387 + }, + { + "epoch": 0.16, + "grad_norm": 1.5664873925508789, + "learning_rate": 9.58773905657168e-06, + "loss": 0.3938, + "step": 5388 + }, + { + "epoch": 0.16, + "grad_norm": 1.1248276083841304, + "learning_rate": 9.587552268603263e-06, + "loss": 0.5628, + "step": 5389 + }, + { + "epoch": 0.16, + "grad_norm": 1.5087808109530234, + "learning_rate": 9.587365440149646e-06, + "loss": 0.3992, + "step": 5390 + }, + { + "epoch": 0.16, + "grad_norm": 2.266640199644031, + "learning_rate": 9.587178571212474e-06, + "loss": 0.4018, + "step": 5391 + }, + { + "epoch": 0.16, + "grad_norm": 1.5091227584300182, + "learning_rate": 9.586991661793403e-06, + "loss": 0.4064, + "step": 5392 + }, + { + "epoch": 0.16, + "grad_norm": 1.5071807944149045, + "learning_rate": 9.58680471189408e-06, + "loss": 0.3858, + "step": 5393 + }, + { + "epoch": 0.16, + "grad_norm": 1.58240256806634, + "learning_rate": 9.586617721516155e-06, + "loss": 0.423, + "step": 5394 + }, + { + "epoch": 0.16, + "grad_norm": 1.5123942699010318, + "learning_rate": 9.586430690661277e-06, + "loss": 0.3757, + "step": 5395 + }, + { + "epoch": 0.16, + "grad_norm": 1.4431421905362565, + "learning_rate": 9.586243619331096e-06, + "loss": 0.3753, + "step": 5396 + }, + { + "epoch": 0.16, + "grad_norm": 1.560183317134147, + "learning_rate": 9.586056507527266e-06, + "loss": 0.3762, + "step": 5397 + }, + { + "epoch": 0.16, + "grad_norm": 1.5889603457968229, + "learning_rate": 9.585869355251437e-06, + "loss": 0.3726, + "step": 5398 + }, + { + "epoch": 0.16, + "grad_norm": 1.4859654505485005, + "learning_rate": 9.585682162505258e-06, + "loss": 0.3939, + "step": 5399 + }, + { + "epoch": 0.16, + "grad_norm": 1.6023586236362692, + "learning_rate": 9.585494929290384e-06, + "loss": 0.3996, + "step": 5400 + }, + { + "epoch": 0.16, + "grad_norm": 0.9923645253481248, + "learning_rate": 9.585307655608465e-06, + "loss": 0.601, + "step": 5401 + }, + { + "epoch": 0.16, + "grad_norm": 2.2881175381842604, + "learning_rate": 9.585120341461157e-06, + "loss": 0.3975, + "step": 5402 + }, + { + "epoch": 0.16, + "grad_norm": 1.016808954856122, + "learning_rate": 9.58493298685011e-06, + "loss": 0.6043, + "step": 5403 + }, + { + "epoch": 0.16, + "grad_norm": 2.0671306517165093, + "learning_rate": 9.58474559177698e-06, + "loss": 0.4098, + "step": 5404 + }, + { + "epoch": 0.16, + "grad_norm": 1.6618531903626945, + "learning_rate": 9.584558156243418e-06, + "loss": 0.4206, + "step": 5405 + }, + { + "epoch": 0.16, + "grad_norm": 2.403384102095518, + "learning_rate": 9.584370680251078e-06, + "loss": 0.4405, + "step": 5406 + }, + { + "epoch": 0.16, + "grad_norm": 1.525760577288314, + "learning_rate": 9.584183163801619e-06, + "loss": 0.395, + "step": 5407 + }, + { + "epoch": 0.16, + "grad_norm": 1.573258301002068, + "learning_rate": 9.58399560689669e-06, + "loss": 0.4002, + "step": 5408 + }, + { + "epoch": 0.16, + "grad_norm": 1.4710045994946699, + "learning_rate": 9.583808009537952e-06, + "loss": 0.3981, + "step": 5409 + }, + { + "epoch": 0.16, + "grad_norm": 1.428704071208482, + "learning_rate": 9.583620371727055e-06, + "loss": 0.3775, + "step": 5410 + }, + { + "epoch": 0.16, + "grad_norm": 1.5234020811507127, + "learning_rate": 9.583432693465657e-06, + "loss": 0.4081, + "step": 5411 + }, + { + "epoch": 0.16, + "grad_norm": 1.392352077219278, + "learning_rate": 9.583244974755416e-06, + "loss": 0.3922, + "step": 5412 + }, + { + "epoch": 0.16, + "grad_norm": 2.399726216189978, + "learning_rate": 9.583057215597987e-06, + "loss": 0.4263, + "step": 5413 + }, + { + "epoch": 0.16, + "grad_norm": 1.582757309787581, + "learning_rate": 9.582869415995027e-06, + "loss": 0.4201, + "step": 5414 + }, + { + "epoch": 0.16, + "grad_norm": 1.392141987602501, + "learning_rate": 9.582681575948194e-06, + "loss": 0.3545, + "step": 5415 + }, + { + "epoch": 0.16, + "grad_norm": 1.5037407948181933, + "learning_rate": 9.582493695459145e-06, + "loss": 0.4172, + "step": 5416 + }, + { + "epoch": 0.16, + "grad_norm": 1.921142881504928, + "learning_rate": 9.58230577452954e-06, + "loss": 0.4184, + "step": 5417 + }, + { + "epoch": 0.16, + "grad_norm": 2.13227092515305, + "learning_rate": 9.582117813161032e-06, + "loss": 0.3829, + "step": 5418 + }, + { + "epoch": 0.16, + "grad_norm": 1.5178269934229847, + "learning_rate": 9.581929811355286e-06, + "loss": 0.3917, + "step": 5419 + }, + { + "epoch": 0.16, + "grad_norm": 1.4897683832007258, + "learning_rate": 9.581741769113958e-06, + "loss": 0.3856, + "step": 5420 + }, + { + "epoch": 0.16, + "grad_norm": 1.3912648666987142, + "learning_rate": 9.581553686438709e-06, + "loss": 0.3596, + "step": 5421 + }, + { + "epoch": 0.16, + "grad_norm": 1.6344105904699697, + "learning_rate": 9.581365563331195e-06, + "loss": 0.4044, + "step": 5422 + }, + { + "epoch": 0.16, + "grad_norm": 1.1245809362753263, + "learning_rate": 9.581177399793083e-06, + "loss": 0.6195, + "step": 5423 + }, + { + "epoch": 0.16, + "grad_norm": 1.5146510449434425, + "learning_rate": 9.580989195826027e-06, + "loss": 0.3769, + "step": 5424 + }, + { + "epoch": 0.16, + "grad_norm": 1.597875960639281, + "learning_rate": 9.58080095143169e-06, + "loss": 0.4274, + "step": 5425 + }, + { + "epoch": 0.16, + "grad_norm": 1.429487701753555, + "learning_rate": 9.580612666611737e-06, + "loss": 0.3845, + "step": 5426 + }, + { + "epoch": 0.16, + "grad_norm": 1.5517217922396762, + "learning_rate": 9.580424341367823e-06, + "loss": 0.389, + "step": 5427 + }, + { + "epoch": 0.16, + "grad_norm": 1.7860215391749288, + "learning_rate": 9.580235975701615e-06, + "loss": 0.3888, + "step": 5428 + }, + { + "epoch": 0.16, + "grad_norm": 1.479329982782653, + "learning_rate": 9.580047569614775e-06, + "loss": 0.3651, + "step": 5429 + }, + { + "epoch": 0.16, + "grad_norm": 1.3259310600477296, + "learning_rate": 9.57985912310896e-06, + "loss": 0.3641, + "step": 5430 + }, + { + "epoch": 0.16, + "grad_norm": 1.4894845649885573, + "learning_rate": 9.579670636185842e-06, + "loss": 0.4242, + "step": 5431 + }, + { + "epoch": 0.16, + "grad_norm": 1.4592097912273205, + "learning_rate": 9.579482108847077e-06, + "loss": 0.3801, + "step": 5432 + }, + { + "epoch": 0.16, + "grad_norm": 1.514436461010039, + "learning_rate": 9.579293541094334e-06, + "loss": 0.4131, + "step": 5433 + }, + { + "epoch": 0.16, + "grad_norm": 1.7202377348650273, + "learning_rate": 9.579104932929273e-06, + "loss": 0.4048, + "step": 5434 + }, + { + "epoch": 0.16, + "grad_norm": 1.659267063649799, + "learning_rate": 9.578916284353561e-06, + "loss": 0.4449, + "step": 5435 + }, + { + "epoch": 0.16, + "grad_norm": 2.298689471274519, + "learning_rate": 9.578727595368861e-06, + "loss": 0.3833, + "step": 5436 + }, + { + "epoch": 0.16, + "grad_norm": 2.50239517048908, + "learning_rate": 9.578538865976841e-06, + "loss": 0.3776, + "step": 5437 + }, + { + "epoch": 0.16, + "grad_norm": 1.7193092379135706, + "learning_rate": 9.578350096179163e-06, + "loss": 0.3736, + "step": 5438 + }, + { + "epoch": 0.16, + "grad_norm": 3.9859070058514456, + "learning_rate": 9.578161285977494e-06, + "loss": 0.3813, + "step": 5439 + }, + { + "epoch": 0.16, + "grad_norm": 1.640046301407428, + "learning_rate": 9.577972435373502e-06, + "loss": 0.4139, + "step": 5440 + }, + { + "epoch": 0.16, + "grad_norm": 1.6962723906569057, + "learning_rate": 9.577783544368853e-06, + "loss": 0.3925, + "step": 5441 + }, + { + "epoch": 0.16, + "grad_norm": 1.6061174252335144, + "learning_rate": 9.577594612965213e-06, + "loss": 0.4398, + "step": 5442 + }, + { + "epoch": 0.16, + "grad_norm": 1.7111208538389524, + "learning_rate": 9.57740564116425e-06, + "loss": 0.4149, + "step": 5443 + }, + { + "epoch": 0.16, + "grad_norm": 1.1594366232504791, + "learning_rate": 9.577216628967631e-06, + "loss": 0.6307, + "step": 5444 + }, + { + "epoch": 0.16, + "grad_norm": 1.448883588827839, + "learning_rate": 9.577027576377025e-06, + "loss": 0.4229, + "step": 5445 + }, + { + "epoch": 0.16, + "grad_norm": 1.7703226207481912, + "learning_rate": 9.576838483394098e-06, + "loss": 0.4104, + "step": 5446 + }, + { + "epoch": 0.16, + "grad_norm": 0.9882995011319012, + "learning_rate": 9.576649350020522e-06, + "loss": 0.5739, + "step": 5447 + }, + { + "epoch": 0.16, + "grad_norm": 1.5150423495451057, + "learning_rate": 9.576460176257964e-06, + "loss": 0.3835, + "step": 5448 + }, + { + "epoch": 0.16, + "grad_norm": 2.012157181604389, + "learning_rate": 9.576270962108097e-06, + "loss": 0.4131, + "step": 5449 + }, + { + "epoch": 0.16, + "grad_norm": 1.9312463454674988, + "learning_rate": 9.576081707572584e-06, + "loss": 0.4023, + "step": 5450 + }, + { + "epoch": 0.16, + "grad_norm": 1.4859399557131303, + "learning_rate": 9.575892412653102e-06, + "loss": 0.3892, + "step": 5451 + }, + { + "epoch": 0.16, + "grad_norm": 1.914929260946015, + "learning_rate": 9.575703077351317e-06, + "loss": 0.4079, + "step": 5452 + }, + { + "epoch": 0.16, + "grad_norm": 1.6158910135623057, + "learning_rate": 9.575513701668904e-06, + "loss": 0.4082, + "step": 5453 + }, + { + "epoch": 0.16, + "grad_norm": 1.6430519678268418, + "learning_rate": 9.57532428560753e-06, + "loss": 0.3749, + "step": 5454 + }, + { + "epoch": 0.16, + "grad_norm": 1.5156233064556257, + "learning_rate": 9.575134829168869e-06, + "loss": 0.4026, + "step": 5455 + }, + { + "epoch": 0.16, + "grad_norm": 1.5730191020181645, + "learning_rate": 9.57494533235459e-06, + "loss": 0.3778, + "step": 5456 + }, + { + "epoch": 0.16, + "grad_norm": 1.4267104189274518, + "learning_rate": 9.574755795166371e-06, + "loss": 0.4074, + "step": 5457 + }, + { + "epoch": 0.16, + "grad_norm": 3.441432412763272, + "learning_rate": 9.57456621760588e-06, + "loss": 0.4237, + "step": 5458 + }, + { + "epoch": 0.16, + "grad_norm": 1.5791641286041023, + "learning_rate": 9.57437659967479e-06, + "loss": 0.4227, + "step": 5459 + }, + { + "epoch": 0.16, + "grad_norm": 1.7871870175098687, + "learning_rate": 9.574186941374778e-06, + "loss": 0.4243, + "step": 5460 + }, + { + "epoch": 0.16, + "grad_norm": 1.699939915579523, + "learning_rate": 9.573997242707515e-06, + "loss": 0.4446, + "step": 5461 + }, + { + "epoch": 0.16, + "grad_norm": 2.3694031575604826, + "learning_rate": 9.573807503674675e-06, + "loss": 0.3827, + "step": 5462 + }, + { + "epoch": 0.16, + "grad_norm": 1.4558719536202618, + "learning_rate": 9.573617724277933e-06, + "loss": 0.3955, + "step": 5463 + }, + { + "epoch": 0.16, + "grad_norm": 1.5057506006728836, + "learning_rate": 9.573427904518964e-06, + "loss": 0.3916, + "step": 5464 + }, + { + "epoch": 0.16, + "grad_norm": 2.0064813729322912, + "learning_rate": 9.573238044399445e-06, + "loss": 0.4206, + "step": 5465 + }, + { + "epoch": 0.16, + "grad_norm": 1.6773359542304869, + "learning_rate": 9.573048143921046e-06, + "loss": 0.4396, + "step": 5466 + }, + { + "epoch": 0.16, + "grad_norm": 1.457183215472678, + "learning_rate": 9.572858203085449e-06, + "loss": 0.3992, + "step": 5467 + }, + { + "epoch": 0.16, + "grad_norm": 1.6095286361572991, + "learning_rate": 9.572668221894326e-06, + "loss": 0.3978, + "step": 5468 + }, + { + "epoch": 0.16, + "grad_norm": 1.6073238225270596, + "learning_rate": 9.572478200349355e-06, + "loss": 0.4019, + "step": 5469 + }, + { + "epoch": 0.16, + "grad_norm": 1.4360021171867536, + "learning_rate": 9.572288138452215e-06, + "loss": 0.3899, + "step": 5470 + }, + { + "epoch": 0.16, + "grad_norm": 1.4469576116236436, + "learning_rate": 9.57209803620458e-06, + "loss": 0.3729, + "step": 5471 + }, + { + "epoch": 0.16, + "grad_norm": 1.6247827694131516, + "learning_rate": 9.57190789360813e-06, + "loss": 0.4069, + "step": 5472 + }, + { + "epoch": 0.16, + "grad_norm": 1.5778951925601963, + "learning_rate": 9.571717710664543e-06, + "loss": 0.3949, + "step": 5473 + }, + { + "epoch": 0.16, + "grad_norm": 1.4587771879776028, + "learning_rate": 9.571527487375494e-06, + "loss": 0.4072, + "step": 5474 + }, + { + "epoch": 0.16, + "grad_norm": 1.3625618422734758, + "learning_rate": 9.571337223742664e-06, + "loss": 0.3781, + "step": 5475 + }, + { + "epoch": 0.16, + "grad_norm": 1.5244630017656193, + "learning_rate": 9.571146919767736e-06, + "loss": 0.3886, + "step": 5476 + }, + { + "epoch": 0.16, + "grad_norm": 1.6894789226521878, + "learning_rate": 9.570956575452382e-06, + "loss": 0.392, + "step": 5477 + }, + { + "epoch": 0.16, + "grad_norm": 1.9394089747795928, + "learning_rate": 9.570766190798288e-06, + "loss": 0.4014, + "step": 5478 + }, + { + "epoch": 0.16, + "grad_norm": 1.7077424261645566, + "learning_rate": 9.57057576580713e-06, + "loss": 0.3576, + "step": 5479 + }, + { + "epoch": 0.16, + "grad_norm": 1.6549158292843158, + "learning_rate": 9.57038530048059e-06, + "loss": 0.3959, + "step": 5480 + }, + { + "epoch": 0.16, + "grad_norm": 1.470425687319113, + "learning_rate": 9.570194794820351e-06, + "loss": 0.3881, + "step": 5481 + }, + { + "epoch": 0.16, + "grad_norm": 1.602963469444188, + "learning_rate": 9.57000424882809e-06, + "loss": 0.4202, + "step": 5482 + }, + { + "epoch": 0.16, + "grad_norm": 1.701838351852067, + "learning_rate": 9.569813662505493e-06, + "loss": 0.3828, + "step": 5483 + }, + { + "epoch": 0.16, + "grad_norm": 2.3071580282716857, + "learning_rate": 9.569623035854239e-06, + "loss": 0.4176, + "step": 5484 + }, + { + "epoch": 0.16, + "grad_norm": 1.1642610006503191, + "learning_rate": 9.56943236887601e-06, + "loss": 0.6224, + "step": 5485 + }, + { + "epoch": 0.16, + "grad_norm": 1.6732145382691952, + "learning_rate": 9.56924166157249e-06, + "loss": 0.3909, + "step": 5486 + }, + { + "epoch": 0.16, + "grad_norm": 2.345271312610796, + "learning_rate": 9.569050913945363e-06, + "loss": 0.476, + "step": 5487 + }, + { + "epoch": 0.16, + "grad_norm": 1.7082818236912316, + "learning_rate": 9.568860125996308e-06, + "loss": 0.389, + "step": 5488 + }, + { + "epoch": 0.16, + "grad_norm": 1.3950327939662501, + "learning_rate": 9.568669297727016e-06, + "loss": 0.3558, + "step": 5489 + }, + { + "epoch": 0.16, + "grad_norm": 1.7764071008460862, + "learning_rate": 9.568478429139163e-06, + "loss": 0.3945, + "step": 5490 + }, + { + "epoch": 0.16, + "grad_norm": 1.357247706839144, + "learning_rate": 9.568287520234438e-06, + "loss": 0.3802, + "step": 5491 + }, + { + "epoch": 0.16, + "grad_norm": 1.481678290194632, + "learning_rate": 9.568096571014527e-06, + "loss": 0.39, + "step": 5492 + }, + { + "epoch": 0.16, + "grad_norm": 1.677411521581406, + "learning_rate": 9.567905581481112e-06, + "loss": 0.4017, + "step": 5493 + }, + { + "epoch": 0.16, + "grad_norm": 1.55767204740012, + "learning_rate": 9.567714551635878e-06, + "loss": 0.4061, + "step": 5494 + }, + { + "epoch": 0.16, + "grad_norm": 1.5165807995675515, + "learning_rate": 9.567523481480514e-06, + "loss": 0.393, + "step": 5495 + }, + { + "epoch": 0.16, + "grad_norm": 1.6258330787007071, + "learning_rate": 9.567332371016704e-06, + "loss": 0.4161, + "step": 5496 + }, + { + "epoch": 0.16, + "grad_norm": 1.9825808725542118, + "learning_rate": 9.567141220246136e-06, + "loss": 0.3879, + "step": 5497 + }, + { + "epoch": 0.16, + "grad_norm": 1.4286991317154938, + "learning_rate": 9.566950029170495e-06, + "loss": 0.3995, + "step": 5498 + }, + { + "epoch": 0.16, + "grad_norm": 1.5224029100001784, + "learning_rate": 9.56675879779147e-06, + "loss": 0.4389, + "step": 5499 + }, + { + "epoch": 0.16, + "grad_norm": 1.5987625786347126, + "learning_rate": 9.566567526110746e-06, + "loss": 0.3689, + "step": 5500 + }, + { + "epoch": 0.16, + "grad_norm": 2.3506220764449552, + "learning_rate": 9.566376214130015e-06, + "loss": 0.3926, + "step": 5501 + }, + { + "epoch": 0.16, + "grad_norm": 2.923737624595699, + "learning_rate": 9.566184861850965e-06, + "loss": 0.4159, + "step": 5502 + }, + { + "epoch": 0.16, + "grad_norm": 1.6249529339184365, + "learning_rate": 9.56599346927528e-06, + "loss": 0.3675, + "step": 5503 + }, + { + "epoch": 0.16, + "grad_norm": 1.582280127720596, + "learning_rate": 9.565802036404652e-06, + "loss": 0.3724, + "step": 5504 + }, + { + "epoch": 0.16, + "grad_norm": 1.6076404499818695, + "learning_rate": 9.565610563240771e-06, + "loss": 0.3722, + "step": 5505 + }, + { + "epoch": 0.16, + "grad_norm": 1.427911261030577, + "learning_rate": 9.565419049785327e-06, + "loss": 0.3966, + "step": 5506 + }, + { + "epoch": 0.16, + "grad_norm": 1.482664439681381, + "learning_rate": 9.565227496040009e-06, + "loss": 0.3738, + "step": 5507 + }, + { + "epoch": 0.16, + "grad_norm": 1.6112957955762734, + "learning_rate": 9.565035902006507e-06, + "loss": 0.4054, + "step": 5508 + }, + { + "epoch": 0.16, + "grad_norm": 1.3185897310615637, + "learning_rate": 9.564844267686513e-06, + "loss": 0.3771, + "step": 5509 + }, + { + "epoch": 0.16, + "grad_norm": 3.432053524221619, + "learning_rate": 9.564652593081716e-06, + "loss": 0.3728, + "step": 5510 + }, + { + "epoch": 0.16, + "grad_norm": 1.5286851989903036, + "learning_rate": 9.56446087819381e-06, + "loss": 0.393, + "step": 5511 + }, + { + "epoch": 0.16, + "grad_norm": 1.7113053447104865, + "learning_rate": 9.564269123024488e-06, + "loss": 0.3907, + "step": 5512 + }, + { + "epoch": 0.16, + "grad_norm": 1.6412112226950977, + "learning_rate": 9.56407732757544e-06, + "loss": 0.4124, + "step": 5513 + }, + { + "epoch": 0.16, + "grad_norm": 1.551601923246361, + "learning_rate": 9.563885491848357e-06, + "loss": 0.3971, + "step": 5514 + }, + { + "epoch": 0.16, + "grad_norm": 1.5120866768676606, + "learning_rate": 9.563693615844936e-06, + "loss": 0.3718, + "step": 5515 + }, + { + "epoch": 0.16, + "grad_norm": 1.7656283459183737, + "learning_rate": 9.563501699566866e-06, + "loss": 0.4099, + "step": 5516 + }, + { + "epoch": 0.16, + "grad_norm": 1.6091999366173955, + "learning_rate": 9.563309743015844e-06, + "loss": 0.4206, + "step": 5517 + }, + { + "epoch": 0.16, + "grad_norm": 1.7537045928738866, + "learning_rate": 9.563117746193565e-06, + "loss": 0.3923, + "step": 5518 + }, + { + "epoch": 0.16, + "grad_norm": 2.0619019997243018, + "learning_rate": 9.562925709101719e-06, + "loss": 0.4306, + "step": 5519 + }, + { + "epoch": 0.16, + "grad_norm": 1.4827202591083182, + "learning_rate": 9.562733631742003e-06, + "loss": 0.3839, + "step": 5520 + }, + { + "epoch": 0.16, + "grad_norm": 1.6185411225416582, + "learning_rate": 9.562541514116113e-06, + "loss": 0.3932, + "step": 5521 + }, + { + "epoch": 0.16, + "grad_norm": 2.066229507675112, + "learning_rate": 9.562349356225743e-06, + "loss": 0.3701, + "step": 5522 + }, + { + "epoch": 0.16, + "grad_norm": 1.4913138403591901, + "learning_rate": 9.562157158072588e-06, + "loss": 0.3829, + "step": 5523 + }, + { + "epoch": 0.16, + "grad_norm": 1.3924665471459154, + "learning_rate": 9.561964919658346e-06, + "loss": 0.3824, + "step": 5524 + }, + { + "epoch": 0.16, + "grad_norm": 1.8720094666952611, + "learning_rate": 9.561772640984714e-06, + "loss": 0.3894, + "step": 5525 + }, + { + "epoch": 0.16, + "grad_norm": 1.5172229340459917, + "learning_rate": 9.561580322053387e-06, + "loss": 0.4206, + "step": 5526 + }, + { + "epoch": 0.16, + "grad_norm": 1.5105599207278182, + "learning_rate": 9.561387962866064e-06, + "loss": 0.3821, + "step": 5527 + }, + { + "epoch": 0.16, + "grad_norm": 1.3315678731055482, + "learning_rate": 9.561195563424442e-06, + "loss": 0.3632, + "step": 5528 + }, + { + "epoch": 0.16, + "grad_norm": 1.6970316160404968, + "learning_rate": 9.561003123730218e-06, + "loss": 0.3787, + "step": 5529 + }, + { + "epoch": 0.16, + "grad_norm": 1.6617405540520418, + "learning_rate": 9.56081064378509e-06, + "loss": 0.4108, + "step": 5530 + }, + { + "epoch": 0.16, + "grad_norm": 1.9514408561643752, + "learning_rate": 9.560618123590757e-06, + "loss": 0.3966, + "step": 5531 + }, + { + "epoch": 0.16, + "grad_norm": 1.6421912392090032, + "learning_rate": 9.56042556314892e-06, + "loss": 0.3871, + "step": 5532 + }, + { + "epoch": 0.16, + "grad_norm": 1.6449761234681488, + "learning_rate": 9.560232962461276e-06, + "loss": 0.4219, + "step": 5533 + }, + { + "epoch": 0.16, + "grad_norm": 1.735624854858795, + "learning_rate": 9.560040321529527e-06, + "loss": 0.3876, + "step": 5534 + }, + { + "epoch": 0.16, + "grad_norm": 1.7813405705994458, + "learning_rate": 9.55984764035537e-06, + "loss": 0.3713, + "step": 5535 + }, + { + "epoch": 0.16, + "grad_norm": 1.5055683812382885, + "learning_rate": 9.559654918940507e-06, + "loss": 0.41, + "step": 5536 + }, + { + "epoch": 0.16, + "grad_norm": 1.4660882410305671, + "learning_rate": 9.55946215728664e-06, + "loss": 0.3509, + "step": 5537 + }, + { + "epoch": 0.16, + "grad_norm": 1.4564398700850354, + "learning_rate": 9.559269355395467e-06, + "loss": 0.3829, + "step": 5538 + }, + { + "epoch": 0.16, + "grad_norm": 1.4210837317008884, + "learning_rate": 9.559076513268692e-06, + "loss": 0.3942, + "step": 5539 + }, + { + "epoch": 0.16, + "grad_norm": 1.4776093833631327, + "learning_rate": 9.558883630908018e-06, + "loss": 0.3701, + "step": 5540 + }, + { + "epoch": 0.16, + "grad_norm": 1.5411775970462698, + "learning_rate": 9.558690708315144e-06, + "loss": 0.4269, + "step": 5541 + }, + { + "epoch": 0.16, + "grad_norm": 1.7543186849233925, + "learning_rate": 9.558497745491773e-06, + "loss": 0.4039, + "step": 5542 + }, + { + "epoch": 0.16, + "grad_norm": 1.5169613229481576, + "learning_rate": 9.55830474243961e-06, + "loss": 0.3744, + "step": 5543 + }, + { + "epoch": 0.16, + "grad_norm": 2.5128254150129834, + "learning_rate": 9.558111699160357e-06, + "loss": 0.3972, + "step": 5544 + }, + { + "epoch": 0.16, + "grad_norm": 1.6067110605509944, + "learning_rate": 9.557918615655718e-06, + "loss": 0.3768, + "step": 5545 + }, + { + "epoch": 0.16, + "grad_norm": 1.740441860227371, + "learning_rate": 9.557725491927395e-06, + "loss": 0.4058, + "step": 5546 + }, + { + "epoch": 0.16, + "grad_norm": 1.6001470508666293, + "learning_rate": 9.557532327977095e-06, + "loss": 0.4224, + "step": 5547 + }, + { + "epoch": 0.16, + "grad_norm": 1.4721380177717207, + "learning_rate": 9.557339123806522e-06, + "loss": 0.4339, + "step": 5548 + }, + { + "epoch": 0.16, + "grad_norm": 1.5443641673851247, + "learning_rate": 9.557145879417382e-06, + "loss": 0.4086, + "step": 5549 + }, + { + "epoch": 0.16, + "grad_norm": 1.8539549569761338, + "learning_rate": 9.556952594811376e-06, + "loss": 0.4235, + "step": 5550 + }, + { + "epoch": 0.16, + "grad_norm": 2.3346036410506765, + "learning_rate": 9.556759269990215e-06, + "loss": 0.4166, + "step": 5551 + }, + { + "epoch": 0.16, + "grad_norm": 1.8430371947650126, + "learning_rate": 9.556565904955601e-06, + "loss": 0.3927, + "step": 5552 + }, + { + "epoch": 0.16, + "grad_norm": 1.5801551578037014, + "learning_rate": 9.556372499709244e-06, + "loss": 0.4032, + "step": 5553 + }, + { + "epoch": 0.16, + "grad_norm": 1.454897917475249, + "learning_rate": 9.55617905425285e-06, + "loss": 0.4208, + "step": 5554 + }, + { + "epoch": 0.16, + "grad_norm": 1.507899796102559, + "learning_rate": 9.555985568588124e-06, + "loss": 0.4205, + "step": 5555 + }, + { + "epoch": 0.16, + "grad_norm": 1.5271792085168001, + "learning_rate": 9.555792042716773e-06, + "loss": 0.3671, + "step": 5556 + }, + { + "epoch": 0.16, + "grad_norm": 1.5623887338537399, + "learning_rate": 9.55559847664051e-06, + "loss": 0.3653, + "step": 5557 + }, + { + "epoch": 0.16, + "grad_norm": 1.444702618579969, + "learning_rate": 9.555404870361039e-06, + "loss": 0.376, + "step": 5558 + }, + { + "epoch": 0.16, + "grad_norm": 1.5456437311942588, + "learning_rate": 9.555211223880067e-06, + "loss": 0.387, + "step": 5559 + }, + { + "epoch": 0.16, + "grad_norm": 1.7373841588078065, + "learning_rate": 9.555017537199309e-06, + "loss": 0.4008, + "step": 5560 + }, + { + "epoch": 0.16, + "grad_norm": 1.4253349873268124, + "learning_rate": 9.554823810320467e-06, + "loss": 0.3729, + "step": 5561 + }, + { + "epoch": 0.16, + "grad_norm": 1.6023385212977461, + "learning_rate": 9.554630043245257e-06, + "loss": 0.4228, + "step": 5562 + }, + { + "epoch": 0.16, + "grad_norm": 1.7752575091191403, + "learning_rate": 9.554436235975386e-06, + "loss": 0.3713, + "step": 5563 + }, + { + "epoch": 0.16, + "grad_norm": 1.4774269311197596, + "learning_rate": 9.554242388512565e-06, + "loss": 0.3743, + "step": 5564 + }, + { + "epoch": 0.16, + "grad_norm": 1.7885464958205917, + "learning_rate": 9.554048500858503e-06, + "loss": 0.4095, + "step": 5565 + }, + { + "epoch": 0.16, + "grad_norm": 1.5581748761602912, + "learning_rate": 9.553854573014913e-06, + "loss": 0.3845, + "step": 5566 + }, + { + "epoch": 0.16, + "grad_norm": 2.8774894110081055, + "learning_rate": 9.553660604983506e-06, + "loss": 0.4031, + "step": 5567 + }, + { + "epoch": 0.16, + "grad_norm": 1.5583741174987278, + "learning_rate": 9.553466596765995e-06, + "loss": 0.4197, + "step": 5568 + }, + { + "epoch": 0.16, + "grad_norm": 1.7729006010620212, + "learning_rate": 9.553272548364089e-06, + "loss": 0.3896, + "step": 5569 + }, + { + "epoch": 0.16, + "grad_norm": 1.5791557133176088, + "learning_rate": 9.553078459779502e-06, + "loss": 0.4193, + "step": 5570 + }, + { + "epoch": 0.16, + "grad_norm": 1.6131850205370035, + "learning_rate": 9.552884331013948e-06, + "loss": 0.4131, + "step": 5571 + }, + { + "epoch": 0.16, + "grad_norm": 2.6509084971251524, + "learning_rate": 9.55269016206914e-06, + "loss": 0.392, + "step": 5572 + }, + { + "epoch": 0.16, + "grad_norm": 1.1361089803338449, + "learning_rate": 9.552495952946789e-06, + "loss": 0.672, + "step": 5573 + }, + { + "epoch": 0.16, + "grad_norm": 1.3943832200667106, + "learning_rate": 9.552301703648612e-06, + "loss": 0.3768, + "step": 5574 + }, + { + "epoch": 0.16, + "grad_norm": 1.6513427860864698, + "learning_rate": 9.552107414176321e-06, + "loss": 0.4389, + "step": 5575 + }, + { + "epoch": 0.16, + "grad_norm": 1.4627758087862073, + "learning_rate": 9.551913084531631e-06, + "loss": 0.3822, + "step": 5576 + }, + { + "epoch": 0.16, + "grad_norm": 1.6478818196822709, + "learning_rate": 9.55171871471626e-06, + "loss": 0.3993, + "step": 5577 + }, + { + "epoch": 0.16, + "grad_norm": 5.831702898759534, + "learning_rate": 9.551524304731921e-06, + "loss": 0.4071, + "step": 5578 + }, + { + "epoch": 0.16, + "grad_norm": 1.9028627109629115, + "learning_rate": 9.551329854580326e-06, + "loss": 0.3963, + "step": 5579 + }, + { + "epoch": 0.16, + "grad_norm": 1.6676227179075205, + "learning_rate": 9.551135364263198e-06, + "loss": 0.3783, + "step": 5580 + }, + { + "epoch": 0.16, + "grad_norm": 1.6776356940247128, + "learning_rate": 9.550940833782248e-06, + "loss": 0.3824, + "step": 5581 + }, + { + "epoch": 0.16, + "grad_norm": 1.8995529474602386, + "learning_rate": 9.550746263139195e-06, + "loss": 0.4162, + "step": 5582 + }, + { + "epoch": 0.16, + "grad_norm": 1.47920461604366, + "learning_rate": 9.550551652335757e-06, + "loss": 0.3951, + "step": 5583 + }, + { + "epoch": 0.16, + "grad_norm": 1.7313345804363673, + "learning_rate": 9.550357001373649e-06, + "loss": 0.3883, + "step": 5584 + }, + { + "epoch": 0.16, + "grad_norm": 1.4219015141207423, + "learning_rate": 9.550162310254589e-06, + "loss": 0.3992, + "step": 5585 + }, + { + "epoch": 0.16, + "grad_norm": 1.9691386879841026, + "learning_rate": 9.549967578980297e-06, + "loss": 0.3798, + "step": 5586 + }, + { + "epoch": 0.16, + "grad_norm": 2.0841175808165504, + "learning_rate": 9.549772807552492e-06, + "loss": 0.4405, + "step": 5587 + }, + { + "epoch": 0.16, + "grad_norm": 1.6788771682307555, + "learning_rate": 9.549577995972891e-06, + "loss": 0.4049, + "step": 5588 + }, + { + "epoch": 0.16, + "grad_norm": 1.7231299170817032, + "learning_rate": 9.549383144243213e-06, + "loss": 0.3962, + "step": 5589 + }, + { + "epoch": 0.16, + "grad_norm": 1.585142386884427, + "learning_rate": 9.549188252365178e-06, + "loss": 0.408, + "step": 5590 + }, + { + "epoch": 0.16, + "grad_norm": 1.607892318281497, + "learning_rate": 9.548993320340507e-06, + "loss": 0.3944, + "step": 5591 + }, + { + "epoch": 0.16, + "grad_norm": 1.5346253671464742, + "learning_rate": 9.54879834817092e-06, + "loss": 0.3813, + "step": 5592 + }, + { + "epoch": 0.16, + "grad_norm": 1.735582451954774, + "learning_rate": 9.548603335858135e-06, + "loss": 0.4655, + "step": 5593 + }, + { + "epoch": 0.16, + "grad_norm": 1.8624023308487583, + "learning_rate": 9.548408283403877e-06, + "loss": 0.4152, + "step": 5594 + }, + { + "epoch": 0.16, + "grad_norm": 1.7869835427938334, + "learning_rate": 9.548213190809864e-06, + "loss": 0.4077, + "step": 5595 + }, + { + "epoch": 0.16, + "grad_norm": 1.5912863120434273, + "learning_rate": 9.548018058077819e-06, + "loss": 0.4047, + "step": 5596 + }, + { + "epoch": 0.16, + "grad_norm": 1.6399400579396497, + "learning_rate": 9.547822885209466e-06, + "loss": 0.3782, + "step": 5597 + }, + { + "epoch": 0.16, + "grad_norm": 1.4623899328999659, + "learning_rate": 9.547627672206525e-06, + "loss": 0.3957, + "step": 5598 + }, + { + "epoch": 0.16, + "grad_norm": 1.6303933531819972, + "learning_rate": 9.54743241907072e-06, + "loss": 0.4144, + "step": 5599 + }, + { + "epoch": 0.16, + "grad_norm": 1.5703300956982114, + "learning_rate": 9.547237125803771e-06, + "loss": 0.4114, + "step": 5600 + }, + { + "epoch": 0.16, + "grad_norm": 1.571912457430001, + "learning_rate": 9.547041792407405e-06, + "loss": 0.3934, + "step": 5601 + }, + { + "epoch": 0.16, + "grad_norm": 1.660614031063729, + "learning_rate": 9.546846418883346e-06, + "loss": 0.4042, + "step": 5602 + }, + { + "epoch": 0.16, + "grad_norm": 1.3893324797801854, + "learning_rate": 9.546651005233316e-06, + "loss": 0.3666, + "step": 5603 + }, + { + "epoch": 0.16, + "grad_norm": 2.021352102305138, + "learning_rate": 9.54645555145904e-06, + "loss": 0.4122, + "step": 5604 + }, + { + "epoch": 0.16, + "grad_norm": 1.6942865656165467, + "learning_rate": 9.546260057562244e-06, + "loss": 0.3982, + "step": 5605 + }, + { + "epoch": 0.16, + "grad_norm": 1.6355741155754482, + "learning_rate": 9.546064523544652e-06, + "loss": 0.3857, + "step": 5606 + }, + { + "epoch": 0.16, + "grad_norm": 1.5657820604503432, + "learning_rate": 9.54586894940799e-06, + "loss": 0.4002, + "step": 5607 + }, + { + "epoch": 0.16, + "grad_norm": 1.668489194405184, + "learning_rate": 9.545673335153985e-06, + "loss": 0.3921, + "step": 5608 + }, + { + "epoch": 0.16, + "grad_norm": 1.496111156870181, + "learning_rate": 9.545477680784363e-06, + "loss": 0.403, + "step": 5609 + }, + { + "epoch": 0.16, + "grad_norm": 1.5665804884985284, + "learning_rate": 9.545281986300849e-06, + "loss": 0.4174, + "step": 5610 + }, + { + "epoch": 0.16, + "grad_norm": 4.330861825977274, + "learning_rate": 9.54508625170517e-06, + "loss": 0.4129, + "step": 5611 + }, + { + "epoch": 0.16, + "grad_norm": 1.8838054763827512, + "learning_rate": 9.544890476999056e-06, + "loss": 0.4312, + "step": 5612 + }, + { + "epoch": 0.16, + "grad_norm": 1.3675256637396043, + "learning_rate": 9.544694662184233e-06, + "loss": 0.3685, + "step": 5613 + }, + { + "epoch": 0.16, + "grad_norm": 1.9145355436832423, + "learning_rate": 9.544498807262427e-06, + "loss": 0.3725, + "step": 5614 + }, + { + "epoch": 0.16, + "grad_norm": 1.4943316858558007, + "learning_rate": 9.544302912235373e-06, + "loss": 0.4054, + "step": 5615 + }, + { + "epoch": 0.16, + "grad_norm": 1.934867824140978, + "learning_rate": 9.544106977104792e-06, + "loss": 0.3572, + "step": 5616 + }, + { + "epoch": 0.16, + "grad_norm": 3.424305486421568, + "learning_rate": 9.543911001872417e-06, + "loss": 0.3887, + "step": 5617 + }, + { + "epoch": 0.16, + "grad_norm": 1.609583018370331, + "learning_rate": 9.543714986539978e-06, + "loss": 0.386, + "step": 5618 + }, + { + "epoch": 0.16, + "grad_norm": 1.5363119689766724, + "learning_rate": 9.543518931109202e-06, + "loss": 0.4066, + "step": 5619 + }, + { + "epoch": 0.16, + "grad_norm": 1.586725309479272, + "learning_rate": 9.543322835581823e-06, + "loss": 0.4029, + "step": 5620 + }, + { + "epoch": 0.16, + "grad_norm": 1.7670516730184385, + "learning_rate": 9.543126699959569e-06, + "loss": 0.3901, + "step": 5621 + }, + { + "epoch": 0.16, + "grad_norm": 1.6932847591616387, + "learning_rate": 9.54293052424417e-06, + "loss": 0.4325, + "step": 5622 + }, + { + "epoch": 0.16, + "grad_norm": 1.7634923190890845, + "learning_rate": 9.54273430843736e-06, + "loss": 0.3852, + "step": 5623 + }, + { + "epoch": 0.16, + "grad_norm": 2.14370018098897, + "learning_rate": 9.54253805254087e-06, + "loss": 0.4088, + "step": 5624 + }, + { + "epoch": 0.16, + "grad_norm": 1.7376273593362208, + "learning_rate": 9.54234175655643e-06, + "loss": 0.4085, + "step": 5625 + }, + { + "epoch": 0.16, + "grad_norm": 1.3743167831443794, + "learning_rate": 9.542145420485774e-06, + "loss": 0.3548, + "step": 5626 + }, + { + "epoch": 0.16, + "grad_norm": 1.4851471749754612, + "learning_rate": 9.541949044330635e-06, + "loss": 0.3745, + "step": 5627 + }, + { + "epoch": 0.16, + "grad_norm": 1.442280867991643, + "learning_rate": 9.541752628092745e-06, + "loss": 0.384, + "step": 5628 + }, + { + "epoch": 0.16, + "grad_norm": 1.5590014106153574, + "learning_rate": 9.541556171773838e-06, + "loss": 0.391, + "step": 5629 + }, + { + "epoch": 0.16, + "grad_norm": 1.5146952595272456, + "learning_rate": 9.541359675375645e-06, + "loss": 0.4008, + "step": 5630 + }, + { + "epoch": 0.16, + "grad_norm": 1.649459024342128, + "learning_rate": 9.541163138899905e-06, + "loss": 0.3894, + "step": 5631 + }, + { + "epoch": 0.16, + "grad_norm": 1.7910544534805222, + "learning_rate": 9.540966562348351e-06, + "loss": 0.3974, + "step": 5632 + }, + { + "epoch": 0.16, + "grad_norm": 1.7088982667769381, + "learning_rate": 9.540769945722715e-06, + "loss": 0.3702, + "step": 5633 + }, + { + "epoch": 0.16, + "grad_norm": 1.772087534143612, + "learning_rate": 9.540573289024732e-06, + "loss": 0.3702, + "step": 5634 + }, + { + "epoch": 0.16, + "grad_norm": 1.7765541399983584, + "learning_rate": 9.540376592256142e-06, + "loss": 0.3871, + "step": 5635 + }, + { + "epoch": 0.16, + "grad_norm": 1.423200692262294, + "learning_rate": 9.540179855418677e-06, + "loss": 0.3856, + "step": 5636 + }, + { + "epoch": 0.16, + "grad_norm": 1.4562967505680577, + "learning_rate": 9.539983078514074e-06, + "loss": 0.3726, + "step": 5637 + }, + { + "epoch": 0.16, + "grad_norm": 1.526874897146247, + "learning_rate": 9.539786261544072e-06, + "loss": 0.4171, + "step": 5638 + }, + { + "epoch": 0.16, + "grad_norm": 1.9973991974918266, + "learning_rate": 9.539589404510404e-06, + "loss": 0.4005, + "step": 5639 + }, + { + "epoch": 0.16, + "grad_norm": 1.5077910033780668, + "learning_rate": 9.539392507414811e-06, + "loss": 0.4069, + "step": 5640 + }, + { + "epoch": 0.16, + "grad_norm": 1.5853780202800958, + "learning_rate": 9.539195570259026e-06, + "loss": 0.3584, + "step": 5641 + }, + { + "epoch": 0.16, + "grad_norm": 1.7148282290377301, + "learning_rate": 9.538998593044793e-06, + "loss": 0.378, + "step": 5642 + }, + { + "epoch": 0.16, + "grad_norm": 1.4880974398725761, + "learning_rate": 9.538801575773845e-06, + "loss": 0.3826, + "step": 5643 + }, + { + "epoch": 0.16, + "grad_norm": 1.56344953282308, + "learning_rate": 9.538604518447923e-06, + "loss": 0.3842, + "step": 5644 + }, + { + "epoch": 0.16, + "grad_norm": 1.1459274987777932, + "learning_rate": 9.538407421068764e-06, + "loss": 0.6323, + "step": 5645 + }, + { + "epoch": 0.16, + "grad_norm": 1.664205587127664, + "learning_rate": 9.538210283638112e-06, + "loss": 0.427, + "step": 5646 + }, + { + "epoch": 0.16, + "grad_norm": 1.4943824829481258, + "learning_rate": 9.538013106157704e-06, + "loss": 0.3808, + "step": 5647 + }, + { + "epoch": 0.16, + "grad_norm": 1.719900852715556, + "learning_rate": 9.537815888629278e-06, + "loss": 0.3828, + "step": 5648 + }, + { + "epoch": 0.16, + "grad_norm": 1.8047022582427024, + "learning_rate": 9.537618631054576e-06, + "loss": 0.4187, + "step": 5649 + }, + { + "epoch": 0.16, + "grad_norm": 1.5906481197773161, + "learning_rate": 9.537421333435342e-06, + "loss": 0.4182, + "step": 5650 + }, + { + "epoch": 0.16, + "grad_norm": 2.1661644352456344, + "learning_rate": 9.537223995773312e-06, + "loss": 0.3847, + "step": 5651 + }, + { + "epoch": 0.16, + "grad_norm": 3.1688054058783925, + "learning_rate": 9.537026618070233e-06, + "loss": 0.3919, + "step": 5652 + }, + { + "epoch": 0.16, + "grad_norm": 1.611148271906741, + "learning_rate": 9.53682920032784e-06, + "loss": 0.4019, + "step": 5653 + }, + { + "epoch": 0.16, + "grad_norm": 1.8776234589894385, + "learning_rate": 9.536631742547882e-06, + "loss": 0.3946, + "step": 5654 + }, + { + "epoch": 0.16, + "grad_norm": 2.2238836714615586, + "learning_rate": 9.536434244732097e-06, + "loss": 0.4167, + "step": 5655 + }, + { + "epoch": 0.16, + "grad_norm": 1.665855958371395, + "learning_rate": 9.536236706882229e-06, + "loss": 0.4039, + "step": 5656 + }, + { + "epoch": 0.16, + "grad_norm": 1.533874922357994, + "learning_rate": 9.536039129000023e-06, + "loss": 0.3968, + "step": 5657 + }, + { + "epoch": 0.16, + "grad_norm": 1.9155711360002186, + "learning_rate": 9.53584151108722e-06, + "loss": 0.4065, + "step": 5658 + }, + { + "epoch": 0.16, + "grad_norm": 1.6870095517498984, + "learning_rate": 9.535643853145567e-06, + "loss": 0.4033, + "step": 5659 + }, + { + "epoch": 0.16, + "grad_norm": 1.4223864411236362, + "learning_rate": 9.535446155176805e-06, + "loss": 0.3867, + "step": 5660 + }, + { + "epoch": 0.16, + "grad_norm": 1.6522919168166361, + "learning_rate": 9.53524841718268e-06, + "loss": 0.4052, + "step": 5661 + }, + { + "epoch": 0.16, + "grad_norm": 1.542905858093312, + "learning_rate": 9.53505063916494e-06, + "loss": 0.4088, + "step": 5662 + }, + { + "epoch": 0.16, + "grad_norm": 1.977750356518702, + "learning_rate": 9.534852821125326e-06, + "loss": 0.3916, + "step": 5663 + }, + { + "epoch": 0.16, + "grad_norm": 1.9300895959432183, + "learning_rate": 9.534654963065586e-06, + "loss": 0.4005, + "step": 5664 + }, + { + "epoch": 0.16, + "grad_norm": 1.7849630299689345, + "learning_rate": 9.534457064987465e-06, + "loss": 0.413, + "step": 5665 + }, + { + "epoch": 0.16, + "grad_norm": 2.9162088329792324, + "learning_rate": 9.53425912689271e-06, + "loss": 0.4157, + "step": 5666 + }, + { + "epoch": 0.16, + "grad_norm": 1.970446976267435, + "learning_rate": 9.534061148783071e-06, + "loss": 0.4788, + "step": 5667 + }, + { + "epoch": 0.16, + "grad_norm": 1.4472919028586397, + "learning_rate": 9.53386313066029e-06, + "loss": 0.4212, + "step": 5668 + }, + { + "epoch": 0.16, + "grad_norm": 1.6041556296806012, + "learning_rate": 9.533665072526114e-06, + "loss": 0.3968, + "step": 5669 + }, + { + "epoch": 0.16, + "grad_norm": 1.6327563263430713, + "learning_rate": 9.533466974382297e-06, + "loss": 0.3907, + "step": 5670 + }, + { + "epoch": 0.16, + "grad_norm": 1.5542501841140726, + "learning_rate": 9.533268836230582e-06, + "loss": 0.3601, + "step": 5671 + }, + { + "epoch": 0.16, + "grad_norm": 1.5674963840766967, + "learning_rate": 9.533070658072719e-06, + "loss": 0.3914, + "step": 5672 + }, + { + "epoch": 0.16, + "grad_norm": 1.4636012006481807, + "learning_rate": 9.532872439910459e-06, + "loss": 0.3965, + "step": 5673 + }, + { + "epoch": 0.16, + "grad_norm": 1.5790653982361162, + "learning_rate": 9.532674181745547e-06, + "loss": 0.365, + "step": 5674 + }, + { + "epoch": 0.16, + "grad_norm": 1.4698942522589105, + "learning_rate": 9.532475883579738e-06, + "loss": 0.372, + "step": 5675 + }, + { + "epoch": 0.16, + "grad_norm": 1.722528078935819, + "learning_rate": 9.532277545414776e-06, + "loss": 0.392, + "step": 5676 + }, + { + "epoch": 0.16, + "grad_norm": 1.622790643736734, + "learning_rate": 9.532079167252414e-06, + "loss": 0.3649, + "step": 5677 + }, + { + "epoch": 0.16, + "grad_norm": 1.7185945943778829, + "learning_rate": 9.531880749094407e-06, + "loss": 0.4014, + "step": 5678 + }, + { + "epoch": 0.16, + "grad_norm": 1.6716540434309575, + "learning_rate": 9.531682290942498e-06, + "loss": 0.3937, + "step": 5679 + }, + { + "epoch": 0.16, + "grad_norm": 10.502564998025221, + "learning_rate": 9.531483792798446e-06, + "loss": 0.4064, + "step": 5680 + }, + { + "epoch": 0.16, + "grad_norm": 1.6814945723607797, + "learning_rate": 9.531285254663997e-06, + "loss": 0.3953, + "step": 5681 + }, + { + "epoch": 0.16, + "grad_norm": 2.1530681903537716, + "learning_rate": 9.531086676540906e-06, + "loss": 0.3975, + "step": 5682 + }, + { + "epoch": 0.16, + "grad_norm": 1.4169190455860976, + "learning_rate": 9.530888058430926e-06, + "loss": 0.3937, + "step": 5683 + }, + { + "epoch": 0.16, + "grad_norm": 1.617676943130948, + "learning_rate": 9.530689400335806e-06, + "loss": 0.3954, + "step": 5684 + }, + { + "epoch": 0.16, + "grad_norm": 1.5585437611964712, + "learning_rate": 9.530490702257304e-06, + "loss": 0.3768, + "step": 5685 + }, + { + "epoch": 0.16, + "grad_norm": 1.6212517683825685, + "learning_rate": 9.53029196419717e-06, + "loss": 0.4102, + "step": 5686 + }, + { + "epoch": 0.16, + "grad_norm": 1.6801823262035496, + "learning_rate": 9.530093186157159e-06, + "loss": 0.4259, + "step": 5687 + }, + { + "epoch": 0.16, + "grad_norm": 1.6517102250285705, + "learning_rate": 9.529894368139025e-06, + "loss": 0.3672, + "step": 5688 + }, + { + "epoch": 0.17, + "grad_norm": 1.581173308881594, + "learning_rate": 9.529695510144525e-06, + "loss": 0.4119, + "step": 5689 + }, + { + "epoch": 0.17, + "grad_norm": 1.3626439679440006, + "learning_rate": 9.52949661217541e-06, + "loss": 0.3599, + "step": 5690 + }, + { + "epoch": 0.17, + "grad_norm": 1.4558173575877464, + "learning_rate": 9.529297674233439e-06, + "loss": 0.3844, + "step": 5691 + }, + { + "epoch": 0.17, + "grad_norm": 1.495458643540908, + "learning_rate": 9.529098696320364e-06, + "loss": 0.3888, + "step": 5692 + }, + { + "epoch": 0.17, + "grad_norm": 1.5912046321825677, + "learning_rate": 9.528899678437944e-06, + "loss": 0.3861, + "step": 5693 + }, + { + "epoch": 0.17, + "grad_norm": 2.0283553671414603, + "learning_rate": 9.528700620587934e-06, + "loss": 0.3612, + "step": 5694 + }, + { + "epoch": 0.17, + "grad_norm": 2.661669558621416, + "learning_rate": 9.52850152277209e-06, + "loss": 0.3909, + "step": 5695 + }, + { + "epoch": 0.17, + "grad_norm": 1.7377399017254875, + "learning_rate": 9.52830238499217e-06, + "loss": 0.412, + "step": 5696 + }, + { + "epoch": 0.17, + "grad_norm": 1.463260101771579, + "learning_rate": 9.528103207249932e-06, + "loss": 0.3998, + "step": 5697 + }, + { + "epoch": 0.17, + "grad_norm": 2.4907624141670444, + "learning_rate": 9.527903989547132e-06, + "loss": 0.4483, + "step": 5698 + }, + { + "epoch": 0.17, + "grad_norm": 2.493361895296641, + "learning_rate": 9.52770473188553e-06, + "loss": 0.4056, + "step": 5699 + }, + { + "epoch": 0.17, + "grad_norm": 2.3390450072595765, + "learning_rate": 9.527505434266882e-06, + "loss": 0.404, + "step": 5700 + }, + { + "epoch": 0.17, + "grad_norm": 1.4573685621889914, + "learning_rate": 9.52730609669295e-06, + "loss": 0.3931, + "step": 5701 + }, + { + "epoch": 0.17, + "grad_norm": 1.726104973051191, + "learning_rate": 9.527106719165491e-06, + "loss": 0.4185, + "step": 5702 + }, + { + "epoch": 0.17, + "grad_norm": 1.7117358196666959, + "learning_rate": 9.526907301686264e-06, + "loss": 0.3927, + "step": 5703 + }, + { + "epoch": 0.17, + "grad_norm": 1.38430235318933, + "learning_rate": 9.526707844257031e-06, + "loss": 0.3838, + "step": 5704 + }, + { + "epoch": 0.17, + "grad_norm": 1.738082627808141, + "learning_rate": 9.526508346879551e-06, + "loss": 0.3943, + "step": 5705 + }, + { + "epoch": 0.17, + "grad_norm": 1.4890498450295417, + "learning_rate": 9.526308809555584e-06, + "loss": 0.3994, + "step": 5706 + }, + { + "epoch": 0.17, + "grad_norm": 1.5037825759689452, + "learning_rate": 9.526109232286892e-06, + "loss": 0.4217, + "step": 5707 + }, + { + "epoch": 0.17, + "grad_norm": 1.427161190805047, + "learning_rate": 9.525909615075236e-06, + "loss": 0.3943, + "step": 5708 + }, + { + "epoch": 0.17, + "grad_norm": 1.3816783236833523, + "learning_rate": 9.525709957922376e-06, + "loss": 0.3784, + "step": 5709 + }, + { + "epoch": 0.17, + "grad_norm": 1.5113262397047453, + "learning_rate": 9.525510260830077e-06, + "loss": 0.4001, + "step": 5710 + }, + { + "epoch": 0.17, + "grad_norm": 1.3562329830709063, + "learning_rate": 9.5253105238001e-06, + "loss": 0.3557, + "step": 5711 + }, + { + "epoch": 0.17, + "grad_norm": 1.385430101066098, + "learning_rate": 9.525110746834205e-06, + "loss": 0.4298, + "step": 5712 + }, + { + "epoch": 0.17, + "grad_norm": 1.6548791658177542, + "learning_rate": 9.524910929934159e-06, + "loss": 0.3964, + "step": 5713 + }, + { + "epoch": 0.17, + "grad_norm": 1.743340277198476, + "learning_rate": 9.524711073101724e-06, + "loss": 0.4199, + "step": 5714 + }, + { + "epoch": 0.17, + "grad_norm": 1.4903550101898744, + "learning_rate": 9.524511176338665e-06, + "loss": 0.386, + "step": 5715 + }, + { + "epoch": 0.17, + "grad_norm": 1.425929194995649, + "learning_rate": 9.524311239646742e-06, + "loss": 0.3944, + "step": 5716 + }, + { + "epoch": 0.17, + "grad_norm": 1.395813782789141, + "learning_rate": 9.524111263027724e-06, + "loss": 0.378, + "step": 5717 + }, + { + "epoch": 0.17, + "grad_norm": 1.6137242521121848, + "learning_rate": 9.523911246483373e-06, + "loss": 0.4189, + "step": 5718 + }, + { + "epoch": 0.17, + "grad_norm": 1.4357625784020207, + "learning_rate": 9.523711190015455e-06, + "loss": 0.387, + "step": 5719 + }, + { + "epoch": 0.17, + "grad_norm": 1.3336528715970133, + "learning_rate": 9.523511093625736e-06, + "loss": 0.3864, + "step": 5720 + }, + { + "epoch": 0.17, + "grad_norm": 1.5043295952888713, + "learning_rate": 9.523310957315981e-06, + "loss": 0.3867, + "step": 5721 + }, + { + "epoch": 0.17, + "grad_norm": 1.416650280806825, + "learning_rate": 9.523110781087957e-06, + "loss": 0.3718, + "step": 5722 + }, + { + "epoch": 0.17, + "grad_norm": 1.5328116318249942, + "learning_rate": 9.522910564943431e-06, + "loss": 0.388, + "step": 5723 + }, + { + "epoch": 0.17, + "grad_norm": 1.3612276024473497, + "learning_rate": 9.522710308884168e-06, + "loss": 0.3816, + "step": 5724 + }, + { + "epoch": 0.17, + "grad_norm": 1.4896831521516292, + "learning_rate": 9.522510012911936e-06, + "loss": 0.3819, + "step": 5725 + }, + { + "epoch": 0.17, + "grad_norm": 1.5582632817013482, + "learning_rate": 9.522309677028503e-06, + "loss": 0.4052, + "step": 5726 + }, + { + "epoch": 0.17, + "grad_norm": 1.463947621813182, + "learning_rate": 9.522109301235637e-06, + "loss": 0.3954, + "step": 5727 + }, + { + "epoch": 0.17, + "grad_norm": 1.629353636792905, + "learning_rate": 9.521908885535109e-06, + "loss": 0.3946, + "step": 5728 + }, + { + "epoch": 0.17, + "grad_norm": 1.453831050568572, + "learning_rate": 9.521708429928682e-06, + "loss": 0.3913, + "step": 5729 + }, + { + "epoch": 0.17, + "grad_norm": 1.5134944961580912, + "learning_rate": 9.521507934418129e-06, + "loss": 0.3844, + "step": 5730 + }, + { + "epoch": 0.17, + "grad_norm": 1.7301902188885598, + "learning_rate": 9.521307399005218e-06, + "loss": 0.4169, + "step": 5731 + }, + { + "epoch": 0.17, + "grad_norm": 1.4657093242009178, + "learning_rate": 9.521106823691719e-06, + "loss": 0.4011, + "step": 5732 + }, + { + "epoch": 0.17, + "grad_norm": 1.294413750897762, + "learning_rate": 9.520906208479401e-06, + "loss": 0.3626, + "step": 5733 + }, + { + "epoch": 0.17, + "grad_norm": 1.6701646962448133, + "learning_rate": 9.520705553370037e-06, + "loss": 0.3838, + "step": 5734 + }, + { + "epoch": 0.17, + "grad_norm": 1.5902945841679788, + "learning_rate": 9.520504858365397e-06, + "loss": 0.3812, + "step": 5735 + }, + { + "epoch": 0.17, + "grad_norm": 1.70390756342664, + "learning_rate": 9.52030412346725e-06, + "loss": 0.4008, + "step": 5736 + }, + { + "epoch": 0.17, + "grad_norm": 1.5699988781873853, + "learning_rate": 9.52010334867737e-06, + "loss": 0.4106, + "step": 5737 + }, + { + "epoch": 0.17, + "grad_norm": 2.9011356919482987, + "learning_rate": 9.519902533997524e-06, + "loss": 0.3669, + "step": 5738 + }, + { + "epoch": 0.17, + "grad_norm": 1.335331516249315, + "learning_rate": 9.519701679429494e-06, + "loss": 0.3671, + "step": 5739 + }, + { + "epoch": 0.17, + "grad_norm": 1.5352086408682073, + "learning_rate": 9.519500784975042e-06, + "loss": 0.4048, + "step": 5740 + }, + { + "epoch": 0.17, + "grad_norm": 1.7971516315750837, + "learning_rate": 9.519299850635949e-06, + "loss": 0.4342, + "step": 5741 + }, + { + "epoch": 0.17, + "grad_norm": 1.5276218327257594, + "learning_rate": 9.519098876413982e-06, + "loss": 0.3961, + "step": 5742 + }, + { + "epoch": 0.17, + "grad_norm": 1.574416186786297, + "learning_rate": 9.51889786231092e-06, + "loss": 0.3981, + "step": 5743 + }, + { + "epoch": 0.17, + "grad_norm": 1.6213906020698963, + "learning_rate": 9.51869680832853e-06, + "loss": 0.3734, + "step": 5744 + }, + { + "epoch": 0.17, + "grad_norm": 1.8119309481689678, + "learning_rate": 9.518495714468593e-06, + "loss": 0.3958, + "step": 5745 + }, + { + "epoch": 0.17, + "grad_norm": 1.681764027618366, + "learning_rate": 9.51829458073288e-06, + "loss": 0.3969, + "step": 5746 + }, + { + "epoch": 0.17, + "grad_norm": 1.5567476733608168, + "learning_rate": 9.51809340712317e-06, + "loss": 0.3959, + "step": 5747 + }, + { + "epoch": 0.17, + "grad_norm": 1.8277750895503972, + "learning_rate": 9.517892193641234e-06, + "loss": 0.4069, + "step": 5748 + }, + { + "epoch": 0.17, + "grad_norm": 1.5532834899424508, + "learning_rate": 9.517690940288849e-06, + "loss": 0.4045, + "step": 5749 + }, + { + "epoch": 0.17, + "grad_norm": 1.5039795648180405, + "learning_rate": 9.51748964706779e-06, + "loss": 0.3954, + "step": 5750 + }, + { + "epoch": 0.17, + "grad_norm": 1.5701563163720527, + "learning_rate": 9.517288313979838e-06, + "loss": 0.3823, + "step": 5751 + }, + { + "epoch": 0.17, + "grad_norm": 1.5273339575288547, + "learning_rate": 9.517086941026763e-06, + "loss": 0.3938, + "step": 5752 + }, + { + "epoch": 0.17, + "grad_norm": 1.3524756268481408, + "learning_rate": 9.516885528210348e-06, + "loss": 0.3558, + "step": 5753 + }, + { + "epoch": 0.17, + "grad_norm": 1.799701588742833, + "learning_rate": 9.516684075532367e-06, + "loss": 0.4107, + "step": 5754 + }, + { + "epoch": 0.17, + "grad_norm": 1.6131364905993189, + "learning_rate": 9.516482582994599e-06, + "loss": 0.4086, + "step": 5755 + }, + { + "epoch": 0.17, + "grad_norm": 1.5804664420136418, + "learning_rate": 9.516281050598823e-06, + "loss": 0.4142, + "step": 5756 + }, + { + "epoch": 0.17, + "grad_norm": 1.4374659896285913, + "learning_rate": 9.516079478346817e-06, + "loss": 0.3783, + "step": 5757 + }, + { + "epoch": 0.17, + "grad_norm": 1.4862056325823074, + "learning_rate": 9.515877866240358e-06, + "loss": 0.389, + "step": 5758 + }, + { + "epoch": 0.17, + "grad_norm": 1.3881822915927975, + "learning_rate": 9.515676214281228e-06, + "loss": 0.3601, + "step": 5759 + }, + { + "epoch": 0.17, + "grad_norm": 1.0545292589563835, + "learning_rate": 9.515474522471204e-06, + "loss": 0.6061, + "step": 5760 + }, + { + "epoch": 0.17, + "grad_norm": 1.4690789231927963, + "learning_rate": 9.515272790812069e-06, + "loss": 0.3732, + "step": 5761 + }, + { + "epoch": 0.17, + "grad_norm": 1.477089714667006, + "learning_rate": 9.515071019305599e-06, + "loss": 0.4227, + "step": 5762 + }, + { + "epoch": 0.17, + "grad_norm": 1.5734392922709346, + "learning_rate": 9.51486920795358e-06, + "loss": 0.3793, + "step": 5763 + }, + { + "epoch": 0.17, + "grad_norm": 1.8003983662244678, + "learning_rate": 9.514667356757789e-06, + "loss": 0.3811, + "step": 5764 + }, + { + "epoch": 0.17, + "grad_norm": 1.4854658797803693, + "learning_rate": 9.514465465720008e-06, + "loss": 0.389, + "step": 5765 + }, + { + "epoch": 0.17, + "grad_norm": 1.696290488133281, + "learning_rate": 9.514263534842019e-06, + "loss": 0.4291, + "step": 5766 + }, + { + "epoch": 0.17, + "grad_norm": 1.6135815618675453, + "learning_rate": 9.514061564125606e-06, + "loss": 0.4129, + "step": 5767 + }, + { + "epoch": 0.17, + "grad_norm": 1.5171527747715732, + "learning_rate": 9.513859553572547e-06, + "loss": 0.4009, + "step": 5768 + }, + { + "epoch": 0.17, + "grad_norm": 1.6132308712261745, + "learning_rate": 9.51365750318463e-06, + "loss": 0.4056, + "step": 5769 + }, + { + "epoch": 0.17, + "grad_norm": 1.514931384601315, + "learning_rate": 9.513455412963632e-06, + "loss": 0.3945, + "step": 5770 + }, + { + "epoch": 0.17, + "grad_norm": 1.5217836548615042, + "learning_rate": 9.513253282911343e-06, + "loss": 0.3714, + "step": 5771 + }, + { + "epoch": 0.17, + "grad_norm": 1.448827676560061, + "learning_rate": 9.513051113029543e-06, + "loss": 0.41, + "step": 5772 + }, + { + "epoch": 0.17, + "grad_norm": 1.450346601885446, + "learning_rate": 9.512848903320017e-06, + "loss": 0.3556, + "step": 5773 + }, + { + "epoch": 0.17, + "grad_norm": 1.6144238649378644, + "learning_rate": 9.512646653784549e-06, + "loss": 0.4202, + "step": 5774 + }, + { + "epoch": 0.17, + "grad_norm": 1.5420630138743496, + "learning_rate": 9.512444364424924e-06, + "loss": 0.4291, + "step": 5775 + }, + { + "epoch": 0.17, + "grad_norm": 1.4022562721561396, + "learning_rate": 9.512242035242927e-06, + "loss": 0.3726, + "step": 5776 + }, + { + "epoch": 0.17, + "grad_norm": 1.6069969435911673, + "learning_rate": 9.512039666240344e-06, + "loss": 0.3681, + "step": 5777 + }, + { + "epoch": 0.17, + "grad_norm": 1.7411379592408482, + "learning_rate": 9.511837257418963e-06, + "loss": 0.401, + "step": 5778 + }, + { + "epoch": 0.17, + "grad_norm": 1.5428968963810252, + "learning_rate": 9.511634808780565e-06, + "loss": 0.4167, + "step": 5779 + }, + { + "epoch": 0.17, + "grad_norm": 1.384133763224568, + "learning_rate": 9.511432320326943e-06, + "loss": 0.3669, + "step": 5780 + }, + { + "epoch": 0.17, + "grad_norm": 1.6235394780894747, + "learning_rate": 9.51122979205988e-06, + "loss": 0.3997, + "step": 5781 + }, + { + "epoch": 0.17, + "grad_norm": 1.6867539871857742, + "learning_rate": 9.511027223981163e-06, + "loss": 0.3964, + "step": 5782 + }, + { + "epoch": 0.17, + "grad_norm": 1.605467380172988, + "learning_rate": 9.51082461609258e-06, + "loss": 0.3897, + "step": 5783 + }, + { + "epoch": 0.17, + "grad_norm": 1.7662730614581097, + "learning_rate": 9.510621968395922e-06, + "loss": 0.3898, + "step": 5784 + }, + { + "epoch": 0.17, + "grad_norm": 1.6460529110311053, + "learning_rate": 9.510419280892971e-06, + "loss": 0.3967, + "step": 5785 + }, + { + "epoch": 0.17, + "grad_norm": 1.4892613483146846, + "learning_rate": 9.510216553585523e-06, + "loss": 0.4199, + "step": 5786 + }, + { + "epoch": 0.17, + "grad_norm": 1.961786103615264, + "learning_rate": 9.510013786475364e-06, + "loss": 0.3732, + "step": 5787 + }, + { + "epoch": 0.17, + "grad_norm": 1.395412967301992, + "learning_rate": 9.509810979564282e-06, + "loss": 0.4062, + "step": 5788 + }, + { + "epoch": 0.17, + "grad_norm": 1.978977237440772, + "learning_rate": 9.509608132854068e-06, + "loss": 0.3766, + "step": 5789 + }, + { + "epoch": 0.17, + "grad_norm": 1.3710438940596659, + "learning_rate": 9.509405246346515e-06, + "loss": 0.3762, + "step": 5790 + }, + { + "epoch": 0.17, + "grad_norm": 1.506692202555169, + "learning_rate": 9.509202320043407e-06, + "loss": 0.3773, + "step": 5791 + }, + { + "epoch": 0.17, + "grad_norm": 2.1945240826684236, + "learning_rate": 9.508999353946541e-06, + "loss": 0.4159, + "step": 5792 + }, + { + "epoch": 0.17, + "grad_norm": 1.581376310883394, + "learning_rate": 9.508796348057705e-06, + "loss": 0.399, + "step": 5793 + }, + { + "epoch": 0.17, + "grad_norm": 1.6753724510592054, + "learning_rate": 9.508593302378692e-06, + "loss": 0.4122, + "step": 5794 + }, + { + "epoch": 0.17, + "grad_norm": 1.5918093769225876, + "learning_rate": 9.508390216911292e-06, + "loss": 0.406, + "step": 5795 + }, + { + "epoch": 0.17, + "grad_norm": 1.4366395947206845, + "learning_rate": 9.508187091657297e-06, + "loss": 0.3768, + "step": 5796 + }, + { + "epoch": 0.17, + "grad_norm": 1.7664464642667794, + "learning_rate": 9.507983926618502e-06, + "loss": 0.3869, + "step": 5797 + }, + { + "epoch": 0.17, + "grad_norm": 1.4365733212321208, + "learning_rate": 9.5077807217967e-06, + "loss": 0.3833, + "step": 5798 + }, + { + "epoch": 0.17, + "grad_norm": 1.409601114693184, + "learning_rate": 9.507577477193684e-06, + "loss": 0.3873, + "step": 5799 + }, + { + "epoch": 0.17, + "grad_norm": 1.5320740599719218, + "learning_rate": 9.507374192811243e-06, + "loss": 0.3711, + "step": 5800 + }, + { + "epoch": 0.17, + "grad_norm": 1.6671513885013238, + "learning_rate": 9.507170868651177e-06, + "loss": 0.4534, + "step": 5801 + }, + { + "epoch": 0.17, + "grad_norm": 1.5363310562161576, + "learning_rate": 9.506967504715278e-06, + "loss": 0.3763, + "step": 5802 + }, + { + "epoch": 0.17, + "grad_norm": 1.6047603283171978, + "learning_rate": 9.506764101005342e-06, + "loss": 0.4427, + "step": 5803 + }, + { + "epoch": 0.17, + "grad_norm": 1.399490194573787, + "learning_rate": 9.506560657523161e-06, + "loss": 0.3895, + "step": 5804 + }, + { + "epoch": 0.17, + "grad_norm": 1.628364356121676, + "learning_rate": 9.506357174270532e-06, + "loss": 0.3981, + "step": 5805 + }, + { + "epoch": 0.17, + "grad_norm": 1.434623472941508, + "learning_rate": 9.506153651249252e-06, + "loss": 0.3981, + "step": 5806 + }, + { + "epoch": 0.17, + "grad_norm": 1.5495659075903876, + "learning_rate": 9.505950088461116e-06, + "loss": 0.4076, + "step": 5807 + }, + { + "epoch": 0.17, + "grad_norm": 1.8311196145503241, + "learning_rate": 9.505746485907921e-06, + "loss": 0.3922, + "step": 5808 + }, + { + "epoch": 0.17, + "grad_norm": 1.8552196172589785, + "learning_rate": 9.505542843591461e-06, + "loss": 0.3949, + "step": 5809 + }, + { + "epoch": 0.17, + "grad_norm": 1.3683951286326892, + "learning_rate": 9.505339161513539e-06, + "loss": 0.3747, + "step": 5810 + }, + { + "epoch": 0.17, + "grad_norm": 1.6595441302661593, + "learning_rate": 9.505135439675945e-06, + "loss": 0.4062, + "step": 5811 + }, + { + "epoch": 0.17, + "grad_norm": 1.1327663706730948, + "learning_rate": 9.504931678080482e-06, + "loss": 0.6057, + "step": 5812 + }, + { + "epoch": 0.17, + "grad_norm": 1.5499076018507765, + "learning_rate": 9.504727876728948e-06, + "loss": 0.3875, + "step": 5813 + }, + { + "epoch": 0.17, + "grad_norm": 1.1153246282178468, + "learning_rate": 9.50452403562314e-06, + "loss": 0.6436, + "step": 5814 + }, + { + "epoch": 0.17, + "grad_norm": 1.5835358080811746, + "learning_rate": 9.504320154764857e-06, + "loss": 0.373, + "step": 5815 + }, + { + "epoch": 0.17, + "grad_norm": 3.173108645005883, + "learning_rate": 9.5041162341559e-06, + "loss": 0.414, + "step": 5816 + }, + { + "epoch": 0.17, + "grad_norm": 1.503092148591118, + "learning_rate": 9.503912273798066e-06, + "loss": 0.372, + "step": 5817 + }, + { + "epoch": 0.17, + "grad_norm": 1.574210648995602, + "learning_rate": 9.503708273693158e-06, + "loss": 0.3951, + "step": 5818 + }, + { + "epoch": 0.17, + "grad_norm": 1.4290408536599022, + "learning_rate": 9.503504233842973e-06, + "loss": 0.3889, + "step": 5819 + }, + { + "epoch": 0.17, + "grad_norm": 1.7101819621369967, + "learning_rate": 9.503300154249314e-06, + "loss": 0.425, + "step": 5820 + }, + { + "epoch": 0.17, + "grad_norm": 1.3670532045884762, + "learning_rate": 9.50309603491398e-06, + "loss": 0.3881, + "step": 5821 + }, + { + "epoch": 0.17, + "grad_norm": 1.347336665228403, + "learning_rate": 9.502891875838773e-06, + "loss": 0.3665, + "step": 5822 + }, + { + "epoch": 0.17, + "grad_norm": 1.4761215565030514, + "learning_rate": 9.502687677025495e-06, + "loss": 0.3926, + "step": 5823 + }, + { + "epoch": 0.17, + "grad_norm": 1.8376342207599239, + "learning_rate": 9.50248343847595e-06, + "loss": 0.3792, + "step": 5824 + }, + { + "epoch": 0.17, + "grad_norm": 1.4962286271297291, + "learning_rate": 9.502279160191938e-06, + "loss": 0.3972, + "step": 5825 + }, + { + "epoch": 0.17, + "grad_norm": 1.4031104943125614, + "learning_rate": 9.502074842175265e-06, + "loss": 0.4083, + "step": 5826 + }, + { + "epoch": 0.17, + "grad_norm": 1.4253680570098013, + "learning_rate": 9.501870484427728e-06, + "loss": 0.375, + "step": 5827 + }, + { + "epoch": 0.17, + "grad_norm": 1.5116665601916535, + "learning_rate": 9.501666086951135e-06, + "loss": 0.3997, + "step": 5828 + }, + { + "epoch": 0.17, + "grad_norm": 1.5706806056347267, + "learning_rate": 9.50146164974729e-06, + "loss": 0.4133, + "step": 5829 + }, + { + "epoch": 0.17, + "grad_norm": 1.5210367873955675, + "learning_rate": 9.501257172817995e-06, + "loss": 0.4105, + "step": 5830 + }, + { + "epoch": 0.17, + "grad_norm": 1.3494540238719253, + "learning_rate": 9.501052656165055e-06, + "loss": 0.4046, + "step": 5831 + }, + { + "epoch": 0.17, + "grad_norm": 1.4150431001234225, + "learning_rate": 9.500848099790276e-06, + "loss": 0.3955, + "step": 5832 + }, + { + "epoch": 0.17, + "grad_norm": 1.4315573200496068, + "learning_rate": 9.500643503695463e-06, + "loss": 0.3985, + "step": 5833 + }, + { + "epoch": 0.17, + "grad_norm": 1.5480213862662477, + "learning_rate": 9.500438867882423e-06, + "loss": 0.3745, + "step": 5834 + }, + { + "epoch": 0.17, + "grad_norm": 1.5017653196718117, + "learning_rate": 9.500234192352957e-06, + "loss": 0.3777, + "step": 5835 + }, + { + "epoch": 0.17, + "grad_norm": 1.6527777001585024, + "learning_rate": 9.500029477108875e-06, + "loss": 0.4042, + "step": 5836 + }, + { + "epoch": 0.17, + "grad_norm": 1.4215074985509453, + "learning_rate": 9.499824722151982e-06, + "loss": 0.4109, + "step": 5837 + }, + { + "epoch": 0.17, + "grad_norm": 1.3696271067289953, + "learning_rate": 9.499619927484087e-06, + "loss": 0.4039, + "step": 5838 + }, + { + "epoch": 0.17, + "grad_norm": 1.3620932112505546, + "learning_rate": 9.499415093106997e-06, + "loss": 0.3907, + "step": 5839 + }, + { + "epoch": 0.17, + "grad_norm": 1.3642464259178035, + "learning_rate": 9.49921021902252e-06, + "loss": 0.3821, + "step": 5840 + }, + { + "epoch": 0.17, + "grad_norm": 1.4026891917209727, + "learning_rate": 9.49900530523246e-06, + "loss": 0.3397, + "step": 5841 + }, + { + "epoch": 0.17, + "grad_norm": 1.3998655844403702, + "learning_rate": 9.498800351738629e-06, + "loss": 0.4272, + "step": 5842 + }, + { + "epoch": 0.17, + "grad_norm": 1.448353453633291, + "learning_rate": 9.498595358542835e-06, + "loss": 0.4025, + "step": 5843 + }, + { + "epoch": 0.17, + "grad_norm": 1.575920678155707, + "learning_rate": 9.498390325646887e-06, + "loss": 0.3875, + "step": 5844 + }, + { + "epoch": 0.17, + "grad_norm": 1.7611072344156322, + "learning_rate": 9.498185253052595e-06, + "loss": 0.3905, + "step": 5845 + }, + { + "epoch": 0.17, + "grad_norm": 2.0787518346177305, + "learning_rate": 9.49798014076177e-06, + "loss": 0.3982, + "step": 5846 + }, + { + "epoch": 0.17, + "grad_norm": 1.4506518248353992, + "learning_rate": 9.497774988776218e-06, + "loss": 0.3815, + "step": 5847 + }, + { + "epoch": 0.17, + "grad_norm": 1.6563987340814956, + "learning_rate": 9.497569797097752e-06, + "loss": 0.3883, + "step": 5848 + }, + { + "epoch": 0.17, + "grad_norm": 1.8034113216985643, + "learning_rate": 9.497364565728182e-06, + "loss": 0.4773, + "step": 5849 + }, + { + "epoch": 0.17, + "grad_norm": 1.686266563983949, + "learning_rate": 9.49715929466932e-06, + "loss": 0.4067, + "step": 5850 + }, + { + "epoch": 0.17, + "grad_norm": 1.5036209234861675, + "learning_rate": 9.496953983922979e-06, + "loss": 0.3923, + "step": 5851 + }, + { + "epoch": 0.17, + "grad_norm": 1.583546593451308, + "learning_rate": 9.496748633490969e-06, + "loss": 0.4139, + "step": 5852 + }, + { + "epoch": 0.17, + "grad_norm": 1.360692855470172, + "learning_rate": 9.4965432433751e-06, + "loss": 0.3669, + "step": 5853 + }, + { + "epoch": 0.17, + "grad_norm": 1.7010004392569025, + "learning_rate": 9.49633781357719e-06, + "loss": 0.4444, + "step": 5854 + }, + { + "epoch": 0.17, + "grad_norm": 1.4557242303343265, + "learning_rate": 9.496132344099047e-06, + "loss": 0.3832, + "step": 5855 + }, + { + "epoch": 0.17, + "grad_norm": 1.3799420314199797, + "learning_rate": 9.495926834942488e-06, + "loss": 0.3918, + "step": 5856 + }, + { + "epoch": 0.17, + "grad_norm": 1.3658402728073151, + "learning_rate": 9.495721286109322e-06, + "loss": 0.3472, + "step": 5857 + }, + { + "epoch": 0.17, + "grad_norm": 1.5454605632357363, + "learning_rate": 9.49551569760137e-06, + "loss": 0.6174, + "step": 5858 + }, + { + "epoch": 0.17, + "grad_norm": 1.719813064601987, + "learning_rate": 9.495310069420438e-06, + "loss": 0.3831, + "step": 5859 + }, + { + "epoch": 0.17, + "grad_norm": 1.3329686890800305, + "learning_rate": 9.495104401568347e-06, + "loss": 0.3774, + "step": 5860 + }, + { + "epoch": 0.17, + "grad_norm": 1.472964157633625, + "learning_rate": 9.494898694046909e-06, + "loss": 0.3841, + "step": 5861 + }, + { + "epoch": 0.17, + "grad_norm": 1.3900422745871857, + "learning_rate": 9.494692946857939e-06, + "loss": 0.4249, + "step": 5862 + }, + { + "epoch": 0.17, + "grad_norm": 1.7138468745596735, + "learning_rate": 9.494487160003256e-06, + "loss": 0.4128, + "step": 5863 + }, + { + "epoch": 0.17, + "grad_norm": 1.614173610444681, + "learning_rate": 9.494281333484674e-06, + "loss": 0.3899, + "step": 5864 + }, + { + "epoch": 0.17, + "grad_norm": 1.6932735760569457, + "learning_rate": 9.494075467304007e-06, + "loss": 0.425, + "step": 5865 + }, + { + "epoch": 0.17, + "grad_norm": 1.571294802160632, + "learning_rate": 9.493869561463076e-06, + "loss": 0.3711, + "step": 5866 + }, + { + "epoch": 0.17, + "grad_norm": 1.5127543270463377, + "learning_rate": 9.493663615963697e-06, + "loss": 0.3843, + "step": 5867 + }, + { + "epoch": 0.17, + "grad_norm": 1.4481408938353895, + "learning_rate": 9.493457630807685e-06, + "loss": 0.3819, + "step": 5868 + }, + { + "epoch": 0.17, + "grad_norm": 1.7675800223671123, + "learning_rate": 9.493251605996861e-06, + "loss": 0.3914, + "step": 5869 + }, + { + "epoch": 0.17, + "grad_norm": 1.563587535369175, + "learning_rate": 9.493045541533039e-06, + "loss": 0.3928, + "step": 5870 + }, + { + "epoch": 0.17, + "grad_norm": 2.9981138028699945, + "learning_rate": 9.492839437418043e-06, + "loss": 0.3721, + "step": 5871 + }, + { + "epoch": 0.17, + "grad_norm": 1.4929473451070812, + "learning_rate": 9.492633293653688e-06, + "loss": 0.3775, + "step": 5872 + }, + { + "epoch": 0.17, + "grad_norm": 1.3083194765546382, + "learning_rate": 9.492427110241795e-06, + "loss": 0.3695, + "step": 5873 + }, + { + "epoch": 0.17, + "grad_norm": 1.4125758345000274, + "learning_rate": 9.492220887184183e-06, + "loss": 0.3749, + "step": 5874 + }, + { + "epoch": 0.17, + "grad_norm": 1.5122193218196578, + "learning_rate": 9.49201462448267e-06, + "loss": 0.424, + "step": 5875 + }, + { + "epoch": 0.17, + "grad_norm": 1.6413966807192988, + "learning_rate": 9.49180832213908e-06, + "loss": 0.4334, + "step": 5876 + }, + { + "epoch": 0.17, + "grad_norm": 1.5212312655694706, + "learning_rate": 9.491601980155232e-06, + "loss": 0.3874, + "step": 5877 + }, + { + "epoch": 0.17, + "grad_norm": 1.33743057579324, + "learning_rate": 9.491395598532945e-06, + "loss": 0.3571, + "step": 5878 + }, + { + "epoch": 0.17, + "grad_norm": 1.5543841166157601, + "learning_rate": 9.491189177274043e-06, + "loss": 0.3815, + "step": 5879 + }, + { + "epoch": 0.17, + "grad_norm": 4.249335855814887, + "learning_rate": 9.490982716380348e-06, + "loss": 0.3696, + "step": 5880 + }, + { + "epoch": 0.17, + "grad_norm": 1.444629805619405, + "learning_rate": 9.490776215853678e-06, + "loss": 0.3766, + "step": 5881 + }, + { + "epoch": 0.17, + "grad_norm": 2.6214227378790547, + "learning_rate": 9.49056967569586e-06, + "loss": 0.4096, + "step": 5882 + }, + { + "epoch": 0.17, + "grad_norm": 1.484167762286927, + "learning_rate": 9.490363095908714e-06, + "loss": 0.3954, + "step": 5883 + }, + { + "epoch": 0.17, + "grad_norm": 1.4255045167321498, + "learning_rate": 9.490156476494064e-06, + "loss": 0.3863, + "step": 5884 + }, + { + "epoch": 0.17, + "grad_norm": 0.975649491038333, + "learning_rate": 9.489949817453735e-06, + "loss": 0.5611, + "step": 5885 + }, + { + "epoch": 0.17, + "grad_norm": 1.451865392384513, + "learning_rate": 9.489743118789545e-06, + "loss": 0.3908, + "step": 5886 + }, + { + "epoch": 0.17, + "grad_norm": 1.64425880850536, + "learning_rate": 9.489536380503325e-06, + "loss": 0.4284, + "step": 5887 + }, + { + "epoch": 0.17, + "grad_norm": 1.4939606786869266, + "learning_rate": 9.489329602596898e-06, + "loss": 0.3885, + "step": 5888 + }, + { + "epoch": 0.17, + "grad_norm": 1.4399171677872418, + "learning_rate": 9.489122785072084e-06, + "loss": 0.4139, + "step": 5889 + }, + { + "epoch": 0.17, + "grad_norm": 1.3636104099722113, + "learning_rate": 9.488915927930714e-06, + "loss": 0.3912, + "step": 5890 + }, + { + "epoch": 0.17, + "grad_norm": 1.3319954561278005, + "learning_rate": 9.48870903117461e-06, + "loss": 0.372, + "step": 5891 + }, + { + "epoch": 0.17, + "grad_norm": 1.7350983252993677, + "learning_rate": 9.488502094805599e-06, + "loss": 0.459, + "step": 5892 + }, + { + "epoch": 0.17, + "grad_norm": 1.630575275742135, + "learning_rate": 9.488295118825509e-06, + "loss": 0.3824, + "step": 5893 + }, + { + "epoch": 0.17, + "grad_norm": 1.626712541318882, + "learning_rate": 9.488088103236163e-06, + "loss": 0.3673, + "step": 5894 + }, + { + "epoch": 0.17, + "grad_norm": 1.7144021448636113, + "learning_rate": 9.48788104803939e-06, + "loss": 0.4172, + "step": 5895 + }, + { + "epoch": 0.17, + "grad_norm": 1.6840684697742807, + "learning_rate": 9.487673953237018e-06, + "loss": 0.4091, + "step": 5896 + }, + { + "epoch": 0.17, + "grad_norm": 1.3647313680134914, + "learning_rate": 9.487466818830872e-06, + "loss": 0.3766, + "step": 5897 + }, + { + "epoch": 0.17, + "grad_norm": 1.4883449812367497, + "learning_rate": 9.487259644822782e-06, + "loss": 0.4055, + "step": 5898 + }, + { + "epoch": 0.17, + "grad_norm": 1.508011260373089, + "learning_rate": 9.487052431214576e-06, + "loss": 0.3795, + "step": 5899 + }, + { + "epoch": 0.17, + "grad_norm": 1.644830051381764, + "learning_rate": 9.486845178008082e-06, + "loss": 0.401, + "step": 5900 + }, + { + "epoch": 0.17, + "grad_norm": 1.495456080631167, + "learning_rate": 9.48663788520513e-06, + "loss": 0.3993, + "step": 5901 + }, + { + "epoch": 0.17, + "grad_norm": 1.484735940662386, + "learning_rate": 9.486430552807549e-06, + "loss": 0.3768, + "step": 5902 + }, + { + "epoch": 0.17, + "grad_norm": 1.5632564494820975, + "learning_rate": 9.486223180817169e-06, + "loss": 0.3913, + "step": 5903 + }, + { + "epoch": 0.17, + "grad_norm": 1.6179459309582807, + "learning_rate": 9.486015769235818e-06, + "loss": 0.4219, + "step": 5904 + }, + { + "epoch": 0.17, + "grad_norm": 1.4869373067917735, + "learning_rate": 9.48580831806533e-06, + "loss": 0.3834, + "step": 5905 + }, + { + "epoch": 0.17, + "grad_norm": 1.6284809363264763, + "learning_rate": 9.485600827307535e-06, + "loss": 0.377, + "step": 5906 + }, + { + "epoch": 0.17, + "grad_norm": 1.4034708329676329, + "learning_rate": 9.48539329696426e-06, + "loss": 0.4008, + "step": 5907 + }, + { + "epoch": 0.17, + "grad_norm": 1.470021482258778, + "learning_rate": 9.48518572703734e-06, + "loss": 0.3964, + "step": 5908 + }, + { + "epoch": 0.17, + "grad_norm": 1.6376135932646128, + "learning_rate": 9.484978117528608e-06, + "loss": 0.3979, + "step": 5909 + }, + { + "epoch": 0.17, + "grad_norm": 1.6992130273095865, + "learning_rate": 9.484770468439892e-06, + "loss": 0.4007, + "step": 5910 + }, + { + "epoch": 0.17, + "grad_norm": 1.487622066090299, + "learning_rate": 9.484562779773027e-06, + "loss": 0.3881, + "step": 5911 + }, + { + "epoch": 0.17, + "grad_norm": 1.498144742177305, + "learning_rate": 9.484355051529848e-06, + "loss": 0.4014, + "step": 5912 + }, + { + "epoch": 0.17, + "grad_norm": 1.6284191165317, + "learning_rate": 9.484147283712185e-06, + "loss": 0.403, + "step": 5913 + }, + { + "epoch": 0.17, + "grad_norm": 1.449606518218831, + "learning_rate": 9.483939476321872e-06, + "loss": 0.3765, + "step": 5914 + }, + { + "epoch": 0.17, + "grad_norm": 1.651383194832689, + "learning_rate": 9.483731629360743e-06, + "loss": 0.3934, + "step": 5915 + }, + { + "epoch": 0.17, + "grad_norm": 1.4483811971036653, + "learning_rate": 9.483523742830633e-06, + "loss": 0.405, + "step": 5916 + }, + { + "epoch": 0.17, + "grad_norm": 1.7623897300692055, + "learning_rate": 9.483315816733376e-06, + "loss": 0.3886, + "step": 5917 + }, + { + "epoch": 0.17, + "grad_norm": 1.566773026647692, + "learning_rate": 9.483107851070807e-06, + "loss": 0.3778, + "step": 5918 + }, + { + "epoch": 0.17, + "grad_norm": 1.4886364267196273, + "learning_rate": 9.482899845844763e-06, + "loss": 0.415, + "step": 5919 + }, + { + "epoch": 0.17, + "grad_norm": 1.4223208377725298, + "learning_rate": 9.482691801057078e-06, + "loss": 0.3975, + "step": 5920 + }, + { + "epoch": 0.17, + "grad_norm": 1.4315668350031017, + "learning_rate": 9.482483716709586e-06, + "loss": 0.3984, + "step": 5921 + }, + { + "epoch": 0.17, + "grad_norm": 1.3793947460215175, + "learning_rate": 9.482275592804127e-06, + "loss": 0.3863, + "step": 5922 + }, + { + "epoch": 0.17, + "grad_norm": 1.9405381044697616, + "learning_rate": 9.482067429342536e-06, + "loss": 0.4044, + "step": 5923 + }, + { + "epoch": 0.17, + "grad_norm": 1.5500247019692326, + "learning_rate": 9.48185922632665e-06, + "loss": 0.3961, + "step": 5924 + }, + { + "epoch": 0.17, + "grad_norm": 1.3716256595123062, + "learning_rate": 9.481650983758308e-06, + "loss": 0.3686, + "step": 5925 + }, + { + "epoch": 0.17, + "grad_norm": 1.4170170876600932, + "learning_rate": 9.481442701639344e-06, + "loss": 0.3564, + "step": 5926 + }, + { + "epoch": 0.17, + "grad_norm": 1.5223956682389193, + "learning_rate": 9.4812343799716e-06, + "loss": 0.3805, + "step": 5927 + }, + { + "epoch": 0.17, + "grad_norm": 1.9186367515904874, + "learning_rate": 9.481026018756912e-06, + "loss": 0.4014, + "step": 5928 + }, + { + "epoch": 0.17, + "grad_norm": 1.4889670393258456, + "learning_rate": 9.48081761799712e-06, + "loss": 0.395, + "step": 5929 + }, + { + "epoch": 0.17, + "grad_norm": 1.5692833034531155, + "learning_rate": 9.480609177694063e-06, + "loss": 0.3744, + "step": 5930 + }, + { + "epoch": 0.17, + "grad_norm": 2.030198642947401, + "learning_rate": 9.48040069784958e-06, + "loss": 0.3951, + "step": 5931 + }, + { + "epoch": 0.17, + "grad_norm": 1.4097974800952497, + "learning_rate": 9.480192178465511e-06, + "loss": 0.3938, + "step": 5932 + }, + { + "epoch": 0.17, + "grad_norm": 1.9040142259838457, + "learning_rate": 9.479983619543697e-06, + "loss": 0.3926, + "step": 5933 + }, + { + "epoch": 0.17, + "grad_norm": 1.3886190412885842, + "learning_rate": 9.479775021085977e-06, + "loss": 0.387, + "step": 5934 + }, + { + "epoch": 0.17, + "grad_norm": 1.4386465866520954, + "learning_rate": 9.479566383094195e-06, + "loss": 0.3762, + "step": 5935 + }, + { + "epoch": 0.17, + "grad_norm": 1.5310836754187251, + "learning_rate": 9.479357705570187e-06, + "loss": 0.4086, + "step": 5936 + }, + { + "epoch": 0.17, + "grad_norm": 1.3454998923628614, + "learning_rate": 9.479148988515798e-06, + "loss": 0.3491, + "step": 5937 + }, + { + "epoch": 0.17, + "grad_norm": 1.5195215761273488, + "learning_rate": 9.47894023193287e-06, + "loss": 0.3973, + "step": 5938 + }, + { + "epoch": 0.17, + "grad_norm": 2.032319558624948, + "learning_rate": 9.478731435823245e-06, + "loss": 0.3721, + "step": 5939 + }, + { + "epoch": 0.17, + "grad_norm": 1.4526807282151049, + "learning_rate": 9.478522600188767e-06, + "loss": 0.4014, + "step": 5940 + }, + { + "epoch": 0.17, + "grad_norm": 6.680207901114776, + "learning_rate": 9.478313725031276e-06, + "loss": 0.3708, + "step": 5941 + }, + { + "epoch": 0.17, + "grad_norm": 1.0617627356746064, + "learning_rate": 9.478104810352615e-06, + "loss": 0.5785, + "step": 5942 + }, + { + "epoch": 0.17, + "grad_norm": 1.7883074468951765, + "learning_rate": 9.47789585615463e-06, + "loss": 0.4129, + "step": 5943 + }, + { + "epoch": 0.17, + "grad_norm": 1.422613272351202, + "learning_rate": 9.477686862439166e-06, + "loss": 0.4045, + "step": 5944 + }, + { + "epoch": 0.17, + "grad_norm": 1.5741555890028365, + "learning_rate": 9.477477829208066e-06, + "loss": 0.4135, + "step": 5945 + }, + { + "epoch": 0.17, + "grad_norm": 1.4662659672401785, + "learning_rate": 9.477268756463174e-06, + "loss": 0.3909, + "step": 5946 + }, + { + "epoch": 0.17, + "grad_norm": 1.0818558751102145, + "learning_rate": 9.477059644206335e-06, + "loss": 0.625, + "step": 5947 + }, + { + "epoch": 0.17, + "grad_norm": 0.9061126235456683, + "learning_rate": 9.476850492439393e-06, + "loss": 0.5684, + "step": 5948 + }, + { + "epoch": 0.17, + "grad_norm": 1.519219594162856, + "learning_rate": 9.4766413011642e-06, + "loss": 0.414, + "step": 5949 + }, + { + "epoch": 0.17, + "grad_norm": 1.4078682061775722, + "learning_rate": 9.476432070382596e-06, + "loss": 0.3886, + "step": 5950 + }, + { + "epoch": 0.17, + "grad_norm": 1.6005602109446264, + "learning_rate": 9.476222800096427e-06, + "loss": 0.3744, + "step": 5951 + }, + { + "epoch": 0.17, + "grad_norm": 1.3176737407216483, + "learning_rate": 9.476013490307545e-06, + "loss": 0.3744, + "step": 5952 + }, + { + "epoch": 0.17, + "grad_norm": 1.4999785183490264, + "learning_rate": 9.475804141017793e-06, + "loss": 0.3834, + "step": 5953 + }, + { + "epoch": 0.17, + "grad_norm": 1.4310622259416375, + "learning_rate": 9.47559475222902e-06, + "loss": 0.3927, + "step": 5954 + }, + { + "epoch": 0.17, + "grad_norm": 1.7390180829142927, + "learning_rate": 9.475385323943073e-06, + "loss": 0.4011, + "step": 5955 + }, + { + "epoch": 0.17, + "grad_norm": 1.4641921333199626, + "learning_rate": 9.475175856161803e-06, + "loss": 0.3839, + "step": 5956 + }, + { + "epoch": 0.17, + "grad_norm": 1.506902653154584, + "learning_rate": 9.474966348887055e-06, + "loss": 0.4008, + "step": 5957 + }, + { + "epoch": 0.17, + "grad_norm": 1.6573722102335804, + "learning_rate": 9.47475680212068e-06, + "loss": 0.4312, + "step": 5958 + }, + { + "epoch": 0.17, + "grad_norm": 1.5351941128587814, + "learning_rate": 9.474547215864525e-06, + "loss": 0.3687, + "step": 5959 + }, + { + "epoch": 0.17, + "grad_norm": 1.4700488702647652, + "learning_rate": 9.474337590120442e-06, + "loss": 0.3894, + "step": 5960 + }, + { + "epoch": 0.17, + "grad_norm": 1.3531164842296604, + "learning_rate": 9.474127924890281e-06, + "loss": 0.3717, + "step": 5961 + }, + { + "epoch": 0.17, + "grad_norm": 1.4240203902097406, + "learning_rate": 9.473918220175891e-06, + "loss": 0.3975, + "step": 5962 + }, + { + "epoch": 0.17, + "grad_norm": 1.397002612085973, + "learning_rate": 9.473708475979122e-06, + "loss": 0.3819, + "step": 5963 + }, + { + "epoch": 0.17, + "grad_norm": 1.1170290042642685, + "learning_rate": 9.473498692301828e-06, + "loss": 0.5968, + "step": 5964 + }, + { + "epoch": 0.17, + "grad_norm": 1.3342097332494032, + "learning_rate": 9.473288869145857e-06, + "loss": 0.3832, + "step": 5965 + }, + { + "epoch": 0.17, + "grad_norm": 1.74363161289645, + "learning_rate": 9.473079006513062e-06, + "loss": 0.3847, + "step": 5966 + }, + { + "epoch": 0.17, + "grad_norm": 1.0146431816346766, + "learning_rate": 9.472869104405295e-06, + "loss": 0.6146, + "step": 5967 + }, + { + "epoch": 0.17, + "grad_norm": 1.5366458818747954, + "learning_rate": 9.47265916282441e-06, + "loss": 0.4363, + "step": 5968 + }, + { + "epoch": 0.17, + "grad_norm": 3.5842462024274035, + "learning_rate": 9.472449181772256e-06, + "loss": 0.3925, + "step": 5969 + }, + { + "epoch": 0.17, + "grad_norm": 1.5898896400461535, + "learning_rate": 9.472239161250691e-06, + "loss": 0.3756, + "step": 5970 + }, + { + "epoch": 0.17, + "grad_norm": 3.404883019889776, + "learning_rate": 9.472029101261563e-06, + "loss": 0.3821, + "step": 5971 + }, + { + "epoch": 0.17, + "grad_norm": 1.4112503661878029, + "learning_rate": 9.471819001806731e-06, + "loss": 0.3746, + "step": 5972 + }, + { + "epoch": 0.17, + "grad_norm": 1.4597408967226357, + "learning_rate": 9.471608862888047e-06, + "loss": 0.3802, + "step": 5973 + }, + { + "epoch": 0.17, + "grad_norm": 1.3985384525177678, + "learning_rate": 9.471398684507364e-06, + "loss": 0.3756, + "step": 5974 + }, + { + "epoch": 0.17, + "grad_norm": 1.686103728449325, + "learning_rate": 9.471188466666537e-06, + "loss": 0.383, + "step": 5975 + }, + { + "epoch": 0.17, + "grad_norm": 2.437252827546064, + "learning_rate": 9.470978209367423e-06, + "loss": 0.3832, + "step": 5976 + }, + { + "epoch": 0.17, + "grad_norm": 1.509613760273641, + "learning_rate": 9.470767912611878e-06, + "loss": 0.3911, + "step": 5977 + }, + { + "epoch": 0.17, + "grad_norm": 2.0158557312866248, + "learning_rate": 9.470557576401756e-06, + "loss": 0.378, + "step": 5978 + }, + { + "epoch": 0.17, + "grad_norm": 1.5866582956360753, + "learning_rate": 9.470347200738912e-06, + "loss": 0.3737, + "step": 5979 + }, + { + "epoch": 0.17, + "grad_norm": 1.6125708142845643, + "learning_rate": 9.470136785625206e-06, + "loss": 0.3864, + "step": 5980 + }, + { + "epoch": 0.17, + "grad_norm": 1.549014123009463, + "learning_rate": 9.469926331062494e-06, + "loss": 0.4008, + "step": 5981 + }, + { + "epoch": 0.17, + "grad_norm": 1.4913083414953028, + "learning_rate": 9.46971583705263e-06, + "loss": 0.3556, + "step": 5982 + }, + { + "epoch": 0.17, + "grad_norm": 1.3994499965836633, + "learning_rate": 9.469505303597474e-06, + "loss": 0.3648, + "step": 5983 + }, + { + "epoch": 0.17, + "grad_norm": 7.955155579909698, + "learning_rate": 9.469294730698889e-06, + "loss": 0.3691, + "step": 5984 + }, + { + "epoch": 0.17, + "grad_norm": 1.9049355270368356, + "learning_rate": 9.469084118358724e-06, + "loss": 0.4277, + "step": 5985 + }, + { + "epoch": 0.17, + "grad_norm": 1.6773171938899798, + "learning_rate": 9.468873466578842e-06, + "loss": 0.3782, + "step": 5986 + }, + { + "epoch": 0.17, + "grad_norm": 1.4995040704282456, + "learning_rate": 9.468662775361104e-06, + "loss": 0.394, + "step": 5987 + }, + { + "epoch": 0.17, + "grad_norm": 1.6041989801025256, + "learning_rate": 9.468452044707368e-06, + "loss": 0.3967, + "step": 5988 + }, + { + "epoch": 0.17, + "grad_norm": 1.5829650156831847, + "learning_rate": 9.46824127461949e-06, + "loss": 0.4152, + "step": 5989 + }, + { + "epoch": 0.17, + "grad_norm": 1.7714631770130047, + "learning_rate": 9.468030465099335e-06, + "loss": 0.3965, + "step": 5990 + }, + { + "epoch": 0.17, + "grad_norm": 1.5096591217313853, + "learning_rate": 9.467819616148762e-06, + "loss": 0.4082, + "step": 5991 + }, + { + "epoch": 0.17, + "grad_norm": 1.484646374664225, + "learning_rate": 9.46760872776963e-06, + "loss": 0.3896, + "step": 5992 + }, + { + "epoch": 0.17, + "grad_norm": 1.4173990209897542, + "learning_rate": 9.467397799963803e-06, + "loss": 0.3732, + "step": 5993 + }, + { + "epoch": 0.17, + "grad_norm": 1.4198404006577792, + "learning_rate": 9.46718683273314e-06, + "loss": 0.3639, + "step": 5994 + }, + { + "epoch": 0.17, + "grad_norm": 1.5689424204144797, + "learning_rate": 9.466975826079505e-06, + "loss": 0.4252, + "step": 5995 + }, + { + "epoch": 0.17, + "grad_norm": 1.7879760412098982, + "learning_rate": 9.466764780004757e-06, + "loss": 0.4002, + "step": 5996 + }, + { + "epoch": 0.17, + "grad_norm": 1.4982311365216145, + "learning_rate": 9.46655369451076e-06, + "loss": 0.3817, + "step": 5997 + }, + { + "epoch": 0.17, + "grad_norm": 1.5264967364778768, + "learning_rate": 9.46634256959938e-06, + "loss": 0.3826, + "step": 5998 + }, + { + "epoch": 0.17, + "grad_norm": 2.264963124068878, + "learning_rate": 9.466131405272474e-06, + "loss": 0.3694, + "step": 5999 + }, + { + "epoch": 0.17, + "grad_norm": 1.3748302026507724, + "learning_rate": 9.465920201531911e-06, + "loss": 0.3855, + "step": 6000 + }, + { + "epoch": 0.17, + "grad_norm": 1.789334990657173, + "learning_rate": 9.465708958379552e-06, + "loss": 0.4287, + "step": 6001 + }, + { + "epoch": 0.17, + "grad_norm": 1.1810724904857872, + "learning_rate": 9.465497675817261e-06, + "loss": 0.6308, + "step": 6002 + }, + { + "epoch": 0.17, + "grad_norm": 1.478986730462368, + "learning_rate": 9.465286353846905e-06, + "loss": 0.3976, + "step": 6003 + }, + { + "epoch": 0.17, + "grad_norm": 1.6693805806152815, + "learning_rate": 9.465074992470345e-06, + "loss": 0.387, + "step": 6004 + }, + { + "epoch": 0.17, + "grad_norm": 1.482888460754208, + "learning_rate": 9.46486359168945e-06, + "loss": 0.3906, + "step": 6005 + }, + { + "epoch": 0.17, + "grad_norm": 1.7405112799266995, + "learning_rate": 9.464652151506084e-06, + "loss": 0.3848, + "step": 6006 + }, + { + "epoch": 0.17, + "grad_norm": 1.812051048381962, + "learning_rate": 9.464440671922114e-06, + "loss": 0.4134, + "step": 6007 + }, + { + "epoch": 0.17, + "grad_norm": 1.5331265237982776, + "learning_rate": 9.464229152939406e-06, + "loss": 0.3884, + "step": 6008 + }, + { + "epoch": 0.17, + "grad_norm": 1.397745130805111, + "learning_rate": 9.464017594559825e-06, + "loss": 0.3535, + "step": 6009 + }, + { + "epoch": 0.17, + "grad_norm": 1.4751022747329054, + "learning_rate": 9.46380599678524e-06, + "loss": 0.3848, + "step": 6010 + }, + { + "epoch": 0.17, + "grad_norm": 1.9096283897847852, + "learning_rate": 9.463594359617517e-06, + "loss": 0.3778, + "step": 6011 + }, + { + "epoch": 0.17, + "grad_norm": 1.5170505309191278, + "learning_rate": 9.463382683058525e-06, + "loss": 0.3711, + "step": 6012 + }, + { + "epoch": 0.17, + "grad_norm": 3.723018822215537, + "learning_rate": 9.463170967110131e-06, + "loss": 0.3917, + "step": 6013 + }, + { + "epoch": 0.17, + "grad_norm": 1.0349311550174016, + "learning_rate": 9.462959211774204e-06, + "loss": 0.5828, + "step": 6014 + }, + { + "epoch": 0.17, + "grad_norm": 1.6062422535231051, + "learning_rate": 9.462747417052611e-06, + "loss": 0.4362, + "step": 6015 + }, + { + "epoch": 0.17, + "grad_norm": 1.4939963627223805, + "learning_rate": 9.462535582947223e-06, + "loss": 0.3944, + "step": 6016 + }, + { + "epoch": 0.17, + "grad_norm": 1.979718291446335, + "learning_rate": 9.46232370945991e-06, + "loss": 0.4482, + "step": 6017 + }, + { + "epoch": 0.17, + "grad_norm": 1.8027638158802275, + "learning_rate": 9.46211179659254e-06, + "loss": 0.3639, + "step": 6018 + }, + { + "epoch": 0.17, + "grad_norm": 1.520964598997824, + "learning_rate": 9.461899844346984e-06, + "loss": 0.4462, + "step": 6019 + }, + { + "epoch": 0.17, + "grad_norm": 1.776768572589607, + "learning_rate": 9.461687852725113e-06, + "loss": 0.4405, + "step": 6020 + }, + { + "epoch": 0.17, + "grad_norm": 1.5373413088159045, + "learning_rate": 9.461475821728796e-06, + "loss": 0.4249, + "step": 6021 + }, + { + "epoch": 0.17, + "grad_norm": 1.3597989751474708, + "learning_rate": 9.461263751359906e-06, + "loss": 0.3595, + "step": 6022 + }, + { + "epoch": 0.17, + "grad_norm": 1.5676607171108574, + "learning_rate": 9.461051641620313e-06, + "loss": 0.3702, + "step": 6023 + }, + { + "epoch": 0.17, + "grad_norm": 1.3893760335019276, + "learning_rate": 9.460839492511892e-06, + "loss": 0.3918, + "step": 6024 + }, + { + "epoch": 0.17, + "grad_norm": 1.4747540414061673, + "learning_rate": 9.460627304036511e-06, + "loss": 0.3594, + "step": 6025 + }, + { + "epoch": 0.17, + "grad_norm": 1.5223086666279637, + "learning_rate": 9.460415076196046e-06, + "loss": 0.3868, + "step": 6026 + }, + { + "epoch": 0.17, + "grad_norm": 1.843924257047875, + "learning_rate": 9.460202808992367e-06, + "loss": 0.3786, + "step": 6027 + }, + { + "epoch": 0.17, + "grad_norm": 1.4275487555592932, + "learning_rate": 9.459990502427349e-06, + "loss": 0.3862, + "step": 6028 + }, + { + "epoch": 0.17, + "grad_norm": 1.3548409562107342, + "learning_rate": 9.459778156502868e-06, + "loss": 0.3911, + "step": 6029 + }, + { + "epoch": 0.17, + "grad_norm": 1.409318938490218, + "learning_rate": 9.459565771220792e-06, + "loss": 0.4227, + "step": 6030 + }, + { + "epoch": 0.17, + "grad_norm": 1.686682359191291, + "learning_rate": 9.459353346583e-06, + "loss": 0.3751, + "step": 6031 + }, + { + "epoch": 0.17, + "grad_norm": 1.4384993499265422, + "learning_rate": 9.459140882591365e-06, + "loss": 0.3907, + "step": 6032 + }, + { + "epoch": 0.17, + "grad_norm": 1.568023828844734, + "learning_rate": 9.458928379247762e-06, + "loss": 0.3591, + "step": 6033 + }, + { + "epoch": 0.18, + "grad_norm": 2.4242922380685985, + "learning_rate": 9.458715836554066e-06, + "loss": 0.3747, + "step": 6034 + }, + { + "epoch": 0.18, + "grad_norm": 1.4369703905807039, + "learning_rate": 9.458503254512155e-06, + "loss": 0.4026, + "step": 6035 + }, + { + "epoch": 0.18, + "grad_norm": 1.8063655016118205, + "learning_rate": 9.458290633123904e-06, + "loss": 0.3965, + "step": 6036 + }, + { + "epoch": 0.18, + "grad_norm": 1.8048100615005762, + "learning_rate": 9.458077972391185e-06, + "loss": 0.3786, + "step": 6037 + }, + { + "epoch": 0.18, + "grad_norm": 1.4707352052989269, + "learning_rate": 9.45786527231588e-06, + "loss": 0.3872, + "step": 6038 + }, + { + "epoch": 0.18, + "grad_norm": 1.653874042524329, + "learning_rate": 9.457652532899865e-06, + "loss": 0.3641, + "step": 6039 + }, + { + "epoch": 0.18, + "grad_norm": 1.5414248521097265, + "learning_rate": 9.457439754145017e-06, + "loss": 0.4101, + "step": 6040 + }, + { + "epoch": 0.18, + "grad_norm": 11.264633356585309, + "learning_rate": 9.457226936053214e-06, + "loss": 0.377, + "step": 6041 + }, + { + "epoch": 0.18, + "grad_norm": 1.4385788289298278, + "learning_rate": 9.457014078626334e-06, + "loss": 0.3977, + "step": 6042 + }, + { + "epoch": 0.18, + "grad_norm": 2.835320417867461, + "learning_rate": 9.456801181866256e-06, + "loss": 0.3811, + "step": 6043 + }, + { + "epoch": 0.18, + "grad_norm": 1.4922809075734103, + "learning_rate": 9.456588245774855e-06, + "loss": 0.4137, + "step": 6044 + }, + { + "epoch": 0.18, + "grad_norm": 1.1039238780448217, + "learning_rate": 9.456375270354017e-06, + "loss": 0.6234, + "step": 6045 + }, + { + "epoch": 0.18, + "grad_norm": 1.4159925246017013, + "learning_rate": 9.456162255605616e-06, + "loss": 0.3762, + "step": 6046 + }, + { + "epoch": 0.18, + "grad_norm": 1.0422734969752643, + "learning_rate": 9.455949201531533e-06, + "loss": 0.7196, + "step": 6047 + }, + { + "epoch": 0.18, + "grad_norm": 1.4107848884285328, + "learning_rate": 9.455736108133651e-06, + "loss": 0.3664, + "step": 6048 + }, + { + "epoch": 0.18, + "grad_norm": 1.7544107296698077, + "learning_rate": 9.455522975413846e-06, + "loss": 0.4716, + "step": 6049 + }, + { + "epoch": 0.18, + "grad_norm": 1.4733840836577028, + "learning_rate": 9.455309803374001e-06, + "loss": 0.3909, + "step": 6050 + }, + { + "epoch": 0.18, + "grad_norm": 3.687947380287367, + "learning_rate": 9.455096592016e-06, + "loss": 0.4029, + "step": 6051 + }, + { + "epoch": 0.18, + "grad_norm": 1.0190729472152276, + "learning_rate": 9.454883341341721e-06, + "loss": 0.6491, + "step": 6052 + }, + { + "epoch": 0.18, + "grad_norm": 2.429208883386713, + "learning_rate": 9.454670051353047e-06, + "loss": 0.4278, + "step": 6053 + }, + { + "epoch": 0.18, + "grad_norm": 1.501613405096642, + "learning_rate": 9.45445672205186e-06, + "loss": 0.3718, + "step": 6054 + }, + { + "epoch": 0.18, + "grad_norm": 1.4920291614559322, + "learning_rate": 9.454243353440045e-06, + "loss": 0.4158, + "step": 6055 + }, + { + "epoch": 0.18, + "grad_norm": 1.4021585417096412, + "learning_rate": 9.454029945519481e-06, + "loss": 0.3679, + "step": 6056 + }, + { + "epoch": 0.18, + "grad_norm": 2.668121846771412, + "learning_rate": 9.453816498292054e-06, + "loss": 0.3622, + "step": 6057 + }, + { + "epoch": 0.18, + "grad_norm": 1.5161566788347547, + "learning_rate": 9.453603011759647e-06, + "loss": 0.3911, + "step": 6058 + }, + { + "epoch": 0.18, + "grad_norm": 1.480345171768433, + "learning_rate": 9.453389485924143e-06, + "loss": 0.3666, + "step": 6059 + }, + { + "epoch": 0.18, + "grad_norm": 1.5156013426103399, + "learning_rate": 9.45317592078743e-06, + "loss": 0.3809, + "step": 6060 + }, + { + "epoch": 0.18, + "grad_norm": 1.374914115927646, + "learning_rate": 9.452962316351387e-06, + "loss": 0.3893, + "step": 6061 + }, + { + "epoch": 0.18, + "grad_norm": 1.3680897192189758, + "learning_rate": 9.452748672617904e-06, + "loss": 0.3996, + "step": 6062 + }, + { + "epoch": 0.18, + "grad_norm": 1.818082868950754, + "learning_rate": 9.452534989588864e-06, + "loss": 0.3853, + "step": 6063 + }, + { + "epoch": 0.18, + "grad_norm": 1.7287636953206664, + "learning_rate": 9.452321267266154e-06, + "loss": 0.4114, + "step": 6064 + }, + { + "epoch": 0.18, + "grad_norm": 1.5534985435250166, + "learning_rate": 9.452107505651657e-06, + "loss": 0.414, + "step": 6065 + }, + { + "epoch": 0.18, + "grad_norm": 1.4006406698635885, + "learning_rate": 9.451893704747266e-06, + "loss": 0.4039, + "step": 6066 + }, + { + "epoch": 0.18, + "grad_norm": 1.3814456372207897, + "learning_rate": 9.45167986455486e-06, + "loss": 0.3694, + "step": 6067 + }, + { + "epoch": 0.18, + "grad_norm": 1.6311530413053494, + "learning_rate": 9.451465985076331e-06, + "loss": 0.3965, + "step": 6068 + }, + { + "epoch": 0.18, + "grad_norm": 1.5050026607791642, + "learning_rate": 9.451252066313565e-06, + "loss": 0.3641, + "step": 6069 + }, + { + "epoch": 0.18, + "grad_norm": 1.9635899882206729, + "learning_rate": 9.451038108268451e-06, + "loss": 0.4018, + "step": 6070 + }, + { + "epoch": 0.18, + "grad_norm": 1.4373043371510306, + "learning_rate": 9.450824110942876e-06, + "loss": 0.3791, + "step": 6071 + }, + { + "epoch": 0.18, + "grad_norm": 1.6638491013094643, + "learning_rate": 9.45061007433873e-06, + "loss": 0.4007, + "step": 6072 + }, + { + "epoch": 0.18, + "grad_norm": 1.3972977968410503, + "learning_rate": 9.4503959984579e-06, + "loss": 0.3749, + "step": 6073 + }, + { + "epoch": 0.18, + "grad_norm": 2.3151830258821176, + "learning_rate": 9.450181883302274e-06, + "loss": 0.399, + "step": 6074 + }, + { + "epoch": 0.18, + "grad_norm": 1.4393389101213265, + "learning_rate": 9.449967728873746e-06, + "loss": 0.3772, + "step": 6075 + }, + { + "epoch": 0.18, + "grad_norm": 1.3436897662740945, + "learning_rate": 9.449753535174202e-06, + "loss": 0.3652, + "step": 6076 + }, + { + "epoch": 0.18, + "grad_norm": 1.47444318549519, + "learning_rate": 9.449539302205533e-06, + "loss": 0.389, + "step": 6077 + }, + { + "epoch": 0.18, + "grad_norm": 1.3627784375046312, + "learning_rate": 9.449325029969631e-06, + "loss": 0.3913, + "step": 6078 + }, + { + "epoch": 0.18, + "grad_norm": 1.6584079510886451, + "learning_rate": 9.449110718468389e-06, + "loss": 0.3845, + "step": 6079 + }, + { + "epoch": 0.18, + "grad_norm": 2.073904202715956, + "learning_rate": 9.448896367703693e-06, + "loss": 0.3662, + "step": 6080 + }, + { + "epoch": 0.18, + "grad_norm": 1.3052387168686244, + "learning_rate": 9.448681977677437e-06, + "loss": 0.3632, + "step": 6081 + }, + { + "epoch": 0.18, + "grad_norm": 1.7041806025586617, + "learning_rate": 9.448467548391513e-06, + "loss": 0.373, + "step": 6082 + }, + { + "epoch": 0.18, + "grad_norm": 1.3772573799992656, + "learning_rate": 9.448253079847814e-06, + "loss": 0.3886, + "step": 6083 + }, + { + "epoch": 0.18, + "grad_norm": 1.4852454407898934, + "learning_rate": 9.448038572048233e-06, + "loss": 0.4124, + "step": 6084 + }, + { + "epoch": 0.18, + "grad_norm": 1.553795875802624, + "learning_rate": 9.44782402499466e-06, + "loss": 0.4322, + "step": 6085 + }, + { + "epoch": 0.18, + "grad_norm": 2.8398640262222252, + "learning_rate": 9.447609438688994e-06, + "loss": 0.3734, + "step": 6086 + }, + { + "epoch": 0.18, + "grad_norm": 1.5845679566898914, + "learning_rate": 9.447394813133122e-06, + "loss": 0.3867, + "step": 6087 + }, + { + "epoch": 0.18, + "grad_norm": 1.4384409494923727, + "learning_rate": 9.447180148328944e-06, + "loss": 0.4186, + "step": 6088 + }, + { + "epoch": 0.18, + "grad_norm": 1.498112015951116, + "learning_rate": 9.446965444278351e-06, + "loss": 0.3918, + "step": 6089 + }, + { + "epoch": 0.18, + "grad_norm": 1.8249063246056247, + "learning_rate": 9.446750700983238e-06, + "loss": 0.3929, + "step": 6090 + }, + { + "epoch": 0.18, + "grad_norm": 1.5952128875688951, + "learning_rate": 9.446535918445501e-06, + "loss": 0.3782, + "step": 6091 + }, + { + "epoch": 0.18, + "grad_norm": 1.1491729678155769, + "learning_rate": 9.446321096667036e-06, + "loss": 0.6188, + "step": 6092 + }, + { + "epoch": 0.18, + "grad_norm": 1.4430540882217846, + "learning_rate": 9.446106235649737e-06, + "loss": 0.3991, + "step": 6093 + }, + { + "epoch": 0.18, + "grad_norm": 3.016532330049497, + "learning_rate": 9.445891335395502e-06, + "loss": 0.3904, + "step": 6094 + }, + { + "epoch": 0.18, + "grad_norm": 1.4797162139313744, + "learning_rate": 9.445676395906226e-06, + "loss": 0.372, + "step": 6095 + }, + { + "epoch": 0.18, + "grad_norm": 1.3926902616668633, + "learning_rate": 9.445461417183808e-06, + "loss": 0.3708, + "step": 6096 + }, + { + "epoch": 0.18, + "grad_norm": 1.5558807860898083, + "learning_rate": 9.445246399230141e-06, + "loss": 0.3763, + "step": 6097 + }, + { + "epoch": 0.18, + "grad_norm": 1.4542081197737098, + "learning_rate": 9.445031342047127e-06, + "loss": 0.3776, + "step": 6098 + }, + { + "epoch": 0.18, + "grad_norm": 1.4736341500856518, + "learning_rate": 9.444816245636661e-06, + "loss": 0.3763, + "step": 6099 + }, + { + "epoch": 0.18, + "grad_norm": 1.3799837171333857, + "learning_rate": 9.444601110000644e-06, + "loss": 0.4003, + "step": 6100 + }, + { + "epoch": 0.18, + "grad_norm": 1.7169881107529492, + "learning_rate": 9.444385935140972e-06, + "loss": 0.4234, + "step": 6101 + }, + { + "epoch": 0.18, + "grad_norm": 1.3320630913580662, + "learning_rate": 9.444170721059544e-06, + "loss": 0.3764, + "step": 6102 + }, + { + "epoch": 0.18, + "grad_norm": 1.8041320342738454, + "learning_rate": 9.443955467758261e-06, + "loss": 0.3916, + "step": 6103 + }, + { + "epoch": 0.18, + "grad_norm": 1.054061211041934, + "learning_rate": 9.443740175239023e-06, + "loss": 0.6356, + "step": 6104 + }, + { + "epoch": 0.18, + "grad_norm": 1.9225724982339976, + "learning_rate": 9.443524843503726e-06, + "loss": 0.3767, + "step": 6105 + }, + { + "epoch": 0.18, + "grad_norm": 1.432324651444299, + "learning_rate": 9.443309472554273e-06, + "loss": 0.4176, + "step": 6106 + }, + { + "epoch": 0.18, + "grad_norm": 0.9091204090072528, + "learning_rate": 9.443094062392567e-06, + "loss": 0.6285, + "step": 6107 + }, + { + "epoch": 0.18, + "grad_norm": 1.5252383757703616, + "learning_rate": 9.442878613020506e-06, + "loss": 0.3895, + "step": 6108 + }, + { + "epoch": 0.18, + "grad_norm": 1.5991452813736564, + "learning_rate": 9.44266312443999e-06, + "loss": 0.4179, + "step": 6109 + }, + { + "epoch": 0.18, + "grad_norm": 1.4740157495522725, + "learning_rate": 9.442447596652923e-06, + "loss": 0.3938, + "step": 6110 + }, + { + "epoch": 0.18, + "grad_norm": 1.3462728114542815, + "learning_rate": 9.442232029661208e-06, + "loss": 0.3488, + "step": 6111 + }, + { + "epoch": 0.18, + "grad_norm": 1.6021967876075283, + "learning_rate": 9.442016423466747e-06, + "loss": 0.3764, + "step": 6112 + }, + { + "epoch": 0.18, + "grad_norm": 1.4360172687203627, + "learning_rate": 9.44180077807144e-06, + "loss": 0.367, + "step": 6113 + }, + { + "epoch": 0.18, + "grad_norm": 1.585128020804832, + "learning_rate": 9.441585093477193e-06, + "loss": 0.3807, + "step": 6114 + }, + { + "epoch": 0.18, + "grad_norm": 1.3465012255789583, + "learning_rate": 9.441369369685908e-06, + "loss": 0.3688, + "step": 6115 + }, + { + "epoch": 0.18, + "grad_norm": 1.5611435817512365, + "learning_rate": 9.441153606699488e-06, + "loss": 0.3715, + "step": 6116 + }, + { + "epoch": 0.18, + "grad_norm": 1.3198300383606425, + "learning_rate": 9.44093780451984e-06, + "loss": 0.3744, + "step": 6117 + }, + { + "epoch": 0.18, + "grad_norm": 1.4582162213889813, + "learning_rate": 9.440721963148864e-06, + "loss": 0.3637, + "step": 6118 + }, + { + "epoch": 0.18, + "grad_norm": 1.6084226446112402, + "learning_rate": 9.44050608258847e-06, + "loss": 0.4101, + "step": 6119 + }, + { + "epoch": 0.18, + "grad_norm": 1.54416146546543, + "learning_rate": 9.440290162840559e-06, + "loss": 0.4098, + "step": 6120 + }, + { + "epoch": 0.18, + "grad_norm": 7.462264828401996, + "learning_rate": 9.440074203907038e-06, + "loss": 0.391, + "step": 6121 + }, + { + "epoch": 0.18, + "grad_norm": 1.56068418713072, + "learning_rate": 9.439858205789813e-06, + "loss": 0.4189, + "step": 6122 + }, + { + "epoch": 0.18, + "grad_norm": 1.5052345758196763, + "learning_rate": 9.43964216849079e-06, + "loss": 0.4001, + "step": 6123 + }, + { + "epoch": 0.18, + "grad_norm": 1.4768482189846317, + "learning_rate": 9.439426092011877e-06, + "loss": 0.4014, + "step": 6124 + }, + { + "epoch": 0.18, + "grad_norm": 1.5371971568982257, + "learning_rate": 9.439209976354979e-06, + "loss": 0.4085, + "step": 6125 + }, + { + "epoch": 0.18, + "grad_norm": 7.777171205516386, + "learning_rate": 9.438993821522002e-06, + "loss": 0.4466, + "step": 6126 + }, + { + "epoch": 0.18, + "grad_norm": 1.6708082137440863, + "learning_rate": 9.438777627514858e-06, + "loss": 0.4262, + "step": 6127 + }, + { + "epoch": 0.18, + "grad_norm": 1.4518799711129322, + "learning_rate": 9.43856139433545e-06, + "loss": 0.3902, + "step": 6128 + }, + { + "epoch": 0.18, + "grad_norm": 1.7044281070793372, + "learning_rate": 9.43834512198569e-06, + "loss": 0.4249, + "step": 6129 + }, + { + "epoch": 0.18, + "grad_norm": 1.7931069211305941, + "learning_rate": 9.438128810467484e-06, + "loss": 0.4035, + "step": 6130 + }, + { + "epoch": 0.18, + "grad_norm": 1.3120907019932873, + "learning_rate": 9.437912459782741e-06, + "loss": 0.3904, + "step": 6131 + }, + { + "epoch": 0.18, + "grad_norm": 1.0492521358955853, + "learning_rate": 9.437696069933373e-06, + "loss": 0.5683, + "step": 6132 + }, + { + "epoch": 0.18, + "grad_norm": 1.5000863735698582, + "learning_rate": 9.437479640921288e-06, + "loss": 0.3873, + "step": 6133 + }, + { + "epoch": 0.18, + "grad_norm": 1.60165421566412, + "learning_rate": 9.437263172748396e-06, + "loss": 0.4019, + "step": 6134 + }, + { + "epoch": 0.18, + "grad_norm": 2.613622933794798, + "learning_rate": 9.437046665416605e-06, + "loss": 0.3931, + "step": 6135 + }, + { + "epoch": 0.18, + "grad_norm": 1.3418927162620173, + "learning_rate": 9.436830118927832e-06, + "loss": 0.3889, + "step": 6136 + }, + { + "epoch": 0.18, + "grad_norm": 1.482489452941966, + "learning_rate": 9.43661353328398e-06, + "loss": 0.4007, + "step": 6137 + }, + { + "epoch": 0.18, + "grad_norm": 1.473859299234441, + "learning_rate": 9.436396908486967e-06, + "loss": 0.3689, + "step": 6138 + }, + { + "epoch": 0.18, + "grad_norm": 1.515227047693436, + "learning_rate": 9.436180244538701e-06, + "loss": 0.4094, + "step": 6139 + }, + { + "epoch": 0.18, + "grad_norm": 1.4030034272319623, + "learning_rate": 9.435963541441095e-06, + "loss": 0.376, + "step": 6140 + }, + { + "epoch": 0.18, + "grad_norm": 1.5391609351619204, + "learning_rate": 9.435746799196061e-06, + "loss": 0.3736, + "step": 6141 + }, + { + "epoch": 0.18, + "grad_norm": 1.4625368741871405, + "learning_rate": 9.435530017805514e-06, + "loss": 0.3914, + "step": 6142 + }, + { + "epoch": 0.18, + "grad_norm": 1.367151587311832, + "learning_rate": 9.435313197271364e-06, + "loss": 0.3719, + "step": 6143 + }, + { + "epoch": 0.18, + "grad_norm": 1.285777417240178, + "learning_rate": 9.435096337595526e-06, + "loss": 0.3622, + "step": 6144 + }, + { + "epoch": 0.18, + "grad_norm": 1.4680176585913973, + "learning_rate": 9.434879438779913e-06, + "loss": 0.3678, + "step": 6145 + }, + { + "epoch": 0.18, + "grad_norm": 1.4174506062199117, + "learning_rate": 9.434662500826442e-06, + "loss": 0.3902, + "step": 6146 + }, + { + "epoch": 0.18, + "grad_norm": 1.8643517186396823, + "learning_rate": 9.434445523737024e-06, + "loss": 0.3814, + "step": 6147 + }, + { + "epoch": 0.18, + "grad_norm": 1.6650250291247437, + "learning_rate": 9.434228507513573e-06, + "loss": 0.3744, + "step": 6148 + }, + { + "epoch": 0.18, + "grad_norm": 1.9142262785367217, + "learning_rate": 9.43401145215801e-06, + "loss": 0.3816, + "step": 6149 + }, + { + "epoch": 0.18, + "grad_norm": 1.4194275625154227, + "learning_rate": 9.433794357672245e-06, + "loss": 0.3816, + "step": 6150 + }, + { + "epoch": 0.18, + "grad_norm": 1.3859896972665164, + "learning_rate": 9.433577224058195e-06, + "loss": 0.3859, + "step": 6151 + }, + { + "epoch": 0.18, + "grad_norm": 1.3714039106150657, + "learning_rate": 9.433360051317776e-06, + "loss": 0.408, + "step": 6152 + }, + { + "epoch": 0.18, + "grad_norm": 1.450338154342743, + "learning_rate": 9.43314283945291e-06, + "loss": 0.3808, + "step": 6153 + }, + { + "epoch": 0.18, + "grad_norm": 1.701309169480391, + "learning_rate": 9.432925588465504e-06, + "loss": 0.407, + "step": 6154 + }, + { + "epoch": 0.18, + "grad_norm": 1.1374061158566298, + "learning_rate": 9.432708298357483e-06, + "loss": 0.6003, + "step": 6155 + }, + { + "epoch": 0.18, + "grad_norm": 1.6122459717068256, + "learning_rate": 9.43249096913076e-06, + "loss": 0.418, + "step": 6156 + }, + { + "epoch": 0.18, + "grad_norm": 1.3998315560035937, + "learning_rate": 9.432273600787258e-06, + "loss": 0.3576, + "step": 6157 + }, + { + "epoch": 0.18, + "grad_norm": 1.5692634944036457, + "learning_rate": 9.43205619332889e-06, + "loss": 0.44, + "step": 6158 + }, + { + "epoch": 0.18, + "grad_norm": 1.607614459866989, + "learning_rate": 9.43183874675758e-06, + "loss": 0.4, + "step": 6159 + }, + { + "epoch": 0.18, + "grad_norm": 1.4197888219176236, + "learning_rate": 9.431621261075241e-06, + "loss": 0.4246, + "step": 6160 + }, + { + "epoch": 0.18, + "grad_norm": 1.3467085514600512, + "learning_rate": 9.431403736283795e-06, + "loss": 0.3656, + "step": 6161 + }, + { + "epoch": 0.18, + "grad_norm": 1.4728508104965412, + "learning_rate": 9.431186172385163e-06, + "loss": 0.3908, + "step": 6162 + }, + { + "epoch": 0.18, + "grad_norm": 1.4366112128085944, + "learning_rate": 9.430968569381265e-06, + "loss": 0.384, + "step": 6163 + }, + { + "epoch": 0.18, + "grad_norm": 1.0171407491630688, + "learning_rate": 9.430750927274018e-06, + "loss": 0.5844, + "step": 6164 + }, + { + "epoch": 0.18, + "grad_norm": 1.5486444343812042, + "learning_rate": 9.430533246065345e-06, + "loss": 0.374, + "step": 6165 + }, + { + "epoch": 0.18, + "grad_norm": 1.448402780151091, + "learning_rate": 9.430315525757168e-06, + "loss": 0.3698, + "step": 6166 + }, + { + "epoch": 0.18, + "grad_norm": 1.4055577925904967, + "learning_rate": 9.430097766351407e-06, + "loss": 0.3869, + "step": 6167 + }, + { + "epoch": 0.18, + "grad_norm": 1.4875215820747638, + "learning_rate": 9.429879967849985e-06, + "loss": 0.4065, + "step": 6168 + }, + { + "epoch": 0.18, + "grad_norm": 1.415611690412347, + "learning_rate": 9.429662130254822e-06, + "loss": 0.3488, + "step": 6169 + }, + { + "epoch": 0.18, + "grad_norm": 1.6996094202087748, + "learning_rate": 9.429444253567843e-06, + "loss": 0.388, + "step": 6170 + }, + { + "epoch": 0.18, + "grad_norm": 1.497490227365599, + "learning_rate": 9.429226337790967e-06, + "loss": 0.4273, + "step": 6171 + }, + { + "epoch": 0.18, + "grad_norm": 1.5562645114208604, + "learning_rate": 9.429008382926122e-06, + "loss": 0.4025, + "step": 6172 + }, + { + "epoch": 0.18, + "grad_norm": 1.532073353143843, + "learning_rate": 9.428790388975227e-06, + "loss": 0.4049, + "step": 6173 + }, + { + "epoch": 0.18, + "grad_norm": 1.321125621661292, + "learning_rate": 9.428572355940207e-06, + "loss": 0.3907, + "step": 6174 + }, + { + "epoch": 0.18, + "grad_norm": 1.4197784913963312, + "learning_rate": 9.428354283822989e-06, + "loss": 0.4001, + "step": 6175 + }, + { + "epoch": 0.18, + "grad_norm": 1.3802608893939823, + "learning_rate": 9.428136172625493e-06, + "loss": 0.41, + "step": 6176 + }, + { + "epoch": 0.18, + "grad_norm": 1.6673670749211875, + "learning_rate": 9.427918022349648e-06, + "loss": 0.3725, + "step": 6177 + }, + { + "epoch": 0.18, + "grad_norm": 1.5349458309377793, + "learning_rate": 9.427699832997376e-06, + "loss": 0.406, + "step": 6178 + }, + { + "epoch": 0.18, + "grad_norm": 1.8299233416919123, + "learning_rate": 9.427481604570605e-06, + "loss": 0.4353, + "step": 6179 + }, + { + "epoch": 0.18, + "grad_norm": 1.7695565559612882, + "learning_rate": 9.42726333707126e-06, + "loss": 0.445, + "step": 6180 + }, + { + "epoch": 0.18, + "grad_norm": 1.4209800887510935, + "learning_rate": 9.427045030501268e-06, + "loss": 0.3813, + "step": 6181 + }, + { + "epoch": 0.18, + "grad_norm": 1.3882045853722877, + "learning_rate": 9.42682668486255e-06, + "loss": 0.3762, + "step": 6182 + }, + { + "epoch": 0.18, + "grad_norm": 1.5463593932098298, + "learning_rate": 9.426608300157042e-06, + "loss": 0.3996, + "step": 6183 + }, + { + "epoch": 0.18, + "grad_norm": 1.5710042317684902, + "learning_rate": 9.426389876386664e-06, + "loss": 0.4549, + "step": 6184 + }, + { + "epoch": 0.18, + "grad_norm": 1.3943714995462118, + "learning_rate": 9.426171413553347e-06, + "loss": 0.3833, + "step": 6185 + }, + { + "epoch": 0.18, + "grad_norm": 1.5419148915261205, + "learning_rate": 9.425952911659018e-06, + "loss": 0.3808, + "step": 6186 + }, + { + "epoch": 0.18, + "grad_norm": 1.033172357223029, + "learning_rate": 9.425734370705606e-06, + "loss": 0.6144, + "step": 6187 + }, + { + "epoch": 0.18, + "grad_norm": 1.34235203760835, + "learning_rate": 9.425515790695039e-06, + "loss": 0.3859, + "step": 6188 + }, + { + "epoch": 0.18, + "grad_norm": 1.48887205680112, + "learning_rate": 9.425297171629246e-06, + "loss": 0.4076, + "step": 6189 + }, + { + "epoch": 0.18, + "grad_norm": 1.7616967567861868, + "learning_rate": 9.425078513510157e-06, + "loss": 0.4324, + "step": 6190 + }, + { + "epoch": 0.18, + "grad_norm": 1.4793476009728592, + "learning_rate": 9.4248598163397e-06, + "loss": 0.407, + "step": 6191 + }, + { + "epoch": 0.18, + "grad_norm": 1.3857711623829583, + "learning_rate": 9.424641080119806e-06, + "loss": 0.3889, + "step": 6192 + }, + { + "epoch": 0.18, + "grad_norm": 1.4399730423888264, + "learning_rate": 9.424422304852405e-06, + "loss": 0.3902, + "step": 6193 + }, + { + "epoch": 0.18, + "grad_norm": 1.7529385664858503, + "learning_rate": 9.42420349053943e-06, + "loss": 0.3804, + "step": 6194 + }, + { + "epoch": 0.18, + "grad_norm": 1.624943065860284, + "learning_rate": 9.423984637182807e-06, + "loss": 0.399, + "step": 6195 + }, + { + "epoch": 0.18, + "grad_norm": 1.4746004640073973, + "learning_rate": 9.423765744784475e-06, + "loss": 0.4086, + "step": 6196 + }, + { + "epoch": 0.18, + "grad_norm": 1.624171056771973, + "learning_rate": 9.423546813346358e-06, + "loss": 0.3987, + "step": 6197 + }, + { + "epoch": 0.18, + "grad_norm": 1.6585957276931758, + "learning_rate": 9.42332784287039e-06, + "loss": 0.374, + "step": 6198 + }, + { + "epoch": 0.18, + "grad_norm": 1.617432935020899, + "learning_rate": 9.423108833358508e-06, + "loss": 0.4019, + "step": 6199 + }, + { + "epoch": 0.18, + "grad_norm": 1.080322166617211, + "learning_rate": 9.42288978481264e-06, + "loss": 0.6169, + "step": 6200 + }, + { + "epoch": 0.18, + "grad_norm": 1.4483417171776323, + "learning_rate": 9.422670697234722e-06, + "loss": 0.3945, + "step": 6201 + }, + { + "epoch": 0.18, + "grad_norm": 0.9808769003779013, + "learning_rate": 9.422451570626684e-06, + "loss": 0.583, + "step": 6202 + }, + { + "epoch": 0.18, + "grad_norm": 1.5551643007368907, + "learning_rate": 9.422232404990462e-06, + "loss": 0.3839, + "step": 6203 + }, + { + "epoch": 0.18, + "grad_norm": 1.4872162734660184, + "learning_rate": 9.42201320032799e-06, + "loss": 0.3972, + "step": 6204 + }, + { + "epoch": 0.18, + "grad_norm": 1.0092865352444622, + "learning_rate": 9.421793956641204e-06, + "loss": 0.5835, + "step": 6205 + }, + { + "epoch": 0.18, + "grad_norm": 1.4420478346708208, + "learning_rate": 9.421574673932037e-06, + "loss": 0.3703, + "step": 6206 + }, + { + "epoch": 0.18, + "grad_norm": 1.4211321571038666, + "learning_rate": 9.421355352202423e-06, + "loss": 0.38, + "step": 6207 + }, + { + "epoch": 0.18, + "grad_norm": 1.5044522303700236, + "learning_rate": 9.4211359914543e-06, + "loss": 0.3919, + "step": 6208 + }, + { + "epoch": 0.18, + "grad_norm": 1.3913962836819227, + "learning_rate": 9.420916591689602e-06, + "loss": 0.3764, + "step": 6209 + }, + { + "epoch": 0.18, + "grad_norm": 1.4468148785697401, + "learning_rate": 9.420697152910268e-06, + "loss": 0.3582, + "step": 6210 + }, + { + "epoch": 0.18, + "grad_norm": 1.2631587150791441, + "learning_rate": 9.42047767511823e-06, + "loss": 0.3734, + "step": 6211 + }, + { + "epoch": 0.18, + "grad_norm": 1.3350213743814385, + "learning_rate": 9.420258158315429e-06, + "loss": 0.4006, + "step": 6212 + }, + { + "epoch": 0.18, + "grad_norm": 1.3406867783143186, + "learning_rate": 9.4200386025038e-06, + "loss": 0.3635, + "step": 6213 + }, + { + "epoch": 0.18, + "grad_norm": 1.3572201746800134, + "learning_rate": 9.419819007685282e-06, + "loss": 0.4264, + "step": 6214 + }, + { + "epoch": 0.18, + "grad_norm": 1.714242177776364, + "learning_rate": 9.419599373861813e-06, + "loss": 0.4023, + "step": 6215 + }, + { + "epoch": 0.18, + "grad_norm": 4.3053515431541465, + "learning_rate": 9.419379701035327e-06, + "loss": 0.393, + "step": 6216 + }, + { + "epoch": 0.18, + "grad_norm": 1.4253056263262496, + "learning_rate": 9.419159989207769e-06, + "loss": 0.4047, + "step": 6217 + }, + { + "epoch": 0.18, + "grad_norm": 1.3888681963385536, + "learning_rate": 9.418940238381075e-06, + "loss": 0.406, + "step": 6218 + }, + { + "epoch": 0.18, + "grad_norm": 1.4735005908658163, + "learning_rate": 9.418720448557183e-06, + "loss": 0.3726, + "step": 6219 + }, + { + "epoch": 0.18, + "grad_norm": 1.3899446923201373, + "learning_rate": 9.418500619738035e-06, + "loss": 0.3845, + "step": 6220 + }, + { + "epoch": 0.18, + "grad_norm": 1.6038422571985869, + "learning_rate": 9.41828075192557e-06, + "loss": 0.3989, + "step": 6221 + }, + { + "epoch": 0.18, + "grad_norm": 1.473213076964862, + "learning_rate": 9.418060845121729e-06, + "loss": 0.3945, + "step": 6222 + }, + { + "epoch": 0.18, + "grad_norm": 1.4391097056011677, + "learning_rate": 9.41784089932845e-06, + "loss": 0.4224, + "step": 6223 + }, + { + "epoch": 0.18, + "grad_norm": 2.2535752087527445, + "learning_rate": 9.417620914547678e-06, + "loss": 0.408, + "step": 6224 + }, + { + "epoch": 0.18, + "grad_norm": 1.5387959469281323, + "learning_rate": 9.417400890781353e-06, + "loss": 0.395, + "step": 6225 + }, + { + "epoch": 0.18, + "grad_norm": 1.5650417301759305, + "learning_rate": 9.417180828031414e-06, + "loss": 0.3959, + "step": 6226 + }, + { + "epoch": 0.18, + "grad_norm": 1.3605464553594504, + "learning_rate": 9.416960726299807e-06, + "loss": 0.3782, + "step": 6227 + }, + { + "epoch": 0.18, + "grad_norm": 1.8045582912157385, + "learning_rate": 9.416740585588471e-06, + "loss": 0.3666, + "step": 6228 + }, + { + "epoch": 0.18, + "grad_norm": 1.5144459843977967, + "learning_rate": 9.41652040589935e-06, + "loss": 0.3961, + "step": 6229 + }, + { + "epoch": 0.18, + "grad_norm": 1.5376329546483696, + "learning_rate": 9.41630018723439e-06, + "loss": 0.428, + "step": 6230 + }, + { + "epoch": 0.18, + "grad_norm": 1.3821387535502496, + "learning_rate": 9.41607992959553e-06, + "loss": 0.3747, + "step": 6231 + }, + { + "epoch": 0.18, + "grad_norm": 1.5098914500806468, + "learning_rate": 9.415859632984716e-06, + "loss": 0.3867, + "step": 6232 + }, + { + "epoch": 0.18, + "grad_norm": 1.7331540114691737, + "learning_rate": 9.415639297403891e-06, + "loss": 0.4442, + "step": 6233 + }, + { + "epoch": 0.18, + "grad_norm": 1.871748964021486, + "learning_rate": 9.415418922855002e-06, + "loss": 0.3798, + "step": 6234 + }, + { + "epoch": 0.18, + "grad_norm": 4.2083722239347425, + "learning_rate": 9.415198509339992e-06, + "loss": 0.403, + "step": 6235 + }, + { + "epoch": 0.18, + "grad_norm": 1.4372839378241813, + "learning_rate": 9.414978056860804e-06, + "loss": 0.4126, + "step": 6236 + }, + { + "epoch": 0.18, + "grad_norm": 1.4216043045719284, + "learning_rate": 9.414757565419387e-06, + "loss": 0.3854, + "step": 6237 + }, + { + "epoch": 0.18, + "grad_norm": 1.3974531359842293, + "learning_rate": 9.414537035017686e-06, + "loss": 0.3675, + "step": 6238 + }, + { + "epoch": 0.18, + "grad_norm": 1.4924159343772885, + "learning_rate": 9.414316465657645e-06, + "loss": 0.4019, + "step": 6239 + }, + { + "epoch": 0.18, + "grad_norm": 1.4073056189042135, + "learning_rate": 9.414095857341213e-06, + "loss": 0.3885, + "step": 6240 + }, + { + "epoch": 0.18, + "grad_norm": 1.499156483226212, + "learning_rate": 9.413875210070335e-06, + "loss": 0.3967, + "step": 6241 + }, + { + "epoch": 0.18, + "grad_norm": 1.5998452201168627, + "learning_rate": 9.413654523846962e-06, + "loss": 0.4318, + "step": 6242 + }, + { + "epoch": 0.18, + "grad_norm": 1.5276245788288378, + "learning_rate": 9.413433798673037e-06, + "loss": 0.3867, + "step": 6243 + }, + { + "epoch": 0.18, + "grad_norm": 1.0821460667640759, + "learning_rate": 9.41321303455051e-06, + "loss": 0.629, + "step": 6244 + }, + { + "epoch": 0.18, + "grad_norm": 1.3864331807773778, + "learning_rate": 9.41299223148133e-06, + "loss": 0.361, + "step": 6245 + }, + { + "epoch": 0.18, + "grad_norm": 1.847802425825664, + "learning_rate": 9.412771389467443e-06, + "loss": 0.4109, + "step": 6246 + }, + { + "epoch": 0.18, + "grad_norm": 1.4131429593016716, + "learning_rate": 9.412550508510802e-06, + "loss": 0.4254, + "step": 6247 + }, + { + "epoch": 0.18, + "grad_norm": 1.294979615041221, + "learning_rate": 9.412329588613352e-06, + "loss": 0.364, + "step": 6248 + }, + { + "epoch": 0.18, + "grad_norm": 1.407245243694843, + "learning_rate": 9.412108629777044e-06, + "loss": 0.3591, + "step": 6249 + }, + { + "epoch": 0.18, + "grad_norm": 1.5342824157388226, + "learning_rate": 9.411887632003828e-06, + "loss": 0.3772, + "step": 6250 + }, + { + "epoch": 0.18, + "grad_norm": 1.309881124347106, + "learning_rate": 9.411666595295657e-06, + "loss": 0.3619, + "step": 6251 + }, + { + "epoch": 0.18, + "grad_norm": 1.4962123907632574, + "learning_rate": 9.411445519654478e-06, + "loss": 0.3709, + "step": 6252 + }, + { + "epoch": 0.18, + "grad_norm": 1.575094332326788, + "learning_rate": 9.411224405082244e-06, + "loss": 0.41, + "step": 6253 + }, + { + "epoch": 0.18, + "grad_norm": 1.5115894017688691, + "learning_rate": 9.411003251580905e-06, + "loss": 0.359, + "step": 6254 + }, + { + "epoch": 0.18, + "grad_norm": 1.4397263115422732, + "learning_rate": 9.410782059152414e-06, + "loss": 0.3845, + "step": 6255 + }, + { + "epoch": 0.18, + "grad_norm": 1.390565362840096, + "learning_rate": 9.410560827798721e-06, + "loss": 0.3889, + "step": 6256 + }, + { + "epoch": 0.18, + "grad_norm": 1.440800347412385, + "learning_rate": 9.410339557521782e-06, + "loss": 0.3712, + "step": 6257 + }, + { + "epoch": 0.18, + "grad_norm": 1.3509181875949419, + "learning_rate": 9.410118248323545e-06, + "loss": 0.372, + "step": 6258 + }, + { + "epoch": 0.18, + "grad_norm": 1.3558250236078557, + "learning_rate": 9.409896900205968e-06, + "loss": 0.38, + "step": 6259 + }, + { + "epoch": 0.18, + "grad_norm": 1.3721225158315844, + "learning_rate": 9.409675513171e-06, + "loss": 0.3697, + "step": 6260 + }, + { + "epoch": 0.18, + "grad_norm": 1.5756495387259837, + "learning_rate": 9.409454087220598e-06, + "loss": 0.3702, + "step": 6261 + }, + { + "epoch": 0.18, + "grad_norm": 1.3459752852357962, + "learning_rate": 9.409232622356713e-06, + "loss": 0.3739, + "step": 6262 + }, + { + "epoch": 0.18, + "grad_norm": 1.681923727747168, + "learning_rate": 9.409011118581302e-06, + "loss": 0.3809, + "step": 6263 + }, + { + "epoch": 0.18, + "grad_norm": 1.3900229077154294, + "learning_rate": 9.40878957589632e-06, + "loss": 0.3984, + "step": 6264 + }, + { + "epoch": 0.18, + "grad_norm": 1.364936371737043, + "learning_rate": 9.40856799430372e-06, + "loss": 0.3699, + "step": 6265 + }, + { + "epoch": 0.18, + "grad_norm": 1.4357676634153262, + "learning_rate": 9.408346373805459e-06, + "loss": 0.3951, + "step": 6266 + }, + { + "epoch": 0.18, + "grad_norm": 1.4029529080822682, + "learning_rate": 9.408124714403493e-06, + "loss": 0.3608, + "step": 6267 + }, + { + "epoch": 0.18, + "grad_norm": 2.1556035928672754, + "learning_rate": 9.407903016099775e-06, + "loss": 0.3843, + "step": 6268 + }, + { + "epoch": 0.18, + "grad_norm": 1.614997513661747, + "learning_rate": 9.407681278896267e-06, + "loss": 0.4112, + "step": 6269 + }, + { + "epoch": 0.18, + "grad_norm": 1.578081925548985, + "learning_rate": 9.407459502794921e-06, + "loss": 0.3877, + "step": 6270 + }, + { + "epoch": 0.18, + "grad_norm": 1.8180162071913628, + "learning_rate": 9.407237687797697e-06, + "loss": 0.391, + "step": 6271 + }, + { + "epoch": 0.18, + "grad_norm": 1.5443491236957705, + "learning_rate": 9.40701583390655e-06, + "loss": 0.3992, + "step": 6272 + }, + { + "epoch": 0.18, + "grad_norm": 1.317283588720344, + "learning_rate": 9.406793941123441e-06, + "loss": 0.3706, + "step": 6273 + }, + { + "epoch": 0.18, + "grad_norm": 1.5141524916999791, + "learning_rate": 9.406572009450325e-06, + "loss": 0.4152, + "step": 6274 + }, + { + "epoch": 0.18, + "grad_norm": 1.4133875979584634, + "learning_rate": 9.406350038889163e-06, + "loss": 0.4018, + "step": 6275 + }, + { + "epoch": 0.18, + "grad_norm": 1.412574515607789, + "learning_rate": 9.406128029441914e-06, + "loss": 0.3711, + "step": 6276 + }, + { + "epoch": 0.18, + "grad_norm": 1.5100592461313629, + "learning_rate": 9.405905981110536e-06, + "loss": 0.4135, + "step": 6277 + }, + { + "epoch": 0.18, + "grad_norm": 1.5494003994876597, + "learning_rate": 9.405683893896989e-06, + "loss": 0.3742, + "step": 6278 + }, + { + "epoch": 0.18, + "grad_norm": 1.6863803510136546, + "learning_rate": 9.40546176780323e-06, + "loss": 0.3687, + "step": 6279 + }, + { + "epoch": 0.18, + "grad_norm": 1.9943344670224201, + "learning_rate": 9.405239602831224e-06, + "loss": 0.3791, + "step": 6280 + }, + { + "epoch": 0.18, + "grad_norm": 3.2406727298643783, + "learning_rate": 9.40501739898293e-06, + "loss": 0.3821, + "step": 6281 + }, + { + "epoch": 0.18, + "grad_norm": 1.6683451759403598, + "learning_rate": 9.40479515626031e-06, + "loss": 0.4161, + "step": 6282 + }, + { + "epoch": 0.18, + "grad_norm": 2.075815406070019, + "learning_rate": 9.404572874665322e-06, + "loss": 0.3963, + "step": 6283 + }, + { + "epoch": 0.18, + "grad_norm": 1.47828385043267, + "learning_rate": 9.404350554199928e-06, + "loss": 0.3566, + "step": 6284 + }, + { + "epoch": 0.18, + "grad_norm": 1.9927054362745529, + "learning_rate": 9.404128194866093e-06, + "loss": 0.3837, + "step": 6285 + }, + { + "epoch": 0.18, + "grad_norm": 1.4766652911890363, + "learning_rate": 9.403905796665778e-06, + "loss": 0.3889, + "step": 6286 + }, + { + "epoch": 0.18, + "grad_norm": 1.5425607301896702, + "learning_rate": 9.403683359600946e-06, + "loss": 0.3759, + "step": 6287 + }, + { + "epoch": 0.18, + "grad_norm": 1.5318537333694455, + "learning_rate": 9.40346088367356e-06, + "loss": 0.3917, + "step": 6288 + }, + { + "epoch": 0.18, + "grad_norm": 1.651774386303037, + "learning_rate": 9.403238368885582e-06, + "loss": 0.3674, + "step": 6289 + }, + { + "epoch": 0.18, + "grad_norm": 1.4944766155090934, + "learning_rate": 9.403015815238977e-06, + "loss": 0.3826, + "step": 6290 + }, + { + "epoch": 0.18, + "grad_norm": 1.639282605181195, + "learning_rate": 9.402793222735706e-06, + "loss": 0.3646, + "step": 6291 + }, + { + "epoch": 0.18, + "grad_norm": 1.4169752379508358, + "learning_rate": 9.402570591377738e-06, + "loss": 0.3983, + "step": 6292 + }, + { + "epoch": 0.18, + "grad_norm": 1.3745077624304307, + "learning_rate": 9.402347921167036e-06, + "loss": 0.3976, + "step": 6293 + }, + { + "epoch": 0.18, + "grad_norm": 1.4431794955814994, + "learning_rate": 9.402125212105564e-06, + "loss": 0.4038, + "step": 6294 + }, + { + "epoch": 0.18, + "grad_norm": 1.5619902968271602, + "learning_rate": 9.40190246419529e-06, + "loss": 0.3722, + "step": 6295 + }, + { + "epoch": 0.18, + "grad_norm": 1.5261375125992438, + "learning_rate": 9.401679677438175e-06, + "loss": 0.3985, + "step": 6296 + }, + { + "epoch": 0.18, + "grad_norm": 1.5466227150232839, + "learning_rate": 9.401456851836191e-06, + "loss": 0.3645, + "step": 6297 + }, + { + "epoch": 0.18, + "grad_norm": 1.4182733918163162, + "learning_rate": 9.401233987391298e-06, + "loss": 0.3649, + "step": 6298 + }, + { + "epoch": 0.18, + "grad_norm": 1.7228566121158375, + "learning_rate": 9.40101108410547e-06, + "loss": 0.3924, + "step": 6299 + }, + { + "epoch": 0.18, + "grad_norm": 1.5339298062454474, + "learning_rate": 9.400788141980668e-06, + "loss": 0.3695, + "step": 6300 + }, + { + "epoch": 0.18, + "grad_norm": 1.8711619333768206, + "learning_rate": 9.400565161018861e-06, + "loss": 0.4066, + "step": 6301 + }, + { + "epoch": 0.18, + "grad_norm": 1.5434090546700376, + "learning_rate": 9.400342141222019e-06, + "loss": 0.3722, + "step": 6302 + }, + { + "epoch": 0.18, + "grad_norm": 1.529576649205665, + "learning_rate": 9.40011908259211e-06, + "loss": 0.3832, + "step": 6303 + }, + { + "epoch": 0.18, + "grad_norm": 1.786343142287555, + "learning_rate": 9.3998959851311e-06, + "loss": 0.4047, + "step": 6304 + }, + { + "epoch": 0.18, + "grad_norm": 1.5810328080422558, + "learning_rate": 9.399672848840959e-06, + "loss": 0.3653, + "step": 6305 + }, + { + "epoch": 0.18, + "grad_norm": 1.4011086826957537, + "learning_rate": 9.399449673723656e-06, + "loss": 0.4084, + "step": 6306 + }, + { + "epoch": 0.18, + "grad_norm": 1.5569557161354846, + "learning_rate": 9.399226459781162e-06, + "loss": 0.422, + "step": 6307 + }, + { + "epoch": 0.18, + "grad_norm": 1.8000783257700599, + "learning_rate": 9.399003207015444e-06, + "loss": 0.4056, + "step": 6308 + }, + { + "epoch": 0.18, + "grad_norm": 2.648874775630899, + "learning_rate": 9.398779915428476e-06, + "loss": 0.3953, + "step": 6309 + }, + { + "epoch": 0.18, + "grad_norm": 1.8555093058717067, + "learning_rate": 9.398556585022224e-06, + "loss": 0.4044, + "step": 6310 + }, + { + "epoch": 0.18, + "grad_norm": 1.3713232715951673, + "learning_rate": 9.398333215798663e-06, + "loss": 0.4039, + "step": 6311 + }, + { + "epoch": 0.18, + "grad_norm": 1.4255535239043238, + "learning_rate": 9.398109807759764e-06, + "loss": 0.3626, + "step": 6312 + }, + { + "epoch": 0.18, + "grad_norm": 1.4004603766803312, + "learning_rate": 9.397886360907493e-06, + "loss": 0.3668, + "step": 6313 + }, + { + "epoch": 0.18, + "grad_norm": 1.5826201088870104, + "learning_rate": 9.39766287524383e-06, + "loss": 0.3628, + "step": 6314 + }, + { + "epoch": 0.18, + "grad_norm": 1.4896062812175432, + "learning_rate": 9.397439350770742e-06, + "loss": 0.3843, + "step": 6315 + }, + { + "epoch": 0.18, + "grad_norm": 1.4921844485882403, + "learning_rate": 9.397215787490202e-06, + "loss": 0.3752, + "step": 6316 + }, + { + "epoch": 0.18, + "grad_norm": 1.7725792417522706, + "learning_rate": 9.396992185404185e-06, + "loss": 0.4194, + "step": 6317 + }, + { + "epoch": 0.18, + "grad_norm": 1.8415882255046985, + "learning_rate": 9.396768544514663e-06, + "loss": 0.3898, + "step": 6318 + }, + { + "epoch": 0.18, + "grad_norm": 2.7928068740183227, + "learning_rate": 9.39654486482361e-06, + "loss": 0.3765, + "step": 6319 + }, + { + "epoch": 0.18, + "grad_norm": 1.4233558836189655, + "learning_rate": 9.396321146332998e-06, + "loss": 0.4009, + "step": 6320 + }, + { + "epoch": 0.18, + "grad_norm": 1.424841164319386, + "learning_rate": 9.396097389044804e-06, + "loss": 0.3885, + "step": 6321 + }, + { + "epoch": 0.18, + "grad_norm": 1.6647542496358105, + "learning_rate": 9.395873592961004e-06, + "loss": 0.3889, + "step": 6322 + }, + { + "epoch": 0.18, + "grad_norm": 1.713666113848817, + "learning_rate": 9.395649758083569e-06, + "loss": 0.3833, + "step": 6323 + }, + { + "epoch": 0.18, + "grad_norm": 1.4410025594750997, + "learning_rate": 9.395425884414476e-06, + "loss": 0.3758, + "step": 6324 + }, + { + "epoch": 0.18, + "grad_norm": 1.8029443980770354, + "learning_rate": 9.395201971955701e-06, + "loss": 0.3796, + "step": 6325 + }, + { + "epoch": 0.18, + "grad_norm": 1.3720644689835284, + "learning_rate": 9.39497802070922e-06, + "loss": 0.3803, + "step": 6326 + }, + { + "epoch": 0.18, + "grad_norm": 1.525945866964546, + "learning_rate": 9.394754030677009e-06, + "loss": 0.3801, + "step": 6327 + }, + { + "epoch": 0.18, + "grad_norm": 1.4563876419125692, + "learning_rate": 9.394530001861047e-06, + "loss": 0.3938, + "step": 6328 + }, + { + "epoch": 0.18, + "grad_norm": 1.3905781324191195, + "learning_rate": 9.394305934263307e-06, + "loss": 0.3745, + "step": 6329 + }, + { + "epoch": 0.18, + "grad_norm": 1.6083479850185716, + "learning_rate": 9.394081827885768e-06, + "loss": 0.3846, + "step": 6330 + }, + { + "epoch": 0.18, + "grad_norm": 1.3829415577958584, + "learning_rate": 9.393857682730408e-06, + "loss": 0.3995, + "step": 6331 + }, + { + "epoch": 0.18, + "grad_norm": 1.4120468142163694, + "learning_rate": 9.393633498799206e-06, + "loss": 0.3953, + "step": 6332 + }, + { + "epoch": 0.18, + "grad_norm": 1.3425649562352635, + "learning_rate": 9.393409276094138e-06, + "loss": 0.3771, + "step": 6333 + }, + { + "epoch": 0.18, + "grad_norm": 1.938611184257305, + "learning_rate": 9.393185014617187e-06, + "loss": 0.3842, + "step": 6334 + }, + { + "epoch": 0.18, + "grad_norm": 1.7852761139739142, + "learning_rate": 9.392960714370327e-06, + "loss": 0.3924, + "step": 6335 + }, + { + "epoch": 0.18, + "grad_norm": 1.4646109733518682, + "learning_rate": 9.392736375355541e-06, + "loss": 0.3803, + "step": 6336 + }, + { + "epoch": 0.18, + "grad_norm": 1.5255033841024033, + "learning_rate": 9.392511997574809e-06, + "loss": 0.4098, + "step": 6337 + }, + { + "epoch": 0.18, + "grad_norm": 1.8627402502958257, + "learning_rate": 9.39228758103011e-06, + "loss": 0.4064, + "step": 6338 + }, + { + "epoch": 0.18, + "grad_norm": 1.3308903890244863, + "learning_rate": 9.392063125723423e-06, + "loss": 0.4043, + "step": 6339 + }, + { + "epoch": 0.18, + "grad_norm": 1.548057226949686, + "learning_rate": 9.39183863165673e-06, + "loss": 0.4282, + "step": 6340 + }, + { + "epoch": 0.18, + "grad_norm": 1.3189219113777388, + "learning_rate": 9.391614098832013e-06, + "loss": 0.3588, + "step": 6341 + }, + { + "epoch": 0.18, + "grad_norm": 1.5017160833224075, + "learning_rate": 9.391389527251253e-06, + "loss": 0.3755, + "step": 6342 + }, + { + "epoch": 0.18, + "grad_norm": 1.003871335929373, + "learning_rate": 9.39116491691643e-06, + "loss": 0.5938, + "step": 6343 + }, + { + "epoch": 0.18, + "grad_norm": 1.3902576622500056, + "learning_rate": 9.39094026782953e-06, + "loss": 0.363, + "step": 6344 + }, + { + "epoch": 0.18, + "grad_norm": 1.496776567436097, + "learning_rate": 9.390715579992533e-06, + "loss": 0.4103, + "step": 6345 + }, + { + "epoch": 0.18, + "grad_norm": 3.557821260457942, + "learning_rate": 9.390490853407421e-06, + "loss": 0.4273, + "step": 6346 + }, + { + "epoch": 0.18, + "grad_norm": 1.368321041561715, + "learning_rate": 9.39026608807618e-06, + "loss": 0.3855, + "step": 6347 + }, + { + "epoch": 0.18, + "grad_norm": 2.930690018531611, + "learning_rate": 9.390041284000793e-06, + "loss": 0.4121, + "step": 6348 + }, + { + "epoch": 0.18, + "grad_norm": 1.3953862887348987, + "learning_rate": 9.38981644118324e-06, + "loss": 0.3999, + "step": 6349 + }, + { + "epoch": 0.18, + "grad_norm": 1.545474762581967, + "learning_rate": 9.389591559625511e-06, + "loss": 0.3602, + "step": 6350 + }, + { + "epoch": 0.18, + "grad_norm": 1.4967954178069305, + "learning_rate": 9.389366639329587e-06, + "loss": 0.3719, + "step": 6351 + }, + { + "epoch": 0.18, + "grad_norm": 1.6503962903973362, + "learning_rate": 9.389141680297453e-06, + "loss": 0.4235, + "step": 6352 + }, + { + "epoch": 0.18, + "grad_norm": 1.5258516178496726, + "learning_rate": 9.388916682531095e-06, + "loss": 0.3794, + "step": 6353 + }, + { + "epoch": 0.18, + "grad_norm": 1.3951337313002783, + "learning_rate": 9.388691646032499e-06, + "loss": 0.3547, + "step": 6354 + }, + { + "epoch": 0.18, + "grad_norm": 1.4921263578887727, + "learning_rate": 9.38846657080365e-06, + "loss": 0.3779, + "step": 6355 + }, + { + "epoch": 0.18, + "grad_norm": 1.3113001011841758, + "learning_rate": 9.388241456846536e-06, + "loss": 0.3988, + "step": 6356 + }, + { + "epoch": 0.18, + "grad_norm": 1.4635122683331934, + "learning_rate": 9.38801630416314e-06, + "loss": 0.3816, + "step": 6357 + }, + { + "epoch": 0.18, + "grad_norm": 1.404276342687973, + "learning_rate": 9.387791112755453e-06, + "loss": 0.382, + "step": 6358 + }, + { + "epoch": 0.18, + "grad_norm": 1.3608932711781396, + "learning_rate": 9.387565882625462e-06, + "loss": 0.3656, + "step": 6359 + }, + { + "epoch": 0.18, + "grad_norm": 1.6503782614847766, + "learning_rate": 9.387340613775153e-06, + "loss": 0.4406, + "step": 6360 + }, + { + "epoch": 0.18, + "grad_norm": 1.3464046694551353, + "learning_rate": 9.387115306206513e-06, + "loss": 0.3747, + "step": 6361 + }, + { + "epoch": 0.18, + "grad_norm": 1.4071880256390934, + "learning_rate": 9.386889959921532e-06, + "loss": 0.3794, + "step": 6362 + }, + { + "epoch": 0.18, + "grad_norm": 1.4608390771308033, + "learning_rate": 9.3866645749222e-06, + "loss": 0.3973, + "step": 6363 + }, + { + "epoch": 0.18, + "grad_norm": 1.4857435983091802, + "learning_rate": 9.386439151210503e-06, + "loss": 0.4014, + "step": 6364 + }, + { + "epoch": 0.18, + "grad_norm": 1.605160184342833, + "learning_rate": 9.386213688788432e-06, + "loss": 0.3858, + "step": 6365 + }, + { + "epoch": 0.18, + "grad_norm": 2.196717220825788, + "learning_rate": 9.385988187657978e-06, + "loss": 0.371, + "step": 6366 + }, + { + "epoch": 0.18, + "grad_norm": 1.4368480982189955, + "learning_rate": 9.385762647821125e-06, + "loss": 0.3832, + "step": 6367 + }, + { + "epoch": 0.18, + "grad_norm": 1.6459900421362075, + "learning_rate": 9.385537069279873e-06, + "loss": 0.3859, + "step": 6368 + }, + { + "epoch": 0.18, + "grad_norm": 1.3502829330925463, + "learning_rate": 9.385311452036205e-06, + "loss": 0.3695, + "step": 6369 + }, + { + "epoch": 0.18, + "grad_norm": 1.36882354549488, + "learning_rate": 9.385085796092116e-06, + "loss": 0.384, + "step": 6370 + }, + { + "epoch": 0.18, + "grad_norm": 1.408896901198847, + "learning_rate": 9.384860101449598e-06, + "loss": 0.3843, + "step": 6371 + }, + { + "epoch": 0.18, + "grad_norm": 1.4572854044482517, + "learning_rate": 9.384634368110637e-06, + "loss": 0.3836, + "step": 6372 + }, + { + "epoch": 0.18, + "grad_norm": 1.4690179692395335, + "learning_rate": 9.384408596077232e-06, + "loss": 0.3789, + "step": 6373 + }, + { + "epoch": 0.18, + "grad_norm": 1.6313451590788846, + "learning_rate": 9.384182785351371e-06, + "loss": 0.4041, + "step": 6374 + }, + { + "epoch": 0.18, + "grad_norm": 1.5446387828186636, + "learning_rate": 9.383956935935049e-06, + "loss": 0.3857, + "step": 6375 + }, + { + "epoch": 0.18, + "grad_norm": 1.475789318193104, + "learning_rate": 9.383731047830258e-06, + "loss": 0.3803, + "step": 6376 + }, + { + "epoch": 0.18, + "grad_norm": 1.4437827812262864, + "learning_rate": 9.383505121038992e-06, + "loss": 0.4269, + "step": 6377 + }, + { + "epoch": 0.18, + "grad_norm": 1.5486185465689086, + "learning_rate": 9.383279155563245e-06, + "loss": 0.411, + "step": 6378 + }, + { + "epoch": 0.19, + "grad_norm": 1.5976100457178335, + "learning_rate": 9.383053151405011e-06, + "loss": 0.3825, + "step": 6379 + }, + { + "epoch": 0.19, + "grad_norm": 1.402059204300273, + "learning_rate": 9.382827108566283e-06, + "loss": 0.3698, + "step": 6380 + }, + { + "epoch": 0.19, + "grad_norm": 1.5767293638204256, + "learning_rate": 9.382601027049059e-06, + "loss": 0.4085, + "step": 6381 + }, + { + "epoch": 0.19, + "grad_norm": 1.420374709471692, + "learning_rate": 9.382374906855331e-06, + "loss": 0.3646, + "step": 6382 + }, + { + "epoch": 0.19, + "grad_norm": 1.3998853270939813, + "learning_rate": 9.382148747987097e-06, + "loss": 0.3966, + "step": 6383 + }, + { + "epoch": 0.19, + "grad_norm": 2.184163399887005, + "learning_rate": 9.381922550446349e-06, + "loss": 0.3766, + "step": 6384 + }, + { + "epoch": 0.19, + "grad_norm": 1.4045187591626065, + "learning_rate": 9.38169631423509e-06, + "loss": 0.4133, + "step": 6385 + }, + { + "epoch": 0.19, + "grad_norm": 1.4046309186634556, + "learning_rate": 9.38147003935531e-06, + "loss": 0.395, + "step": 6386 + }, + { + "epoch": 0.19, + "grad_norm": 1.3939512763817286, + "learning_rate": 9.381243725809009e-06, + "loss": 0.3651, + "step": 6387 + }, + { + "epoch": 0.19, + "grad_norm": 1.414243908655285, + "learning_rate": 9.381017373598181e-06, + "loss": 0.4251, + "step": 6388 + }, + { + "epoch": 0.19, + "grad_norm": 1.533416454168378, + "learning_rate": 9.38079098272483e-06, + "loss": 0.4035, + "step": 6389 + }, + { + "epoch": 0.19, + "grad_norm": 1.5242559007281786, + "learning_rate": 9.380564553190948e-06, + "loss": 0.3771, + "step": 6390 + }, + { + "epoch": 0.19, + "grad_norm": 1.4182121072421885, + "learning_rate": 9.380338084998534e-06, + "loss": 0.3661, + "step": 6391 + }, + { + "epoch": 0.19, + "grad_norm": 1.4221083010468865, + "learning_rate": 9.380111578149589e-06, + "loss": 0.3835, + "step": 6392 + }, + { + "epoch": 0.19, + "grad_norm": 1.4889036324623237, + "learning_rate": 9.37988503264611e-06, + "loss": 0.4201, + "step": 6393 + }, + { + "epoch": 0.19, + "grad_norm": 1.5759544405814372, + "learning_rate": 9.3796584484901e-06, + "loss": 0.415, + "step": 6394 + }, + { + "epoch": 0.19, + "grad_norm": 1.6296238400526135, + "learning_rate": 9.379431825683551e-06, + "loss": 0.3798, + "step": 6395 + }, + { + "epoch": 0.19, + "grad_norm": 1.480115895690805, + "learning_rate": 9.37920516422847e-06, + "loss": 0.3933, + "step": 6396 + }, + { + "epoch": 0.19, + "grad_norm": 1.5126074073772833, + "learning_rate": 9.378978464126856e-06, + "loss": 0.3815, + "step": 6397 + }, + { + "epoch": 0.19, + "grad_norm": 1.5158287087192672, + "learning_rate": 9.378751725380708e-06, + "loss": 0.3982, + "step": 6398 + }, + { + "epoch": 0.19, + "grad_norm": 1.453355428048618, + "learning_rate": 9.378524947992026e-06, + "loss": 0.3796, + "step": 6399 + }, + { + "epoch": 0.19, + "grad_norm": 1.4970433665010403, + "learning_rate": 9.378298131962813e-06, + "loss": 0.3462, + "step": 6400 + }, + { + "epoch": 0.19, + "grad_norm": 1.4382986665286372, + "learning_rate": 9.378071277295071e-06, + "loss": 0.375, + "step": 6401 + }, + { + "epoch": 0.19, + "grad_norm": 1.4003906602109004, + "learning_rate": 9.377844383990802e-06, + "loss": 0.3949, + "step": 6402 + }, + { + "epoch": 0.19, + "grad_norm": 1.4912504917057248, + "learning_rate": 9.377617452052007e-06, + "loss": 0.3514, + "step": 6403 + }, + { + "epoch": 0.19, + "grad_norm": 1.5090971408882943, + "learning_rate": 9.377390481480689e-06, + "loss": 0.363, + "step": 6404 + }, + { + "epoch": 0.19, + "grad_norm": 1.510546145104496, + "learning_rate": 9.377163472278852e-06, + "loss": 0.4139, + "step": 6405 + }, + { + "epoch": 0.19, + "grad_norm": 1.4983044964367913, + "learning_rate": 9.3769364244485e-06, + "loss": 0.3936, + "step": 6406 + }, + { + "epoch": 0.19, + "grad_norm": 1.43888946453237, + "learning_rate": 9.376709337991634e-06, + "loss": 0.3916, + "step": 6407 + }, + { + "epoch": 0.19, + "grad_norm": 1.4370825490555539, + "learning_rate": 9.376482212910259e-06, + "loss": 0.4549, + "step": 6408 + }, + { + "epoch": 0.19, + "grad_norm": 1.5367656662902978, + "learning_rate": 9.37625504920638e-06, + "loss": 0.4092, + "step": 6409 + }, + { + "epoch": 0.19, + "grad_norm": 1.4754148632718647, + "learning_rate": 9.376027846882003e-06, + "loss": 0.3927, + "step": 6410 + }, + { + "epoch": 0.19, + "grad_norm": 1.5864330662250055, + "learning_rate": 9.375800605939131e-06, + "loss": 0.4062, + "step": 6411 + }, + { + "epoch": 0.19, + "grad_norm": 1.0526776600887005, + "learning_rate": 9.37557332637977e-06, + "loss": 0.5971, + "step": 6412 + }, + { + "epoch": 0.19, + "grad_norm": 1.8996321084608847, + "learning_rate": 9.375346008205925e-06, + "loss": 0.4192, + "step": 6413 + }, + { + "epoch": 0.19, + "grad_norm": 1.8882259786531201, + "learning_rate": 9.375118651419606e-06, + "loss": 0.3803, + "step": 6414 + }, + { + "epoch": 0.19, + "grad_norm": 1.3341231996881606, + "learning_rate": 9.374891256022813e-06, + "loss": 0.3846, + "step": 6415 + }, + { + "epoch": 0.19, + "grad_norm": 1.5922790018314144, + "learning_rate": 9.374663822017558e-06, + "loss": 0.3853, + "step": 6416 + }, + { + "epoch": 0.19, + "grad_norm": 1.6302462350687403, + "learning_rate": 9.374436349405847e-06, + "loss": 0.3865, + "step": 6417 + }, + { + "epoch": 0.19, + "grad_norm": 1.4391316568666772, + "learning_rate": 9.374208838189684e-06, + "loss": 0.3836, + "step": 6418 + }, + { + "epoch": 0.19, + "grad_norm": 0.9801241389776965, + "learning_rate": 9.373981288371081e-06, + "loss": 0.6084, + "step": 6419 + }, + { + "epoch": 0.19, + "grad_norm": 3.678784818388282, + "learning_rate": 9.373753699952044e-06, + "loss": 0.3823, + "step": 6420 + }, + { + "epoch": 0.19, + "grad_norm": 1.4292990930464773, + "learning_rate": 9.373526072934584e-06, + "loss": 0.3857, + "step": 6421 + }, + { + "epoch": 0.19, + "grad_norm": 1.6058248750193227, + "learning_rate": 9.373298407320706e-06, + "loss": 0.4113, + "step": 6422 + }, + { + "epoch": 0.19, + "grad_norm": 1.5512460496340938, + "learning_rate": 9.373070703112422e-06, + "loss": 0.3721, + "step": 6423 + }, + { + "epoch": 0.19, + "grad_norm": 1.7368953261955808, + "learning_rate": 9.372842960311738e-06, + "loss": 0.3899, + "step": 6424 + }, + { + "epoch": 0.19, + "grad_norm": 1.6617964910243308, + "learning_rate": 9.37261517892067e-06, + "loss": 0.3806, + "step": 6425 + }, + { + "epoch": 0.19, + "grad_norm": 1.3907215938736741, + "learning_rate": 9.372387358941223e-06, + "loss": 0.3859, + "step": 6426 + }, + { + "epoch": 0.19, + "grad_norm": 3.1754655914401293, + "learning_rate": 9.372159500375409e-06, + "loss": 0.3764, + "step": 6427 + }, + { + "epoch": 0.19, + "grad_norm": 1.5199036895769444, + "learning_rate": 9.371931603225237e-06, + "loss": 0.4045, + "step": 6428 + }, + { + "epoch": 0.19, + "grad_norm": 1.715871328096641, + "learning_rate": 9.371703667492724e-06, + "loss": 0.4111, + "step": 6429 + }, + { + "epoch": 0.19, + "grad_norm": 2.0410372295429977, + "learning_rate": 9.371475693179875e-06, + "loss": 0.3847, + "step": 6430 + }, + { + "epoch": 0.19, + "grad_norm": 2.0472166685845896, + "learning_rate": 9.371247680288704e-06, + "loss": 0.4098, + "step": 6431 + }, + { + "epoch": 0.19, + "grad_norm": 1.340805146961474, + "learning_rate": 9.371019628821225e-06, + "loss": 0.3654, + "step": 6432 + }, + { + "epoch": 0.19, + "grad_norm": 1.4751738297432522, + "learning_rate": 9.37079153877945e-06, + "loss": 0.4023, + "step": 6433 + }, + { + "epoch": 0.19, + "grad_norm": 2.6803229490417833, + "learning_rate": 9.37056341016539e-06, + "loss": 0.3916, + "step": 6434 + }, + { + "epoch": 0.19, + "grad_norm": 1.4689918262914126, + "learning_rate": 9.37033524298106e-06, + "loss": 0.3438, + "step": 6435 + }, + { + "epoch": 0.19, + "grad_norm": 1.5849607460998363, + "learning_rate": 9.370107037228473e-06, + "loss": 0.3852, + "step": 6436 + }, + { + "epoch": 0.19, + "grad_norm": 1.3862815132113162, + "learning_rate": 9.369878792909642e-06, + "loss": 0.3613, + "step": 6437 + }, + { + "epoch": 0.19, + "grad_norm": 1.4807816055272751, + "learning_rate": 9.369650510026584e-06, + "loss": 0.3744, + "step": 6438 + }, + { + "epoch": 0.19, + "grad_norm": 1.468712884859972, + "learning_rate": 9.36942218858131e-06, + "loss": 0.4245, + "step": 6439 + }, + { + "epoch": 0.19, + "grad_norm": 1.4750925662038836, + "learning_rate": 9.369193828575838e-06, + "loss": 0.3934, + "step": 6440 + }, + { + "epoch": 0.19, + "grad_norm": 2.071709236696693, + "learning_rate": 9.36896543001218e-06, + "loss": 0.3737, + "step": 6441 + }, + { + "epoch": 0.19, + "grad_norm": 1.307758069044056, + "learning_rate": 9.368736992892355e-06, + "loss": 0.352, + "step": 6442 + }, + { + "epoch": 0.19, + "grad_norm": 1.7367690225916994, + "learning_rate": 9.368508517218377e-06, + "loss": 0.3621, + "step": 6443 + }, + { + "epoch": 0.19, + "grad_norm": 1.3799708119807117, + "learning_rate": 9.368280002992264e-06, + "loss": 0.3649, + "step": 6444 + }, + { + "epoch": 0.19, + "grad_norm": 1.0167311841414346, + "learning_rate": 9.368051450216032e-06, + "loss": 0.6226, + "step": 6445 + }, + { + "epoch": 0.19, + "grad_norm": 1.364000497127217, + "learning_rate": 9.367822858891695e-06, + "loss": 0.3727, + "step": 6446 + }, + { + "epoch": 0.19, + "grad_norm": 1.5540810172646708, + "learning_rate": 9.367594229021275e-06, + "loss": 0.3743, + "step": 6447 + }, + { + "epoch": 0.19, + "grad_norm": 1.5358886154388365, + "learning_rate": 9.367365560606786e-06, + "loss": 0.3546, + "step": 6448 + }, + { + "epoch": 0.19, + "grad_norm": 1.9563394994598151, + "learning_rate": 9.367136853650249e-06, + "loss": 0.3975, + "step": 6449 + }, + { + "epoch": 0.19, + "grad_norm": 1.3274857600729169, + "learning_rate": 9.36690810815368e-06, + "loss": 0.3959, + "step": 6450 + }, + { + "epoch": 0.19, + "grad_norm": 1.3686572650827709, + "learning_rate": 9.3666793241191e-06, + "loss": 0.3492, + "step": 6451 + }, + { + "epoch": 0.19, + "grad_norm": 1.4154383829689114, + "learning_rate": 9.366450501548522e-06, + "loss": 0.387, + "step": 6452 + }, + { + "epoch": 0.19, + "grad_norm": 1.4393611577650482, + "learning_rate": 9.366221640443974e-06, + "loss": 0.3744, + "step": 6453 + }, + { + "epoch": 0.19, + "grad_norm": 1.4850499021715464, + "learning_rate": 9.36599274080747e-06, + "loss": 0.396, + "step": 6454 + }, + { + "epoch": 0.19, + "grad_norm": 1.815641629522184, + "learning_rate": 9.365763802641033e-06, + "loss": 0.377, + "step": 6455 + }, + { + "epoch": 0.19, + "grad_norm": 1.5931624636277841, + "learning_rate": 9.36553482594668e-06, + "loss": 0.3799, + "step": 6456 + }, + { + "epoch": 0.19, + "grad_norm": 1.4847700380422635, + "learning_rate": 9.365305810726435e-06, + "loss": 0.3727, + "step": 6457 + }, + { + "epoch": 0.19, + "grad_norm": 1.4682939679385998, + "learning_rate": 9.365076756982319e-06, + "loss": 0.3798, + "step": 6458 + }, + { + "epoch": 0.19, + "grad_norm": 1.3871980043999868, + "learning_rate": 9.36484766471635e-06, + "loss": 0.3643, + "step": 6459 + }, + { + "epoch": 0.19, + "grad_norm": 1.63811149616507, + "learning_rate": 9.364618533930553e-06, + "loss": 0.4099, + "step": 6460 + }, + { + "epoch": 0.19, + "grad_norm": 1.8771066365736007, + "learning_rate": 9.36438936462695e-06, + "loss": 0.3752, + "step": 6461 + }, + { + "epoch": 0.19, + "grad_norm": 1.6878428014913542, + "learning_rate": 9.36416015680756e-06, + "loss": 0.387, + "step": 6462 + }, + { + "epoch": 0.19, + "grad_norm": 1.6476886223856, + "learning_rate": 9.36393091047441e-06, + "loss": 0.3961, + "step": 6463 + }, + { + "epoch": 0.19, + "grad_norm": 1.4533511332680773, + "learning_rate": 9.36370162562952e-06, + "loss": 0.3695, + "step": 6464 + }, + { + "epoch": 0.19, + "grad_norm": 1.583385833115519, + "learning_rate": 9.363472302274916e-06, + "loss": 0.3596, + "step": 6465 + }, + { + "epoch": 0.19, + "grad_norm": 1.6926776540255095, + "learning_rate": 9.363242940412621e-06, + "loss": 0.3882, + "step": 6466 + }, + { + "epoch": 0.19, + "grad_norm": 1.4298531182419565, + "learning_rate": 9.363013540044657e-06, + "loss": 0.3758, + "step": 6467 + }, + { + "epoch": 0.19, + "grad_norm": 1.5377132562635791, + "learning_rate": 9.36278410117305e-06, + "loss": 0.3728, + "step": 6468 + }, + { + "epoch": 0.19, + "grad_norm": 1.332942033558245, + "learning_rate": 9.362554623799826e-06, + "loss": 0.371, + "step": 6469 + }, + { + "epoch": 0.19, + "grad_norm": 1.490012081523726, + "learning_rate": 9.362325107927009e-06, + "loss": 0.3838, + "step": 6470 + }, + { + "epoch": 0.19, + "grad_norm": 1.1011804803948426, + "learning_rate": 9.362095553556625e-06, + "loss": 0.62, + "step": 6471 + }, + { + "epoch": 0.19, + "grad_norm": 1.7554967530079446, + "learning_rate": 9.361865960690696e-06, + "loss": 0.3819, + "step": 6472 + }, + { + "epoch": 0.19, + "grad_norm": 1.4097391547989122, + "learning_rate": 9.361636329331257e-06, + "loss": 0.3687, + "step": 6473 + }, + { + "epoch": 0.19, + "grad_norm": 1.530162329558319, + "learning_rate": 9.361406659480325e-06, + "loss": 0.4066, + "step": 6474 + }, + { + "epoch": 0.19, + "grad_norm": 1.955997039110596, + "learning_rate": 9.361176951139933e-06, + "loss": 0.406, + "step": 6475 + }, + { + "epoch": 0.19, + "grad_norm": 1.5299554942578155, + "learning_rate": 9.360947204312103e-06, + "loss": 0.3929, + "step": 6476 + }, + { + "epoch": 0.19, + "grad_norm": 1.4141127913101854, + "learning_rate": 9.360717418998867e-06, + "loss": 0.3618, + "step": 6477 + }, + { + "epoch": 0.19, + "grad_norm": 1.4692938622809892, + "learning_rate": 9.360487595202253e-06, + "loss": 0.3827, + "step": 6478 + }, + { + "epoch": 0.19, + "grad_norm": 1.4198261588276926, + "learning_rate": 9.360257732924287e-06, + "loss": 0.4076, + "step": 6479 + }, + { + "epoch": 0.19, + "grad_norm": 1.4646560334228123, + "learning_rate": 9.360027832166998e-06, + "loss": 0.377, + "step": 6480 + }, + { + "epoch": 0.19, + "grad_norm": 1.4145177356135772, + "learning_rate": 9.359797892932414e-06, + "loss": 0.3932, + "step": 6481 + }, + { + "epoch": 0.19, + "grad_norm": 1.3091977897064417, + "learning_rate": 9.359567915222566e-06, + "loss": 0.4294, + "step": 6482 + }, + { + "epoch": 0.19, + "grad_norm": 1.1604130609344079, + "learning_rate": 9.359337899039483e-06, + "loss": 0.6443, + "step": 6483 + }, + { + "epoch": 0.19, + "grad_norm": 1.335646082032203, + "learning_rate": 9.359107844385194e-06, + "loss": 0.3845, + "step": 6484 + }, + { + "epoch": 0.19, + "grad_norm": 1.5546248682061545, + "learning_rate": 9.35887775126173e-06, + "loss": 0.3978, + "step": 6485 + }, + { + "epoch": 0.19, + "grad_norm": 1.6364541081210877, + "learning_rate": 9.358647619671123e-06, + "loss": 0.3663, + "step": 6486 + }, + { + "epoch": 0.19, + "grad_norm": 1.7662475504658428, + "learning_rate": 9.3584174496154e-06, + "loss": 0.3629, + "step": 6487 + }, + { + "epoch": 0.19, + "grad_norm": 1.7318582951755972, + "learning_rate": 9.358187241096598e-06, + "loss": 0.3983, + "step": 6488 + }, + { + "epoch": 0.19, + "grad_norm": 1.3274004890824953, + "learning_rate": 9.357956994116741e-06, + "loss": 0.3822, + "step": 6489 + }, + { + "epoch": 0.19, + "grad_norm": 1.3842742385666125, + "learning_rate": 9.357726708677867e-06, + "loss": 0.3584, + "step": 6490 + }, + { + "epoch": 0.19, + "grad_norm": 1.4176848038699161, + "learning_rate": 9.357496384782008e-06, + "loss": 0.3743, + "step": 6491 + }, + { + "epoch": 0.19, + "grad_norm": 1.5654132726525352, + "learning_rate": 9.357266022431192e-06, + "loss": 0.3924, + "step": 6492 + }, + { + "epoch": 0.19, + "grad_norm": 1.448361851831288, + "learning_rate": 9.357035621627459e-06, + "loss": 0.3726, + "step": 6493 + }, + { + "epoch": 0.19, + "grad_norm": 1.8130908870720717, + "learning_rate": 9.356805182372836e-06, + "loss": 0.3533, + "step": 6494 + }, + { + "epoch": 0.19, + "grad_norm": 1.438719579927028, + "learning_rate": 9.356574704669359e-06, + "loss": 0.3529, + "step": 6495 + }, + { + "epoch": 0.19, + "grad_norm": 1.4168573064472358, + "learning_rate": 9.356344188519062e-06, + "loss": 0.3457, + "step": 6496 + }, + { + "epoch": 0.19, + "grad_norm": 3.3671673301937983, + "learning_rate": 9.356113633923978e-06, + "loss": 0.404, + "step": 6497 + }, + { + "epoch": 0.19, + "grad_norm": 1.5438951051249503, + "learning_rate": 9.355883040886145e-06, + "loss": 0.3706, + "step": 6498 + }, + { + "epoch": 0.19, + "grad_norm": 1.4449656237113109, + "learning_rate": 9.355652409407594e-06, + "loss": 0.3916, + "step": 6499 + }, + { + "epoch": 0.19, + "grad_norm": 1.4060404852276487, + "learning_rate": 9.355421739490364e-06, + "loss": 0.3728, + "step": 6500 + }, + { + "epoch": 0.19, + "grad_norm": 1.5176074678034002, + "learning_rate": 9.355191031136487e-06, + "loss": 0.3856, + "step": 6501 + }, + { + "epoch": 0.19, + "grad_norm": 2.062864450950193, + "learning_rate": 9.354960284348003e-06, + "loss": 0.392, + "step": 6502 + }, + { + "epoch": 0.19, + "grad_norm": 1.4071550252710319, + "learning_rate": 9.354729499126944e-06, + "loss": 0.3745, + "step": 6503 + }, + { + "epoch": 0.19, + "grad_norm": 1.474428546194582, + "learning_rate": 9.35449867547535e-06, + "loss": 0.3711, + "step": 6504 + }, + { + "epoch": 0.19, + "grad_norm": 1.5654978069390935, + "learning_rate": 9.354267813395257e-06, + "loss": 0.3804, + "step": 6505 + }, + { + "epoch": 0.19, + "grad_norm": 1.8590271646660435, + "learning_rate": 9.354036912888703e-06, + "loss": 0.3721, + "step": 6506 + }, + { + "epoch": 0.19, + "grad_norm": 1.5119495956145927, + "learning_rate": 9.353805973957721e-06, + "loss": 0.4037, + "step": 6507 + }, + { + "epoch": 0.19, + "grad_norm": 1.4842334613311938, + "learning_rate": 9.353574996604358e-06, + "loss": 0.3858, + "step": 6508 + }, + { + "epoch": 0.19, + "grad_norm": 1.4434278547830488, + "learning_rate": 9.353343980830644e-06, + "loss": 0.4209, + "step": 6509 + }, + { + "epoch": 0.19, + "grad_norm": 2.4740677738253907, + "learning_rate": 9.353112926638623e-06, + "loss": 0.3619, + "step": 6510 + }, + { + "epoch": 0.19, + "grad_norm": 1.6436135702365593, + "learning_rate": 9.35288183403033e-06, + "loss": 0.3757, + "step": 6511 + }, + { + "epoch": 0.19, + "grad_norm": 1.6586537455576766, + "learning_rate": 9.352650703007808e-06, + "loss": 0.4023, + "step": 6512 + }, + { + "epoch": 0.19, + "grad_norm": 1.4227200216084417, + "learning_rate": 9.352419533573097e-06, + "loss": 0.3788, + "step": 6513 + }, + { + "epoch": 0.19, + "grad_norm": 1.3574333245699715, + "learning_rate": 9.352188325728233e-06, + "loss": 0.3777, + "step": 6514 + }, + { + "epoch": 0.19, + "grad_norm": 1.9886484854461632, + "learning_rate": 9.351957079475259e-06, + "loss": 0.3741, + "step": 6515 + }, + { + "epoch": 0.19, + "grad_norm": 1.348758074143643, + "learning_rate": 9.351725794816216e-06, + "loss": 0.3838, + "step": 6516 + }, + { + "epoch": 0.19, + "grad_norm": 1.5410343602401881, + "learning_rate": 9.351494471753144e-06, + "loss": 0.3649, + "step": 6517 + }, + { + "epoch": 0.19, + "grad_norm": 2.857369109717055, + "learning_rate": 9.351263110288084e-06, + "loss": 0.4067, + "step": 6518 + }, + { + "epoch": 0.19, + "grad_norm": 1.761090831210116, + "learning_rate": 9.351031710423081e-06, + "loss": 0.3807, + "step": 6519 + }, + { + "epoch": 0.19, + "grad_norm": 1.7370559784658057, + "learning_rate": 9.350800272160174e-06, + "loss": 0.3872, + "step": 6520 + }, + { + "epoch": 0.19, + "grad_norm": 1.5078371899248597, + "learning_rate": 9.350568795501407e-06, + "loss": 0.3618, + "step": 6521 + }, + { + "epoch": 0.19, + "grad_norm": 1.3500308885297634, + "learning_rate": 9.35033728044882e-06, + "loss": 0.3703, + "step": 6522 + }, + { + "epoch": 0.19, + "grad_norm": 1.7707700848741164, + "learning_rate": 9.350105727004461e-06, + "loss": 0.3478, + "step": 6523 + }, + { + "epoch": 0.19, + "grad_norm": 0.9726728378865773, + "learning_rate": 9.349874135170369e-06, + "loss": 0.6114, + "step": 6524 + }, + { + "epoch": 0.19, + "grad_norm": 1.528145624457397, + "learning_rate": 9.349642504948589e-06, + "loss": 0.3608, + "step": 6525 + }, + { + "epoch": 0.19, + "grad_norm": 1.5068885688652498, + "learning_rate": 9.349410836341167e-06, + "loss": 0.3789, + "step": 6526 + }, + { + "epoch": 0.19, + "grad_norm": 1.5710282440135974, + "learning_rate": 9.349179129350145e-06, + "loss": 0.3823, + "step": 6527 + }, + { + "epoch": 0.19, + "grad_norm": 1.528081843271436, + "learning_rate": 9.348947383977572e-06, + "loss": 0.373, + "step": 6528 + }, + { + "epoch": 0.19, + "grad_norm": 2.424095109071285, + "learning_rate": 9.348715600225487e-06, + "loss": 0.4261, + "step": 6529 + }, + { + "epoch": 0.19, + "grad_norm": 1.5853528075212826, + "learning_rate": 9.348483778095938e-06, + "loss": 0.3791, + "step": 6530 + }, + { + "epoch": 0.19, + "grad_norm": 1.5263426882874072, + "learning_rate": 9.348251917590974e-06, + "loss": 0.3983, + "step": 6531 + }, + { + "epoch": 0.19, + "grad_norm": 1.8541359286584096, + "learning_rate": 9.348020018712636e-06, + "loss": 0.3619, + "step": 6532 + }, + { + "epoch": 0.19, + "grad_norm": 1.3641093840815337, + "learning_rate": 9.347788081462975e-06, + "loss": 0.3662, + "step": 6533 + }, + { + "epoch": 0.19, + "grad_norm": 1.5996041721109122, + "learning_rate": 9.347556105844036e-06, + "loss": 0.3978, + "step": 6534 + }, + { + "epoch": 0.19, + "grad_norm": 4.41943719552812, + "learning_rate": 9.347324091857865e-06, + "loss": 0.4019, + "step": 6535 + }, + { + "epoch": 0.19, + "grad_norm": 1.7653405016629051, + "learning_rate": 9.347092039506512e-06, + "loss": 0.3573, + "step": 6536 + }, + { + "epoch": 0.19, + "grad_norm": 1.4438136821544658, + "learning_rate": 9.346859948792023e-06, + "loss": 0.354, + "step": 6537 + }, + { + "epoch": 0.19, + "grad_norm": 1.5303845342773426, + "learning_rate": 9.346627819716445e-06, + "loss": 0.4006, + "step": 6538 + }, + { + "epoch": 0.19, + "grad_norm": 1.5653674931063104, + "learning_rate": 9.34639565228183e-06, + "loss": 0.3599, + "step": 6539 + }, + { + "epoch": 0.19, + "grad_norm": 1.4426754577861867, + "learning_rate": 9.346163446490224e-06, + "loss": 0.3724, + "step": 6540 + }, + { + "epoch": 0.19, + "grad_norm": 1.4776398628783853, + "learning_rate": 9.345931202343679e-06, + "loss": 0.3901, + "step": 6541 + }, + { + "epoch": 0.19, + "grad_norm": 1.5330718161459551, + "learning_rate": 9.345698919844242e-06, + "loss": 0.3606, + "step": 6542 + }, + { + "epoch": 0.19, + "grad_norm": 1.4820004099406963, + "learning_rate": 9.345466598993964e-06, + "loss": 0.3758, + "step": 6543 + }, + { + "epoch": 0.19, + "grad_norm": 1.461763849761568, + "learning_rate": 9.345234239794895e-06, + "loss": 0.3725, + "step": 6544 + }, + { + "epoch": 0.19, + "grad_norm": 1.4976308490026273, + "learning_rate": 9.345001842249085e-06, + "loss": 0.4149, + "step": 6545 + }, + { + "epoch": 0.19, + "grad_norm": 1.9743557455609626, + "learning_rate": 9.344769406358587e-06, + "loss": 0.4338, + "step": 6546 + }, + { + "epoch": 0.19, + "grad_norm": 1.5458160307166486, + "learning_rate": 9.34453693212545e-06, + "loss": 0.3815, + "step": 6547 + }, + { + "epoch": 0.19, + "grad_norm": 1.0212873423436875, + "learning_rate": 9.344304419551725e-06, + "loss": 0.5783, + "step": 6548 + }, + { + "epoch": 0.19, + "grad_norm": 1.5551504208966287, + "learning_rate": 9.344071868639467e-06, + "loss": 0.3791, + "step": 6549 + }, + { + "epoch": 0.19, + "grad_norm": 1.47507097281657, + "learning_rate": 9.343839279390726e-06, + "loss": 0.3622, + "step": 6550 + }, + { + "epoch": 0.19, + "grad_norm": 3.5780335703532353, + "learning_rate": 9.343606651807555e-06, + "loss": 0.3939, + "step": 6551 + }, + { + "epoch": 0.19, + "grad_norm": 2.0390706092380566, + "learning_rate": 9.343373985892009e-06, + "loss": 0.3829, + "step": 6552 + }, + { + "epoch": 0.19, + "grad_norm": 1.5265141267924207, + "learning_rate": 9.343141281646137e-06, + "loss": 0.3872, + "step": 6553 + }, + { + "epoch": 0.19, + "grad_norm": 1.4047766698898212, + "learning_rate": 9.342908539071995e-06, + "loss": 0.4313, + "step": 6554 + }, + { + "epoch": 0.19, + "grad_norm": 1.419291067680888, + "learning_rate": 9.342675758171638e-06, + "loss": 0.4131, + "step": 6555 + }, + { + "epoch": 0.19, + "grad_norm": 1.402831686217155, + "learning_rate": 9.34244293894712e-06, + "loss": 0.4059, + "step": 6556 + }, + { + "epoch": 0.19, + "grad_norm": 1.3426918464140314, + "learning_rate": 9.342210081400494e-06, + "loss": 0.3894, + "step": 6557 + }, + { + "epoch": 0.19, + "grad_norm": 1.4555051898866989, + "learning_rate": 9.341977185533816e-06, + "loss": 0.392, + "step": 6558 + }, + { + "epoch": 0.19, + "grad_norm": 1.6481515419178003, + "learning_rate": 9.34174425134914e-06, + "loss": 0.3867, + "step": 6559 + }, + { + "epoch": 0.19, + "grad_norm": 1.3665123933890801, + "learning_rate": 9.341511278848525e-06, + "loss": 0.4055, + "step": 6560 + }, + { + "epoch": 0.19, + "grad_norm": 1.732546994636401, + "learning_rate": 9.341278268034025e-06, + "loss": 0.3833, + "step": 6561 + }, + { + "epoch": 0.19, + "grad_norm": 1.428652262355988, + "learning_rate": 9.341045218907694e-06, + "loss": 0.3837, + "step": 6562 + }, + { + "epoch": 0.19, + "grad_norm": 1.611320090597992, + "learning_rate": 9.34081213147159e-06, + "loss": 0.3877, + "step": 6563 + }, + { + "epoch": 0.19, + "grad_norm": 1.4543223964309595, + "learning_rate": 9.340579005727772e-06, + "loss": 0.3785, + "step": 6564 + }, + { + "epoch": 0.19, + "grad_norm": 1.4562333198140287, + "learning_rate": 9.340345841678297e-06, + "loss": 0.3651, + "step": 6565 + }, + { + "epoch": 0.19, + "grad_norm": 1.6280049778882115, + "learning_rate": 9.340112639325222e-06, + "loss": 0.4023, + "step": 6566 + }, + { + "epoch": 0.19, + "grad_norm": 1.5282591431977186, + "learning_rate": 9.339879398670605e-06, + "loss": 0.3798, + "step": 6567 + }, + { + "epoch": 0.19, + "grad_norm": 1.5315365977229018, + "learning_rate": 9.339646119716503e-06, + "loss": 0.3742, + "step": 6568 + }, + { + "epoch": 0.19, + "grad_norm": 1.357470216567636, + "learning_rate": 9.339412802464976e-06, + "loss": 0.3678, + "step": 6569 + }, + { + "epoch": 0.19, + "grad_norm": 1.7658502596633419, + "learning_rate": 9.339179446918083e-06, + "loss": 0.3788, + "step": 6570 + }, + { + "epoch": 0.19, + "grad_norm": 1.5951559176749714, + "learning_rate": 9.338946053077882e-06, + "loss": 0.39, + "step": 6571 + }, + { + "epoch": 0.19, + "grad_norm": 1.6198193482237804, + "learning_rate": 9.338712620946435e-06, + "loss": 0.3962, + "step": 6572 + }, + { + "epoch": 0.19, + "grad_norm": 1.620325421315236, + "learning_rate": 9.338479150525801e-06, + "loss": 0.3701, + "step": 6573 + }, + { + "epoch": 0.19, + "grad_norm": 1.4544153447476016, + "learning_rate": 9.338245641818041e-06, + "loss": 0.3599, + "step": 6574 + }, + { + "epoch": 0.19, + "grad_norm": 1.458063433452463, + "learning_rate": 9.338012094825215e-06, + "loss": 0.4044, + "step": 6575 + }, + { + "epoch": 0.19, + "grad_norm": 1.8928503788578188, + "learning_rate": 9.337778509549384e-06, + "loss": 0.4016, + "step": 6576 + }, + { + "epoch": 0.19, + "grad_norm": 1.644645874122185, + "learning_rate": 9.33754488599261e-06, + "loss": 0.3626, + "step": 6577 + }, + { + "epoch": 0.19, + "grad_norm": 1.4068212978276076, + "learning_rate": 9.337311224156952e-06, + "loss": 0.3729, + "step": 6578 + }, + { + "epoch": 0.19, + "grad_norm": 1.4871398279452812, + "learning_rate": 9.337077524044476e-06, + "loss": 0.4062, + "step": 6579 + }, + { + "epoch": 0.19, + "grad_norm": 1.5359545517892386, + "learning_rate": 9.336843785657244e-06, + "loss": 0.3757, + "step": 6580 + }, + { + "epoch": 0.19, + "grad_norm": 1.7125362223445906, + "learning_rate": 9.336610008997315e-06, + "loss": 0.3807, + "step": 6581 + }, + { + "epoch": 0.19, + "grad_norm": 1.727744170511473, + "learning_rate": 9.336376194066756e-06, + "loss": 0.4142, + "step": 6582 + }, + { + "epoch": 0.19, + "grad_norm": 1.477829368639588, + "learning_rate": 9.336142340867629e-06, + "loss": 0.3874, + "step": 6583 + }, + { + "epoch": 0.19, + "grad_norm": 7.11208987848876, + "learning_rate": 9.335908449401995e-06, + "loss": 0.3951, + "step": 6584 + }, + { + "epoch": 0.19, + "grad_norm": 1.33202154034346, + "learning_rate": 9.335674519671925e-06, + "loss": 0.3874, + "step": 6585 + }, + { + "epoch": 0.19, + "grad_norm": 1.4554075929993557, + "learning_rate": 9.335440551679477e-06, + "loss": 0.3707, + "step": 6586 + }, + { + "epoch": 0.19, + "grad_norm": 3.3770930027404416, + "learning_rate": 9.335206545426719e-06, + "loss": 0.3689, + "step": 6587 + }, + { + "epoch": 0.19, + "grad_norm": 1.4999803488878127, + "learning_rate": 9.334972500915715e-06, + "loss": 0.3971, + "step": 6588 + }, + { + "epoch": 0.19, + "grad_norm": 1.593407641794209, + "learning_rate": 9.33473841814853e-06, + "loss": 0.3907, + "step": 6589 + }, + { + "epoch": 0.19, + "grad_norm": 2.1654574497980366, + "learning_rate": 9.334504297127231e-06, + "loss": 0.3734, + "step": 6590 + }, + { + "epoch": 0.19, + "grad_norm": 1.5907044564321684, + "learning_rate": 9.334270137853883e-06, + "loss": 0.3698, + "step": 6591 + }, + { + "epoch": 0.19, + "grad_norm": 1.5455944304081954, + "learning_rate": 9.334035940330553e-06, + "loss": 0.383, + "step": 6592 + }, + { + "epoch": 0.19, + "grad_norm": 1.4100875607805146, + "learning_rate": 9.333801704559307e-06, + "loss": 0.4074, + "step": 6593 + }, + { + "epoch": 0.19, + "grad_norm": 2.157502267813318, + "learning_rate": 9.333567430542213e-06, + "loss": 0.4049, + "step": 6594 + }, + { + "epoch": 0.19, + "grad_norm": 1.3590788207805538, + "learning_rate": 9.33333311828134e-06, + "loss": 0.3946, + "step": 6595 + }, + { + "epoch": 0.19, + "grad_norm": 1.424121066252056, + "learning_rate": 9.333098767778753e-06, + "loss": 0.4239, + "step": 6596 + }, + { + "epoch": 0.19, + "grad_norm": 1.4061755856851883, + "learning_rate": 9.332864379036522e-06, + "loss": 0.3776, + "step": 6597 + }, + { + "epoch": 0.19, + "grad_norm": 1.606762410137221, + "learning_rate": 9.332629952056714e-06, + "loss": 0.3892, + "step": 6598 + }, + { + "epoch": 0.19, + "grad_norm": 2.3954856525884187, + "learning_rate": 9.332395486841399e-06, + "loss": 0.4069, + "step": 6599 + }, + { + "epoch": 0.19, + "grad_norm": 1.5329656612189753, + "learning_rate": 9.332160983392645e-06, + "loss": 0.3811, + "step": 6600 + }, + { + "epoch": 0.19, + "grad_norm": 3.1921188501737223, + "learning_rate": 9.331926441712522e-06, + "loss": 0.3825, + "step": 6601 + }, + { + "epoch": 0.19, + "grad_norm": 1.6630329436247635, + "learning_rate": 9.331691861803102e-06, + "loss": 0.414, + "step": 6602 + }, + { + "epoch": 0.19, + "grad_norm": 1.4670698159817426, + "learning_rate": 9.331457243666454e-06, + "loss": 0.3848, + "step": 6603 + }, + { + "epoch": 0.19, + "grad_norm": 1.5722359244950712, + "learning_rate": 9.331222587304644e-06, + "loss": 0.3714, + "step": 6604 + }, + { + "epoch": 0.19, + "grad_norm": 1.698065754081516, + "learning_rate": 9.33098789271975e-06, + "loss": 0.3842, + "step": 6605 + }, + { + "epoch": 0.19, + "grad_norm": 1.5625317218698922, + "learning_rate": 9.330753159913837e-06, + "loss": 0.3552, + "step": 6606 + }, + { + "epoch": 0.19, + "grad_norm": 1.8680796846105323, + "learning_rate": 9.330518388888981e-06, + "loss": 0.4026, + "step": 6607 + }, + { + "epoch": 0.19, + "grad_norm": 2.2131761285825933, + "learning_rate": 9.330283579647253e-06, + "loss": 0.3833, + "step": 6608 + }, + { + "epoch": 0.19, + "grad_norm": 1.468406975607408, + "learning_rate": 9.330048732190723e-06, + "loss": 0.3768, + "step": 6609 + }, + { + "epoch": 0.19, + "grad_norm": 1.6537024064942336, + "learning_rate": 9.329813846521465e-06, + "loss": 0.3575, + "step": 6610 + }, + { + "epoch": 0.19, + "grad_norm": 1.41721633624557, + "learning_rate": 9.329578922641552e-06, + "loss": 0.3744, + "step": 6611 + }, + { + "epoch": 0.19, + "grad_norm": 1.507499034432479, + "learning_rate": 9.329343960553057e-06, + "loss": 0.3671, + "step": 6612 + }, + { + "epoch": 0.19, + "grad_norm": 1.4868256832198952, + "learning_rate": 9.329108960258054e-06, + "loss": 0.3878, + "step": 6613 + }, + { + "epoch": 0.19, + "grad_norm": 1.7098343508647535, + "learning_rate": 9.328873921758617e-06, + "loss": 0.3869, + "step": 6614 + }, + { + "epoch": 0.19, + "grad_norm": 2.1299388880124996, + "learning_rate": 9.328638845056817e-06, + "loss": 0.3709, + "step": 6615 + }, + { + "epoch": 0.19, + "grad_norm": 1.5949842497994189, + "learning_rate": 9.328403730154734e-06, + "loss": 0.3929, + "step": 6616 + }, + { + "epoch": 0.19, + "grad_norm": 1.5697499456210546, + "learning_rate": 9.328168577054439e-06, + "loss": 0.3826, + "step": 6617 + }, + { + "epoch": 0.19, + "grad_norm": 1.377513960358907, + "learning_rate": 9.327933385758008e-06, + "loss": 0.3734, + "step": 6618 + }, + { + "epoch": 0.19, + "grad_norm": 1.2534044781873135, + "learning_rate": 9.327698156267516e-06, + "loss": 0.3357, + "step": 6619 + }, + { + "epoch": 0.19, + "grad_norm": 1.5122678736315294, + "learning_rate": 9.327462888585042e-06, + "loss": 0.3721, + "step": 6620 + }, + { + "epoch": 0.19, + "grad_norm": 1.4607785398730473, + "learning_rate": 9.327227582712657e-06, + "loss": 0.3938, + "step": 6621 + }, + { + "epoch": 0.19, + "grad_norm": 1.8565131493837432, + "learning_rate": 9.326992238652443e-06, + "loss": 0.3849, + "step": 6622 + }, + { + "epoch": 0.19, + "grad_norm": 1.48737747964642, + "learning_rate": 9.326756856406473e-06, + "loss": 0.3905, + "step": 6623 + }, + { + "epoch": 0.19, + "grad_norm": 1.622638640736252, + "learning_rate": 9.326521435976827e-06, + "loss": 0.393, + "step": 6624 + }, + { + "epoch": 0.19, + "grad_norm": 1.771520002160455, + "learning_rate": 9.326285977365581e-06, + "loss": 0.4339, + "step": 6625 + }, + { + "epoch": 0.19, + "grad_norm": 1.6268483675355778, + "learning_rate": 9.326050480574814e-06, + "loss": 0.384, + "step": 6626 + }, + { + "epoch": 0.19, + "grad_norm": 1.5998846918166942, + "learning_rate": 9.325814945606601e-06, + "loss": 0.3988, + "step": 6627 + }, + { + "epoch": 0.19, + "grad_norm": 1.4108129158871083, + "learning_rate": 9.325579372463025e-06, + "loss": 0.3587, + "step": 6628 + }, + { + "epoch": 0.19, + "grad_norm": 1.3843288187288376, + "learning_rate": 9.325343761146164e-06, + "loss": 0.3874, + "step": 6629 + }, + { + "epoch": 0.19, + "grad_norm": 1.7033755482926232, + "learning_rate": 9.325108111658094e-06, + "loss": 0.3524, + "step": 6630 + }, + { + "epoch": 0.19, + "grad_norm": 1.4963525677982135, + "learning_rate": 9.3248724240009e-06, + "loss": 0.3909, + "step": 6631 + }, + { + "epoch": 0.19, + "grad_norm": 2.282529613386422, + "learning_rate": 9.324636698176655e-06, + "loss": 0.3658, + "step": 6632 + }, + { + "epoch": 0.19, + "grad_norm": 1.5582807535145402, + "learning_rate": 9.324400934187445e-06, + "loss": 0.4003, + "step": 6633 + }, + { + "epoch": 0.19, + "grad_norm": 1.4925760291306982, + "learning_rate": 9.32416513203535e-06, + "loss": 0.4068, + "step": 6634 + }, + { + "epoch": 0.19, + "grad_norm": 1.3852282370480369, + "learning_rate": 9.323929291722448e-06, + "loss": 0.3947, + "step": 6635 + }, + { + "epoch": 0.19, + "grad_norm": 1.5923379582871382, + "learning_rate": 9.323693413250823e-06, + "loss": 0.4017, + "step": 6636 + }, + { + "epoch": 0.19, + "grad_norm": 1.3661830058286015, + "learning_rate": 9.323457496622555e-06, + "loss": 0.3922, + "step": 6637 + }, + { + "epoch": 0.19, + "grad_norm": 3.8661500951367787, + "learning_rate": 9.323221541839727e-06, + "loss": 0.3643, + "step": 6638 + }, + { + "epoch": 0.19, + "grad_norm": 2.19324710846936, + "learning_rate": 9.32298554890442e-06, + "loss": 0.3852, + "step": 6639 + }, + { + "epoch": 0.19, + "grad_norm": 1.689882198150117, + "learning_rate": 9.322749517818719e-06, + "loss": 0.3955, + "step": 6640 + }, + { + "epoch": 0.19, + "grad_norm": 1.464941327743589, + "learning_rate": 9.322513448584705e-06, + "loss": 0.3747, + "step": 6641 + }, + { + "epoch": 0.19, + "grad_norm": 1.4406735423313184, + "learning_rate": 9.322277341204462e-06, + "loss": 0.3666, + "step": 6642 + }, + { + "epoch": 0.19, + "grad_norm": 1.5418297839547017, + "learning_rate": 9.322041195680075e-06, + "loss": 0.3716, + "step": 6643 + }, + { + "epoch": 0.19, + "grad_norm": 1.9682077250229377, + "learning_rate": 9.321805012013622e-06, + "loss": 0.3858, + "step": 6644 + }, + { + "epoch": 0.19, + "grad_norm": 1.7846404029816996, + "learning_rate": 9.321568790207196e-06, + "loss": 0.3639, + "step": 6645 + }, + { + "epoch": 0.19, + "grad_norm": 2.2310324947548397, + "learning_rate": 9.321332530262877e-06, + "loss": 0.3716, + "step": 6646 + }, + { + "epoch": 0.19, + "grad_norm": 1.6331668061042024, + "learning_rate": 9.32109623218275e-06, + "loss": 0.4447, + "step": 6647 + }, + { + "epoch": 0.19, + "grad_norm": 1.75849236588562, + "learning_rate": 9.320859895968899e-06, + "loss": 0.4067, + "step": 6648 + }, + { + "epoch": 0.19, + "grad_norm": 1.6682914780941074, + "learning_rate": 9.320623521623412e-06, + "loss": 0.3609, + "step": 6649 + }, + { + "epoch": 0.19, + "grad_norm": 1.558471474566446, + "learning_rate": 9.320387109148377e-06, + "loss": 0.3835, + "step": 6650 + }, + { + "epoch": 0.19, + "grad_norm": 1.4976715053850478, + "learning_rate": 9.320150658545876e-06, + "loss": 0.4126, + "step": 6651 + }, + { + "epoch": 0.19, + "grad_norm": 1.8389849020669056, + "learning_rate": 9.319914169817999e-06, + "loss": 0.3972, + "step": 6652 + }, + { + "epoch": 0.19, + "grad_norm": 2.2794375976422745, + "learning_rate": 9.319677642966829e-06, + "loss": 0.3822, + "step": 6653 + }, + { + "epoch": 0.19, + "grad_norm": 1.5005018399095325, + "learning_rate": 9.319441077994458e-06, + "loss": 0.3835, + "step": 6654 + }, + { + "epoch": 0.19, + "grad_norm": 7.525484084708995, + "learning_rate": 9.31920447490297e-06, + "loss": 0.3711, + "step": 6655 + }, + { + "epoch": 0.19, + "grad_norm": 1.8944017643607671, + "learning_rate": 9.318967833694457e-06, + "loss": 0.3972, + "step": 6656 + }, + { + "epoch": 0.19, + "grad_norm": 1.4356215759791404, + "learning_rate": 9.318731154371002e-06, + "loss": 0.42, + "step": 6657 + }, + { + "epoch": 0.19, + "grad_norm": 1.4147650212265932, + "learning_rate": 9.318494436934698e-06, + "loss": 0.3759, + "step": 6658 + }, + { + "epoch": 0.19, + "grad_norm": 1.7272838404996627, + "learning_rate": 9.318257681387632e-06, + "loss": 0.3898, + "step": 6659 + }, + { + "epoch": 0.19, + "grad_norm": 1.493647850621412, + "learning_rate": 9.318020887731895e-06, + "loss": 0.3936, + "step": 6660 + }, + { + "epoch": 0.19, + "grad_norm": 1.3765026908493803, + "learning_rate": 9.317784055969576e-06, + "loss": 0.3781, + "step": 6661 + }, + { + "epoch": 0.19, + "grad_norm": 1.5248222719008606, + "learning_rate": 9.317547186102765e-06, + "loss": 0.3705, + "step": 6662 + }, + { + "epoch": 0.19, + "grad_norm": 1.622175681992782, + "learning_rate": 9.317310278133552e-06, + "loss": 0.3726, + "step": 6663 + }, + { + "epoch": 0.19, + "grad_norm": 1.8556070487309406, + "learning_rate": 9.317073332064026e-06, + "loss": 0.38, + "step": 6664 + }, + { + "epoch": 0.19, + "grad_norm": 1.5087620935585477, + "learning_rate": 9.316836347896281e-06, + "loss": 0.3869, + "step": 6665 + }, + { + "epoch": 0.19, + "grad_norm": 1.4338359475368971, + "learning_rate": 9.316599325632408e-06, + "loss": 0.3971, + "step": 6666 + }, + { + "epoch": 0.19, + "grad_norm": 1.5561847879859472, + "learning_rate": 9.316362265274499e-06, + "loss": 0.4058, + "step": 6667 + }, + { + "epoch": 0.19, + "grad_norm": 1.5260628840034527, + "learning_rate": 9.316125166824644e-06, + "loss": 0.3685, + "step": 6668 + }, + { + "epoch": 0.19, + "grad_norm": 1.364400387969098, + "learning_rate": 9.315888030284935e-06, + "loss": 0.3891, + "step": 6669 + }, + { + "epoch": 0.19, + "grad_norm": 1.4964521531391166, + "learning_rate": 9.315650855657468e-06, + "loss": 0.3523, + "step": 6670 + }, + { + "epoch": 0.19, + "grad_norm": 1.4892278326765789, + "learning_rate": 9.315413642944334e-06, + "loss": 0.3714, + "step": 6671 + }, + { + "epoch": 0.19, + "grad_norm": 1.4654080838623045, + "learning_rate": 9.315176392147626e-06, + "loss": 0.4126, + "step": 6672 + }, + { + "epoch": 0.19, + "grad_norm": 1.5267755025863816, + "learning_rate": 9.31493910326944e-06, + "loss": 0.3715, + "step": 6673 + }, + { + "epoch": 0.19, + "grad_norm": 1.0825163239839748, + "learning_rate": 9.314701776311868e-06, + "loss": 0.6299, + "step": 6674 + }, + { + "epoch": 0.19, + "grad_norm": 1.5869077907719167, + "learning_rate": 9.314464411277004e-06, + "loss": 0.3906, + "step": 6675 + }, + { + "epoch": 0.19, + "grad_norm": 1.6609538836625877, + "learning_rate": 9.314227008166945e-06, + "loss": 0.3788, + "step": 6676 + }, + { + "epoch": 0.19, + "grad_norm": 1.5578750901517224, + "learning_rate": 9.313989566983784e-06, + "loss": 0.3641, + "step": 6677 + }, + { + "epoch": 0.19, + "grad_norm": 1.4971726643614451, + "learning_rate": 9.313752087729617e-06, + "loss": 0.4042, + "step": 6678 + }, + { + "epoch": 0.19, + "grad_norm": 1.6235263953245291, + "learning_rate": 9.31351457040654e-06, + "loss": 0.3827, + "step": 6679 + }, + { + "epoch": 0.19, + "grad_norm": 1.4727621573182494, + "learning_rate": 9.31327701501665e-06, + "loss": 0.3742, + "step": 6680 + }, + { + "epoch": 0.19, + "grad_norm": 1.5485613534801541, + "learning_rate": 9.313039421562042e-06, + "loss": 0.3875, + "step": 6681 + }, + { + "epoch": 0.19, + "grad_norm": 1.656169919232705, + "learning_rate": 9.312801790044814e-06, + "loss": 0.4011, + "step": 6682 + }, + { + "epoch": 0.19, + "grad_norm": 1.6537103172199459, + "learning_rate": 9.312564120467061e-06, + "loss": 0.3864, + "step": 6683 + }, + { + "epoch": 0.19, + "grad_norm": 0.9655754615074321, + "learning_rate": 9.312326412830884e-06, + "loss": 0.5756, + "step": 6684 + }, + { + "epoch": 0.19, + "grad_norm": 1.8434108139400713, + "learning_rate": 9.312088667138375e-06, + "loss": 0.3995, + "step": 6685 + }, + { + "epoch": 0.19, + "grad_norm": 2.2311007013809676, + "learning_rate": 9.311850883391638e-06, + "loss": 0.3902, + "step": 6686 + }, + { + "epoch": 0.19, + "grad_norm": 1.939427914099453, + "learning_rate": 9.311613061592768e-06, + "loss": 0.3795, + "step": 6687 + }, + { + "epoch": 0.19, + "grad_norm": 1.5992769726755696, + "learning_rate": 9.311375201743867e-06, + "loss": 0.3703, + "step": 6688 + }, + { + "epoch": 0.19, + "grad_norm": 1.5699262258453632, + "learning_rate": 9.31113730384703e-06, + "loss": 0.365, + "step": 6689 + }, + { + "epoch": 0.19, + "grad_norm": 1.5626223463960323, + "learning_rate": 9.31089936790436e-06, + "loss": 0.3731, + "step": 6690 + }, + { + "epoch": 0.19, + "grad_norm": 1.585608791264811, + "learning_rate": 9.310661393917953e-06, + "loss": 0.3785, + "step": 6691 + }, + { + "epoch": 0.19, + "grad_norm": 1.3557194158677017, + "learning_rate": 9.310423381889913e-06, + "loss": 0.3651, + "step": 6692 + }, + { + "epoch": 0.19, + "grad_norm": 1.4498762698982728, + "learning_rate": 9.310185331822338e-06, + "loss": 0.4008, + "step": 6693 + }, + { + "epoch": 0.19, + "grad_norm": 1.3955924472489085, + "learning_rate": 9.30994724371733e-06, + "loss": 0.3606, + "step": 6694 + }, + { + "epoch": 0.19, + "grad_norm": 1.5237738831663998, + "learning_rate": 9.30970911757699e-06, + "loss": 0.3837, + "step": 6695 + }, + { + "epoch": 0.19, + "grad_norm": 1.5249027906494577, + "learning_rate": 9.30947095340342e-06, + "loss": 0.3696, + "step": 6696 + }, + { + "epoch": 0.19, + "grad_norm": 1.5348853993749778, + "learning_rate": 9.30923275119872e-06, + "loss": 0.3825, + "step": 6697 + }, + { + "epoch": 0.19, + "grad_norm": 1.6967191633070506, + "learning_rate": 9.308994510964994e-06, + "loss": 0.3748, + "step": 6698 + }, + { + "epoch": 0.19, + "grad_norm": 0.9780537288960107, + "learning_rate": 9.308756232704343e-06, + "loss": 0.6179, + "step": 6699 + }, + { + "epoch": 0.19, + "grad_norm": 1.6407306561464188, + "learning_rate": 9.308517916418871e-06, + "loss": 0.4156, + "step": 6700 + }, + { + "epoch": 0.19, + "grad_norm": 1.949527716680275, + "learning_rate": 9.308279562110679e-06, + "loss": 0.4145, + "step": 6701 + }, + { + "epoch": 0.19, + "grad_norm": 1.3939321390307087, + "learning_rate": 9.308041169781874e-06, + "loss": 0.3724, + "step": 6702 + }, + { + "epoch": 0.19, + "grad_norm": 1.5880673063162822, + "learning_rate": 9.307802739434557e-06, + "loss": 0.3778, + "step": 6703 + }, + { + "epoch": 0.19, + "grad_norm": 1.3271756369324361, + "learning_rate": 9.307564271070833e-06, + "loss": 0.3701, + "step": 6704 + }, + { + "epoch": 0.19, + "grad_norm": 1.4733690101914432, + "learning_rate": 9.307325764692807e-06, + "loss": 0.382, + "step": 6705 + }, + { + "epoch": 0.19, + "grad_norm": 1.5251002283081083, + "learning_rate": 9.307087220302582e-06, + "loss": 0.3781, + "step": 6706 + }, + { + "epoch": 0.19, + "grad_norm": 1.672774047808991, + "learning_rate": 9.306848637902266e-06, + "loss": 0.399, + "step": 6707 + }, + { + "epoch": 0.19, + "grad_norm": 1.6156615222244675, + "learning_rate": 9.306610017493962e-06, + "loss": 0.3979, + "step": 6708 + }, + { + "epoch": 0.19, + "grad_norm": 1.7423974953695975, + "learning_rate": 9.306371359079777e-06, + "loss": 0.4147, + "step": 6709 + }, + { + "epoch": 0.19, + "grad_norm": 1.551541439245796, + "learning_rate": 9.306132662661818e-06, + "loss": 0.384, + "step": 6710 + }, + { + "epoch": 0.19, + "grad_norm": 2.023536120996763, + "learning_rate": 9.30589392824219e-06, + "loss": 0.3774, + "step": 6711 + }, + { + "epoch": 0.19, + "grad_norm": 1.7561445509214297, + "learning_rate": 9.305655155823e-06, + "loss": 0.4279, + "step": 6712 + }, + { + "epoch": 0.19, + "grad_norm": 1.7635962614621654, + "learning_rate": 9.305416345406356e-06, + "loss": 0.3925, + "step": 6713 + }, + { + "epoch": 0.19, + "grad_norm": 1.6796659401481648, + "learning_rate": 9.305177496994363e-06, + "loss": 0.3819, + "step": 6714 + }, + { + "epoch": 0.19, + "grad_norm": 1.4761728340865068, + "learning_rate": 9.304938610589133e-06, + "loss": 0.3678, + "step": 6715 + }, + { + "epoch": 0.19, + "grad_norm": 1.4019016700732168, + "learning_rate": 9.304699686192771e-06, + "loss": 0.3768, + "step": 6716 + }, + { + "epoch": 0.19, + "grad_norm": 1.5234652432603235, + "learning_rate": 9.304460723807386e-06, + "loss": 0.3821, + "step": 6717 + }, + { + "epoch": 0.19, + "grad_norm": 1.5863705250079794, + "learning_rate": 9.30422172343509e-06, + "loss": 0.3936, + "step": 6718 + }, + { + "epoch": 0.19, + "grad_norm": 1.7293471346736349, + "learning_rate": 9.303982685077986e-06, + "loss": 0.3784, + "step": 6719 + }, + { + "epoch": 0.19, + "grad_norm": 1.4960815431683792, + "learning_rate": 9.30374360873819e-06, + "loss": 0.3566, + "step": 6720 + }, + { + "epoch": 0.19, + "grad_norm": 1.503676888224154, + "learning_rate": 9.303504494417808e-06, + "loss": 0.396, + "step": 6721 + }, + { + "epoch": 0.19, + "grad_norm": 1.732161185979919, + "learning_rate": 9.30326534211895e-06, + "loss": 0.4249, + "step": 6722 + }, + { + "epoch": 0.19, + "grad_norm": 1.6059849625245837, + "learning_rate": 9.303026151843729e-06, + "loss": 0.3733, + "step": 6723 + }, + { + "epoch": 0.2, + "grad_norm": 1.5442979404976027, + "learning_rate": 9.302786923594253e-06, + "loss": 0.3773, + "step": 6724 + }, + { + "epoch": 0.2, + "grad_norm": 1.8548304250294776, + "learning_rate": 9.302547657372636e-06, + "loss": 0.4065, + "step": 6725 + }, + { + "epoch": 0.2, + "grad_norm": 1.3999495561189141, + "learning_rate": 9.302308353180988e-06, + "loss": 0.3777, + "step": 6726 + }, + { + "epoch": 0.2, + "grad_norm": 1.4398464987416695, + "learning_rate": 9.30206901102142e-06, + "loss": 0.3896, + "step": 6727 + }, + { + "epoch": 0.2, + "grad_norm": 2.2404884280689528, + "learning_rate": 9.301829630896047e-06, + "loss": 0.3721, + "step": 6728 + }, + { + "epoch": 0.2, + "grad_norm": 1.5305652356008668, + "learning_rate": 9.301590212806979e-06, + "loss": 0.3999, + "step": 6729 + }, + { + "epoch": 0.2, + "grad_norm": 1.5073022597764638, + "learning_rate": 9.301350756756329e-06, + "loss": 0.3898, + "step": 6730 + }, + { + "epoch": 0.2, + "grad_norm": 1.4888897845844054, + "learning_rate": 9.301111262746212e-06, + "loss": 0.3834, + "step": 6731 + }, + { + "epoch": 0.2, + "grad_norm": 1.411526776053762, + "learning_rate": 9.300871730778739e-06, + "loss": 0.4044, + "step": 6732 + }, + { + "epoch": 0.2, + "grad_norm": 1.4190222185389822, + "learning_rate": 9.300632160856027e-06, + "loss": 0.3602, + "step": 6733 + }, + { + "epoch": 0.2, + "grad_norm": 2.1291690508449714, + "learning_rate": 9.300392552980186e-06, + "loss": 0.3632, + "step": 6734 + }, + { + "epoch": 0.2, + "grad_norm": 1.460916007346979, + "learning_rate": 9.300152907153335e-06, + "loss": 0.3687, + "step": 6735 + }, + { + "epoch": 0.2, + "grad_norm": 1.5207353583643184, + "learning_rate": 9.299913223377587e-06, + "loss": 0.3737, + "step": 6736 + }, + { + "epoch": 0.2, + "grad_norm": 0.9737453997115759, + "learning_rate": 9.299673501655056e-06, + "loss": 0.6249, + "step": 6737 + }, + { + "epoch": 0.2, + "grad_norm": 1.460970632326652, + "learning_rate": 9.29943374198786e-06, + "loss": 0.4002, + "step": 6738 + }, + { + "epoch": 0.2, + "grad_norm": 1.3405908069700938, + "learning_rate": 9.299193944378112e-06, + "loss": 0.3746, + "step": 6739 + }, + { + "epoch": 0.2, + "grad_norm": 1.701502740971858, + "learning_rate": 9.298954108827931e-06, + "loss": 0.4059, + "step": 6740 + }, + { + "epoch": 0.2, + "grad_norm": 1.4974624356373771, + "learning_rate": 9.29871423533943e-06, + "loss": 0.362, + "step": 6741 + }, + { + "epoch": 0.2, + "grad_norm": 1.37999759852841, + "learning_rate": 9.298474323914731e-06, + "loss": 0.377, + "step": 6742 + }, + { + "epoch": 0.2, + "grad_norm": 1.6547826989492165, + "learning_rate": 9.298234374555948e-06, + "loss": 0.4334, + "step": 6743 + }, + { + "epoch": 0.2, + "grad_norm": 1.397997250623214, + "learning_rate": 9.297994387265198e-06, + "loss": 0.3723, + "step": 6744 + }, + { + "epoch": 0.2, + "grad_norm": 1.4310889465859329, + "learning_rate": 9.2977543620446e-06, + "loss": 0.3626, + "step": 6745 + }, + { + "epoch": 0.2, + "grad_norm": 1.956165994949043, + "learning_rate": 9.297514298896272e-06, + "loss": 0.3979, + "step": 6746 + }, + { + "epoch": 0.2, + "grad_norm": 0.978125294998051, + "learning_rate": 9.297274197822333e-06, + "loss": 0.5933, + "step": 6747 + }, + { + "epoch": 0.2, + "grad_norm": 0.9591978963551683, + "learning_rate": 9.2970340588249e-06, + "loss": 0.6322, + "step": 6748 + }, + { + "epoch": 0.2, + "grad_norm": 1.671197014164038, + "learning_rate": 9.296793881906096e-06, + "loss": 0.3981, + "step": 6749 + }, + { + "epoch": 0.2, + "grad_norm": 0.9875295026869534, + "learning_rate": 9.296553667068038e-06, + "loss": 0.6429, + "step": 6750 + }, + { + "epoch": 0.2, + "grad_norm": 1.3808445982816941, + "learning_rate": 9.296313414312846e-06, + "loss": 0.36, + "step": 6751 + }, + { + "epoch": 0.2, + "grad_norm": 1.4593351183208905, + "learning_rate": 9.29607312364264e-06, + "loss": 0.3968, + "step": 6752 + }, + { + "epoch": 0.2, + "grad_norm": 1.5231454112990523, + "learning_rate": 9.29583279505954e-06, + "loss": 0.372, + "step": 6753 + }, + { + "epoch": 0.2, + "grad_norm": 1.4812565572653038, + "learning_rate": 9.29559242856567e-06, + "loss": 0.3763, + "step": 6754 + }, + { + "epoch": 0.2, + "grad_norm": 2.032344804132942, + "learning_rate": 9.295352024163149e-06, + "loss": 0.3932, + "step": 6755 + }, + { + "epoch": 0.2, + "grad_norm": 1.6513101299631252, + "learning_rate": 9.295111581854098e-06, + "loss": 0.3709, + "step": 6756 + }, + { + "epoch": 0.2, + "grad_norm": 1.5406652334236866, + "learning_rate": 9.294871101640639e-06, + "loss": 0.3934, + "step": 6757 + }, + { + "epoch": 0.2, + "grad_norm": 1.510572065039961, + "learning_rate": 9.294630583524895e-06, + "loss": 0.3691, + "step": 6758 + }, + { + "epoch": 0.2, + "grad_norm": 1.6590769070603506, + "learning_rate": 9.29439002750899e-06, + "loss": 0.3687, + "step": 6759 + }, + { + "epoch": 0.2, + "grad_norm": 1.4481960188026684, + "learning_rate": 9.294149433595045e-06, + "loss": 0.3858, + "step": 6760 + }, + { + "epoch": 0.2, + "grad_norm": 1.5126965665115315, + "learning_rate": 9.293908801785184e-06, + "loss": 0.3837, + "step": 6761 + }, + { + "epoch": 0.2, + "grad_norm": 1.349087477146877, + "learning_rate": 9.293668132081528e-06, + "loss": 0.3764, + "step": 6762 + }, + { + "epoch": 0.2, + "grad_norm": 1.4323413235935687, + "learning_rate": 9.293427424486203e-06, + "loss": 0.3626, + "step": 6763 + }, + { + "epoch": 0.2, + "grad_norm": 1.7893635684411384, + "learning_rate": 9.293186679001336e-06, + "loss": 0.3853, + "step": 6764 + }, + { + "epoch": 0.2, + "grad_norm": 1.6055511767602793, + "learning_rate": 9.292945895629047e-06, + "loss": 0.3915, + "step": 6765 + }, + { + "epoch": 0.2, + "grad_norm": 1.6948988404868741, + "learning_rate": 9.292705074371465e-06, + "loss": 0.3793, + "step": 6766 + }, + { + "epoch": 0.2, + "grad_norm": 1.6510548210415443, + "learning_rate": 9.29246421523071e-06, + "loss": 0.4168, + "step": 6767 + }, + { + "epoch": 0.2, + "grad_norm": 1.4659872552045599, + "learning_rate": 9.292223318208913e-06, + "loss": 0.4281, + "step": 6768 + }, + { + "epoch": 0.2, + "grad_norm": 1.3815240080303557, + "learning_rate": 9.291982383308195e-06, + "loss": 0.3865, + "step": 6769 + }, + { + "epoch": 0.2, + "grad_norm": 1.9348689651507818, + "learning_rate": 9.291741410530688e-06, + "loss": 0.3845, + "step": 6770 + }, + { + "epoch": 0.2, + "grad_norm": 1.470220006695454, + "learning_rate": 9.291500399878514e-06, + "loss": 0.3828, + "step": 6771 + }, + { + "epoch": 0.2, + "grad_norm": 2.4432860414142583, + "learning_rate": 9.291259351353801e-06, + "loss": 0.3763, + "step": 6772 + }, + { + "epoch": 0.2, + "grad_norm": 1.758471541840049, + "learning_rate": 9.291018264958678e-06, + "loss": 0.4141, + "step": 6773 + }, + { + "epoch": 0.2, + "grad_norm": 1.177570579641299, + "learning_rate": 9.290777140695269e-06, + "loss": 0.628, + "step": 6774 + }, + { + "epoch": 0.2, + "grad_norm": 2.0386907249440758, + "learning_rate": 9.290535978565706e-06, + "loss": 0.3982, + "step": 6775 + }, + { + "epoch": 0.2, + "grad_norm": 2.073717361152975, + "learning_rate": 9.290294778572114e-06, + "loss": 0.3497, + "step": 6776 + }, + { + "epoch": 0.2, + "grad_norm": 1.533583102924578, + "learning_rate": 9.290053540716624e-06, + "loss": 0.3626, + "step": 6777 + }, + { + "epoch": 0.2, + "grad_norm": 1.4408656955923016, + "learning_rate": 9.289812265001364e-06, + "loss": 0.4179, + "step": 6778 + }, + { + "epoch": 0.2, + "grad_norm": 1.368601829962562, + "learning_rate": 9.289570951428463e-06, + "loss": 0.3905, + "step": 6779 + }, + { + "epoch": 0.2, + "grad_norm": 1.3913546213883623, + "learning_rate": 9.289329600000049e-06, + "loss": 0.3909, + "step": 6780 + }, + { + "epoch": 0.2, + "grad_norm": 1.487326576471379, + "learning_rate": 9.289088210718256e-06, + "loss": 0.372, + "step": 6781 + }, + { + "epoch": 0.2, + "grad_norm": 1.47339717929625, + "learning_rate": 9.288846783585213e-06, + "loss": 0.3869, + "step": 6782 + }, + { + "epoch": 0.2, + "grad_norm": 1.4253014192526647, + "learning_rate": 9.288605318603047e-06, + "loss": 0.3696, + "step": 6783 + }, + { + "epoch": 0.2, + "grad_norm": 2.0439577319548508, + "learning_rate": 9.288363815773894e-06, + "loss": 0.367, + "step": 6784 + }, + { + "epoch": 0.2, + "grad_norm": 1.4519066920876174, + "learning_rate": 9.28812227509988e-06, + "loss": 0.3771, + "step": 6785 + }, + { + "epoch": 0.2, + "grad_norm": 1.508628465299942, + "learning_rate": 9.28788069658314e-06, + "loss": 0.3766, + "step": 6786 + }, + { + "epoch": 0.2, + "grad_norm": 1.450095678190024, + "learning_rate": 9.287639080225805e-06, + "loss": 0.3969, + "step": 6787 + }, + { + "epoch": 0.2, + "grad_norm": 1.5623961824742294, + "learning_rate": 9.287397426030009e-06, + "loss": 0.3533, + "step": 6788 + }, + { + "epoch": 0.2, + "grad_norm": 2.1696346801357294, + "learning_rate": 9.287155733997883e-06, + "loss": 0.3736, + "step": 6789 + }, + { + "epoch": 0.2, + "grad_norm": 1.9186922319181903, + "learning_rate": 9.286914004131558e-06, + "loss": 0.3833, + "step": 6790 + }, + { + "epoch": 0.2, + "grad_norm": 1.5947937382383426, + "learning_rate": 9.286672236433171e-06, + "loss": 0.4394, + "step": 6791 + }, + { + "epoch": 0.2, + "grad_norm": 1.6605896104170474, + "learning_rate": 9.286430430904854e-06, + "loss": 0.4321, + "step": 6792 + }, + { + "epoch": 0.2, + "grad_norm": 1.39314987975629, + "learning_rate": 9.28618858754874e-06, + "loss": 0.3736, + "step": 6793 + }, + { + "epoch": 0.2, + "grad_norm": 1.2696760833228293, + "learning_rate": 9.285946706366966e-06, + "loss": 0.3554, + "step": 6794 + }, + { + "epoch": 0.2, + "grad_norm": 2.102174492622109, + "learning_rate": 9.285704787361664e-06, + "loss": 0.3861, + "step": 6795 + }, + { + "epoch": 0.2, + "grad_norm": 1.4981198860678657, + "learning_rate": 9.285462830534969e-06, + "loss": 0.4018, + "step": 6796 + }, + { + "epoch": 0.2, + "grad_norm": 1.5251828900743236, + "learning_rate": 9.285220835889018e-06, + "loss": 0.3715, + "step": 6797 + }, + { + "epoch": 0.2, + "grad_norm": 1.4922347385105672, + "learning_rate": 9.284978803425944e-06, + "loss": 0.3587, + "step": 6798 + }, + { + "epoch": 0.2, + "grad_norm": 1.5019408161288588, + "learning_rate": 9.284736733147885e-06, + "loss": 0.3836, + "step": 6799 + }, + { + "epoch": 0.2, + "grad_norm": 1.6923631032027915, + "learning_rate": 9.284494625056978e-06, + "loss": 0.3673, + "step": 6800 + }, + { + "epoch": 0.2, + "grad_norm": 1.5604857943183776, + "learning_rate": 9.284252479155358e-06, + "loss": 0.3512, + "step": 6801 + }, + { + "epoch": 0.2, + "grad_norm": 1.58640063469759, + "learning_rate": 9.284010295445163e-06, + "loss": 0.3985, + "step": 6802 + }, + { + "epoch": 0.2, + "grad_norm": 1.500493358049995, + "learning_rate": 9.283768073928527e-06, + "loss": 0.3966, + "step": 6803 + }, + { + "epoch": 0.2, + "grad_norm": 2.355367996829947, + "learning_rate": 9.283525814607593e-06, + "loss": 0.4288, + "step": 6804 + }, + { + "epoch": 0.2, + "grad_norm": 1.9617089241919496, + "learning_rate": 9.283283517484494e-06, + "loss": 0.3727, + "step": 6805 + }, + { + "epoch": 0.2, + "grad_norm": 1.4642120796659808, + "learning_rate": 9.283041182561371e-06, + "loss": 0.3819, + "step": 6806 + }, + { + "epoch": 0.2, + "grad_norm": 1.17756739077189, + "learning_rate": 9.282798809840364e-06, + "loss": 0.6727, + "step": 6807 + }, + { + "epoch": 0.2, + "grad_norm": 1.3746689572260558, + "learning_rate": 9.282556399323608e-06, + "loss": 0.3623, + "step": 6808 + }, + { + "epoch": 0.2, + "grad_norm": 1.4978639492905876, + "learning_rate": 9.282313951013245e-06, + "loss": 0.3677, + "step": 6809 + }, + { + "epoch": 0.2, + "grad_norm": 1.4396290211377194, + "learning_rate": 9.282071464911414e-06, + "loss": 0.3802, + "step": 6810 + }, + { + "epoch": 0.2, + "grad_norm": 1.4229517113049097, + "learning_rate": 9.281828941020255e-06, + "loss": 0.3476, + "step": 6811 + }, + { + "epoch": 0.2, + "grad_norm": 1.3890298801511551, + "learning_rate": 9.281586379341907e-06, + "loss": 0.3916, + "step": 6812 + }, + { + "epoch": 0.2, + "grad_norm": 1.5168220694496934, + "learning_rate": 9.281343779878513e-06, + "loss": 0.3593, + "step": 6813 + }, + { + "epoch": 0.2, + "grad_norm": 1.4880653696305457, + "learning_rate": 9.281101142632211e-06, + "loss": 0.3682, + "step": 6814 + }, + { + "epoch": 0.2, + "grad_norm": 1.587574455406432, + "learning_rate": 9.280858467605145e-06, + "loss": 0.3665, + "step": 6815 + }, + { + "epoch": 0.2, + "grad_norm": 3.277271877957765, + "learning_rate": 9.280615754799455e-06, + "loss": 0.396, + "step": 6816 + }, + { + "epoch": 0.2, + "grad_norm": 1.4523072729238216, + "learning_rate": 9.280373004217285e-06, + "loss": 0.3628, + "step": 6817 + }, + { + "epoch": 0.2, + "grad_norm": 1.6883801552159663, + "learning_rate": 9.280130215860775e-06, + "loss": 0.3854, + "step": 6818 + }, + { + "epoch": 0.2, + "grad_norm": 2.619802546430154, + "learning_rate": 9.279887389732067e-06, + "loss": 0.3607, + "step": 6819 + }, + { + "epoch": 0.2, + "grad_norm": 1.4751407115216957, + "learning_rate": 9.279644525833307e-06, + "loss": 0.3409, + "step": 6820 + }, + { + "epoch": 0.2, + "grad_norm": 1.5243124334566625, + "learning_rate": 9.279401624166635e-06, + "loss": 0.3613, + "step": 6821 + }, + { + "epoch": 0.2, + "grad_norm": 1.6197547671833359, + "learning_rate": 9.279158684734197e-06, + "loss": 0.3673, + "step": 6822 + }, + { + "epoch": 0.2, + "grad_norm": 1.1178745312409124, + "learning_rate": 9.278915707538137e-06, + "loss": 0.5997, + "step": 6823 + }, + { + "epoch": 0.2, + "grad_norm": 1.5898585814080086, + "learning_rate": 9.278672692580597e-06, + "loss": 0.3842, + "step": 6824 + }, + { + "epoch": 0.2, + "grad_norm": 1.4135938981304368, + "learning_rate": 9.278429639863723e-06, + "loss": 0.3844, + "step": 6825 + }, + { + "epoch": 0.2, + "grad_norm": 1.6789231326576701, + "learning_rate": 9.278186549389661e-06, + "loss": 0.3727, + "step": 6826 + }, + { + "epoch": 0.2, + "grad_norm": 1.7604176558179663, + "learning_rate": 9.277943421160554e-06, + "loss": 0.4253, + "step": 6827 + }, + { + "epoch": 0.2, + "grad_norm": 1.412471212420851, + "learning_rate": 9.27770025517855e-06, + "loss": 0.3471, + "step": 6828 + }, + { + "epoch": 0.2, + "grad_norm": 2.1016534601010957, + "learning_rate": 9.277457051445794e-06, + "loss": 0.3968, + "step": 6829 + }, + { + "epoch": 0.2, + "grad_norm": 1.4308540808154213, + "learning_rate": 9.277213809964431e-06, + "loss": 0.3894, + "step": 6830 + }, + { + "epoch": 0.2, + "grad_norm": 1.388868359741515, + "learning_rate": 9.27697053073661e-06, + "loss": 0.3772, + "step": 6831 + }, + { + "epoch": 0.2, + "grad_norm": 2.534981336563927, + "learning_rate": 9.276727213764476e-06, + "loss": 0.3597, + "step": 6832 + }, + { + "epoch": 0.2, + "grad_norm": 1.5038529809874925, + "learning_rate": 9.276483859050177e-06, + "loss": 0.3493, + "step": 6833 + }, + { + "epoch": 0.2, + "grad_norm": 1.704078503313522, + "learning_rate": 9.27624046659586e-06, + "loss": 0.3986, + "step": 6834 + }, + { + "epoch": 0.2, + "grad_norm": 1.4782858930683345, + "learning_rate": 9.275997036403673e-06, + "loss": 0.3777, + "step": 6835 + }, + { + "epoch": 0.2, + "grad_norm": 1.4339785461987555, + "learning_rate": 9.275753568475764e-06, + "loss": 0.3883, + "step": 6836 + }, + { + "epoch": 0.2, + "grad_norm": 1.3941141670488923, + "learning_rate": 9.275510062814286e-06, + "loss": 0.3636, + "step": 6837 + }, + { + "epoch": 0.2, + "grad_norm": 1.4151924587330849, + "learning_rate": 9.275266519421382e-06, + "loss": 0.3399, + "step": 6838 + }, + { + "epoch": 0.2, + "grad_norm": 1.919035803769355, + "learning_rate": 9.275022938299203e-06, + "loss": 0.3717, + "step": 6839 + }, + { + "epoch": 0.2, + "grad_norm": 2.205425219638696, + "learning_rate": 9.2747793194499e-06, + "loss": 0.3945, + "step": 6840 + }, + { + "epoch": 0.2, + "grad_norm": 1.6222331202377118, + "learning_rate": 9.274535662875621e-06, + "loss": 0.3773, + "step": 6841 + }, + { + "epoch": 0.2, + "grad_norm": 1.6103402017965096, + "learning_rate": 9.27429196857852e-06, + "loss": 0.3887, + "step": 6842 + }, + { + "epoch": 0.2, + "grad_norm": 1.6712912941326672, + "learning_rate": 9.274048236560741e-06, + "loss": 0.3828, + "step": 6843 + }, + { + "epoch": 0.2, + "grad_norm": 1.4352621745060912, + "learning_rate": 9.273804466824442e-06, + "loss": 0.3813, + "step": 6844 + }, + { + "epoch": 0.2, + "grad_norm": 1.6334886141126501, + "learning_rate": 9.27356065937177e-06, + "loss": 0.4025, + "step": 6845 + }, + { + "epoch": 0.2, + "grad_norm": 1.8235318589455647, + "learning_rate": 9.273316814204878e-06, + "loss": 0.3615, + "step": 6846 + }, + { + "epoch": 0.2, + "grad_norm": 1.5006619524690197, + "learning_rate": 9.273072931325917e-06, + "loss": 0.3679, + "step": 6847 + }, + { + "epoch": 0.2, + "grad_norm": 1.4400367916764216, + "learning_rate": 9.272829010737042e-06, + "loss": 0.3742, + "step": 6848 + }, + { + "epoch": 0.2, + "grad_norm": 1.576634269198227, + "learning_rate": 9.272585052440402e-06, + "loss": 0.4137, + "step": 6849 + }, + { + "epoch": 0.2, + "grad_norm": 1.3443400291043526, + "learning_rate": 9.27234105643815e-06, + "loss": 0.3887, + "step": 6850 + }, + { + "epoch": 0.2, + "grad_norm": 1.6865021496131325, + "learning_rate": 9.272097022732444e-06, + "loss": 0.3708, + "step": 6851 + }, + { + "epoch": 0.2, + "grad_norm": 1.4131588680971843, + "learning_rate": 9.271852951325433e-06, + "loss": 0.3652, + "step": 6852 + }, + { + "epoch": 0.2, + "grad_norm": 1.4748990024237032, + "learning_rate": 9.271608842219272e-06, + "loss": 0.4044, + "step": 6853 + }, + { + "epoch": 0.2, + "grad_norm": 1.5220168034909662, + "learning_rate": 9.271364695416115e-06, + "loss": 0.3671, + "step": 6854 + }, + { + "epoch": 0.2, + "grad_norm": 1.3017682512088828, + "learning_rate": 9.271120510918119e-06, + "loss": 0.3571, + "step": 6855 + }, + { + "epoch": 0.2, + "grad_norm": 1.540351014717601, + "learning_rate": 9.270876288727435e-06, + "loss": 0.3749, + "step": 6856 + }, + { + "epoch": 0.2, + "grad_norm": 1.6359994899855455, + "learning_rate": 9.270632028846222e-06, + "loss": 0.3504, + "step": 6857 + }, + { + "epoch": 0.2, + "grad_norm": 2.193380222046512, + "learning_rate": 9.270387731276633e-06, + "loss": 0.3921, + "step": 6858 + }, + { + "epoch": 0.2, + "grad_norm": 1.4247402895476817, + "learning_rate": 9.270143396020826e-06, + "loss": 0.4064, + "step": 6859 + }, + { + "epoch": 0.2, + "grad_norm": 1.4067904450001771, + "learning_rate": 9.269899023080955e-06, + "loss": 0.3748, + "step": 6860 + }, + { + "epoch": 0.2, + "grad_norm": 1.1684502899381073, + "learning_rate": 9.269654612459178e-06, + "loss": 0.574, + "step": 6861 + }, + { + "epoch": 0.2, + "grad_norm": 1.3249382711702353, + "learning_rate": 9.269410164157652e-06, + "loss": 0.3787, + "step": 6862 + }, + { + "epoch": 0.2, + "grad_norm": 1.9723229962245006, + "learning_rate": 9.269165678178534e-06, + "loss": 0.3988, + "step": 6863 + }, + { + "epoch": 0.2, + "grad_norm": 2.673030033886343, + "learning_rate": 9.268921154523983e-06, + "loss": 0.362, + "step": 6864 + }, + { + "epoch": 0.2, + "grad_norm": 1.7478680833601696, + "learning_rate": 9.268676593196152e-06, + "loss": 0.3905, + "step": 6865 + }, + { + "epoch": 0.2, + "grad_norm": 1.65199426622166, + "learning_rate": 9.268431994197204e-06, + "loss": 0.3842, + "step": 6866 + }, + { + "epoch": 0.2, + "grad_norm": 1.7500144912253972, + "learning_rate": 9.268187357529297e-06, + "loss": 0.3577, + "step": 6867 + }, + { + "epoch": 0.2, + "grad_norm": 1.4587557495877774, + "learning_rate": 9.26794268319459e-06, + "loss": 0.3705, + "step": 6868 + }, + { + "epoch": 0.2, + "grad_norm": 1.4359299567181134, + "learning_rate": 9.267697971195239e-06, + "loss": 0.3894, + "step": 6869 + }, + { + "epoch": 0.2, + "grad_norm": 1.5386932158238609, + "learning_rate": 9.267453221533409e-06, + "loss": 0.3959, + "step": 6870 + }, + { + "epoch": 0.2, + "grad_norm": 1.585208097989137, + "learning_rate": 9.267208434211256e-06, + "loss": 0.3874, + "step": 6871 + }, + { + "epoch": 0.2, + "grad_norm": 1.4061611588557703, + "learning_rate": 9.26696360923094e-06, + "loss": 0.3538, + "step": 6872 + }, + { + "epoch": 0.2, + "grad_norm": 1.4763855925110196, + "learning_rate": 9.266718746594625e-06, + "loss": 0.3782, + "step": 6873 + }, + { + "epoch": 0.2, + "grad_norm": 1.5075597027903471, + "learning_rate": 9.266473846304466e-06, + "loss": 0.3915, + "step": 6874 + }, + { + "epoch": 0.2, + "grad_norm": 2.353508144992176, + "learning_rate": 9.266228908362631e-06, + "loss": 0.3513, + "step": 6875 + }, + { + "epoch": 0.2, + "grad_norm": 5.381043702269806, + "learning_rate": 9.265983932771278e-06, + "loss": 0.3597, + "step": 6876 + }, + { + "epoch": 0.2, + "grad_norm": 2.2916298049270263, + "learning_rate": 9.26573891953257e-06, + "loss": 0.3497, + "step": 6877 + }, + { + "epoch": 0.2, + "grad_norm": 1.8820119371886839, + "learning_rate": 9.265493868648668e-06, + "loss": 0.3914, + "step": 6878 + }, + { + "epoch": 0.2, + "grad_norm": 1.5731184698097713, + "learning_rate": 9.265248780121736e-06, + "loss": 0.3998, + "step": 6879 + }, + { + "epoch": 0.2, + "grad_norm": 1.4595827995813437, + "learning_rate": 9.265003653953935e-06, + "loss": 0.3636, + "step": 6880 + }, + { + "epoch": 0.2, + "grad_norm": 1.4294735826692606, + "learning_rate": 9.26475849014743e-06, + "loss": 0.3667, + "step": 6881 + }, + { + "epoch": 0.2, + "grad_norm": 1.5110334594430073, + "learning_rate": 9.264513288704385e-06, + "loss": 0.3517, + "step": 6882 + }, + { + "epoch": 0.2, + "grad_norm": 2.437043196652584, + "learning_rate": 9.26426804962696e-06, + "loss": 0.3986, + "step": 6883 + }, + { + "epoch": 0.2, + "grad_norm": 1.761467002280804, + "learning_rate": 9.264022772917325e-06, + "loss": 0.4093, + "step": 6884 + }, + { + "epoch": 0.2, + "grad_norm": 1.9740573098699579, + "learning_rate": 9.26377745857764e-06, + "loss": 0.3997, + "step": 6885 + }, + { + "epoch": 0.2, + "grad_norm": 1.349039150108259, + "learning_rate": 9.263532106610072e-06, + "loss": 0.362, + "step": 6886 + }, + { + "epoch": 0.2, + "grad_norm": 1.4130921714796612, + "learning_rate": 9.263286717016788e-06, + "loss": 0.3583, + "step": 6887 + }, + { + "epoch": 0.2, + "grad_norm": 1.7370001600520901, + "learning_rate": 9.26304128979995e-06, + "loss": 0.4034, + "step": 6888 + }, + { + "epoch": 0.2, + "grad_norm": 1.7071881300487977, + "learning_rate": 9.262795824961723e-06, + "loss": 0.4112, + "step": 6889 + }, + { + "epoch": 0.2, + "grad_norm": 1.6301593467507118, + "learning_rate": 9.262550322504278e-06, + "loss": 0.3668, + "step": 6890 + }, + { + "epoch": 0.2, + "grad_norm": 1.7055079579121832, + "learning_rate": 9.262304782429779e-06, + "loss": 0.3636, + "step": 6891 + }, + { + "epoch": 0.2, + "grad_norm": 1.7549945279716803, + "learning_rate": 9.262059204740393e-06, + "loss": 0.4011, + "step": 6892 + }, + { + "epoch": 0.2, + "grad_norm": 2.005875842659025, + "learning_rate": 9.261813589438287e-06, + "loss": 0.3467, + "step": 6893 + }, + { + "epoch": 0.2, + "grad_norm": 1.9708759748345208, + "learning_rate": 9.261567936525628e-06, + "loss": 0.4408, + "step": 6894 + }, + { + "epoch": 0.2, + "grad_norm": 1.4925375428373713, + "learning_rate": 9.261322246004586e-06, + "loss": 0.3933, + "step": 6895 + }, + { + "epoch": 0.2, + "grad_norm": 1.4821207333755124, + "learning_rate": 9.261076517877327e-06, + "loss": 0.36, + "step": 6896 + }, + { + "epoch": 0.2, + "grad_norm": 3.3456950578526943, + "learning_rate": 9.260830752146021e-06, + "loss": 0.3973, + "step": 6897 + }, + { + "epoch": 0.2, + "grad_norm": 1.503995301252874, + "learning_rate": 9.260584948812836e-06, + "loss": 0.3655, + "step": 6898 + }, + { + "epoch": 0.2, + "grad_norm": 1.5866239519194574, + "learning_rate": 9.260339107879942e-06, + "loss": 0.3656, + "step": 6899 + }, + { + "epoch": 0.2, + "grad_norm": 1.50033006602845, + "learning_rate": 9.260093229349507e-06, + "loss": 0.3872, + "step": 6900 + }, + { + "epoch": 0.2, + "grad_norm": 1.4058209623957612, + "learning_rate": 9.259847313223705e-06, + "loss": 0.342, + "step": 6901 + }, + { + "epoch": 0.2, + "grad_norm": 1.3226201855087982, + "learning_rate": 9.2596013595047e-06, + "loss": 0.374, + "step": 6902 + }, + { + "epoch": 0.2, + "grad_norm": 1.6811471223218943, + "learning_rate": 9.259355368194666e-06, + "loss": 0.3935, + "step": 6903 + }, + { + "epoch": 0.2, + "grad_norm": 1.713769860684354, + "learning_rate": 9.259109339295775e-06, + "loss": 0.3492, + "step": 6904 + }, + { + "epoch": 0.2, + "grad_norm": 1.6920816732944441, + "learning_rate": 9.258863272810197e-06, + "loss": 0.3812, + "step": 6905 + }, + { + "epoch": 0.2, + "grad_norm": 1.5828296855365596, + "learning_rate": 9.258617168740101e-06, + "loss": 0.3744, + "step": 6906 + }, + { + "epoch": 0.2, + "grad_norm": 1.5618480705504285, + "learning_rate": 9.258371027087663e-06, + "loss": 0.3906, + "step": 6907 + }, + { + "epoch": 0.2, + "grad_norm": 1.641029950050965, + "learning_rate": 9.258124847855055e-06, + "loss": 0.3591, + "step": 6908 + }, + { + "epoch": 0.2, + "grad_norm": 1.5109232643176373, + "learning_rate": 9.257878631044446e-06, + "loss": 0.382, + "step": 6909 + }, + { + "epoch": 0.2, + "grad_norm": 1.586766267514478, + "learning_rate": 9.25763237665801e-06, + "loss": 0.3736, + "step": 6910 + }, + { + "epoch": 0.2, + "grad_norm": 1.3764676222186485, + "learning_rate": 9.257386084697922e-06, + "loss": 0.3919, + "step": 6911 + }, + { + "epoch": 0.2, + "grad_norm": 1.4040791893473568, + "learning_rate": 9.257139755166356e-06, + "loss": 0.3979, + "step": 6912 + }, + { + "epoch": 0.2, + "grad_norm": 1.6036594711642633, + "learning_rate": 9.256893388065482e-06, + "loss": 0.4168, + "step": 6913 + }, + { + "epoch": 0.2, + "grad_norm": 1.5103916215908773, + "learning_rate": 9.256646983397479e-06, + "loss": 0.391, + "step": 6914 + }, + { + "epoch": 0.2, + "grad_norm": 1.315042239745074, + "learning_rate": 9.256400541164517e-06, + "loss": 0.3696, + "step": 6915 + }, + { + "epoch": 0.2, + "grad_norm": 1.881521071942437, + "learning_rate": 9.256154061368774e-06, + "loss": 0.3769, + "step": 6916 + }, + { + "epoch": 0.2, + "grad_norm": 1.3885021342280797, + "learning_rate": 9.255907544012424e-06, + "loss": 0.3865, + "step": 6917 + }, + { + "epoch": 0.2, + "grad_norm": 1.6521912289713123, + "learning_rate": 9.255660989097644e-06, + "loss": 0.4135, + "step": 6918 + }, + { + "epoch": 0.2, + "grad_norm": 2.0319663041259433, + "learning_rate": 9.255414396626608e-06, + "loss": 0.3811, + "step": 6919 + }, + { + "epoch": 0.2, + "grad_norm": 1.344539803416349, + "learning_rate": 9.255167766601491e-06, + "loss": 0.3668, + "step": 6920 + }, + { + "epoch": 0.2, + "grad_norm": 1.4802480072366395, + "learning_rate": 9.254921099024471e-06, + "loss": 0.3687, + "step": 6921 + }, + { + "epoch": 0.2, + "grad_norm": 1.540972722186747, + "learning_rate": 9.254674393897727e-06, + "loss": 0.3712, + "step": 6922 + }, + { + "epoch": 0.2, + "grad_norm": 1.4943805807263062, + "learning_rate": 9.254427651223434e-06, + "loss": 0.4108, + "step": 6923 + }, + { + "epoch": 0.2, + "grad_norm": 1.045599231105839, + "learning_rate": 9.25418087100377e-06, + "loss": 0.6277, + "step": 6924 + }, + { + "epoch": 0.2, + "grad_norm": 1.5059136906270298, + "learning_rate": 9.253934053240912e-06, + "loss": 0.3768, + "step": 6925 + }, + { + "epoch": 0.2, + "grad_norm": 1.3399674143535705, + "learning_rate": 9.25368719793704e-06, + "loss": 0.3871, + "step": 6926 + }, + { + "epoch": 0.2, + "grad_norm": 1.7105248049284294, + "learning_rate": 9.25344030509433e-06, + "loss": 0.4038, + "step": 6927 + }, + { + "epoch": 0.2, + "grad_norm": 1.656109041086713, + "learning_rate": 9.253193374714962e-06, + "loss": 0.4124, + "step": 6928 + }, + { + "epoch": 0.2, + "grad_norm": 1.9033287375183454, + "learning_rate": 9.252946406801115e-06, + "loss": 0.3976, + "step": 6929 + }, + { + "epoch": 0.2, + "grad_norm": 1.381956255087604, + "learning_rate": 9.25269940135497e-06, + "loss": 0.3733, + "step": 6930 + }, + { + "epoch": 0.2, + "grad_norm": 1.5048095102147792, + "learning_rate": 9.252452358378705e-06, + "loss": 0.3548, + "step": 6931 + }, + { + "epoch": 0.2, + "grad_norm": 1.4321230269384948, + "learning_rate": 9.252205277874501e-06, + "loss": 0.3568, + "step": 6932 + }, + { + "epoch": 0.2, + "grad_norm": 1.3824292753950278, + "learning_rate": 9.251958159844538e-06, + "loss": 0.3618, + "step": 6933 + }, + { + "epoch": 0.2, + "grad_norm": 1.78518302321767, + "learning_rate": 9.251711004290998e-06, + "loss": 0.3718, + "step": 6934 + }, + { + "epoch": 0.2, + "grad_norm": 1.4777567339186881, + "learning_rate": 9.251463811216059e-06, + "loss": 0.3519, + "step": 6935 + }, + { + "epoch": 0.2, + "grad_norm": 1.4144523961351618, + "learning_rate": 9.251216580621906e-06, + "loss": 0.35, + "step": 6936 + }, + { + "epoch": 0.2, + "grad_norm": 1.5907060104066693, + "learning_rate": 9.25096931251072e-06, + "loss": 0.3722, + "step": 6937 + }, + { + "epoch": 0.2, + "grad_norm": 1.4846725174017235, + "learning_rate": 9.250722006884683e-06, + "loss": 0.3908, + "step": 6938 + }, + { + "epoch": 0.2, + "grad_norm": 1.4125398374578066, + "learning_rate": 9.250474663745976e-06, + "loss": 0.3772, + "step": 6939 + }, + { + "epoch": 0.2, + "grad_norm": 1.6238622325207763, + "learning_rate": 9.250227283096784e-06, + "loss": 0.3947, + "step": 6940 + }, + { + "epoch": 0.2, + "grad_norm": 2.1724652870313412, + "learning_rate": 9.249979864939287e-06, + "loss": 0.3894, + "step": 6941 + }, + { + "epoch": 0.2, + "grad_norm": 1.5589133714808046, + "learning_rate": 9.249732409275673e-06, + "loss": 0.3783, + "step": 6942 + }, + { + "epoch": 0.2, + "grad_norm": 1.4541715849521446, + "learning_rate": 9.249484916108123e-06, + "loss": 0.3754, + "step": 6943 + }, + { + "epoch": 0.2, + "grad_norm": 1.7786431728641137, + "learning_rate": 9.24923738543882e-06, + "loss": 0.4075, + "step": 6944 + }, + { + "epoch": 0.2, + "grad_norm": 1.1210756525684584, + "learning_rate": 9.248989817269952e-06, + "loss": 0.6339, + "step": 6945 + }, + { + "epoch": 0.2, + "grad_norm": 1.5351414761488114, + "learning_rate": 9.248742211603699e-06, + "loss": 0.4274, + "step": 6946 + }, + { + "epoch": 0.2, + "grad_norm": 1.4384009752107236, + "learning_rate": 9.24849456844225e-06, + "loss": 0.3703, + "step": 6947 + }, + { + "epoch": 0.2, + "grad_norm": 1.5005646037866356, + "learning_rate": 9.24824688778779e-06, + "loss": 0.3685, + "step": 6948 + }, + { + "epoch": 0.2, + "grad_norm": 0.9884375090755987, + "learning_rate": 9.247999169642503e-06, + "loss": 0.6238, + "step": 6949 + }, + { + "epoch": 0.2, + "grad_norm": 1.4436838500818014, + "learning_rate": 9.247751414008577e-06, + "loss": 0.3812, + "step": 6950 + }, + { + "epoch": 0.2, + "grad_norm": 1.4236989937546223, + "learning_rate": 9.247503620888199e-06, + "loss": 0.3889, + "step": 6951 + }, + { + "epoch": 0.2, + "grad_norm": 1.7786038221887626, + "learning_rate": 9.247255790283551e-06, + "loss": 0.3839, + "step": 6952 + }, + { + "epoch": 0.2, + "grad_norm": 1.8039203175494363, + "learning_rate": 9.247007922196827e-06, + "loss": 0.41, + "step": 6953 + }, + { + "epoch": 0.2, + "grad_norm": 1.4820428281322213, + "learning_rate": 9.246760016630208e-06, + "loss": 0.3994, + "step": 6954 + }, + { + "epoch": 0.2, + "grad_norm": 1.7290126526332383, + "learning_rate": 9.246512073585887e-06, + "loss": 0.3927, + "step": 6955 + }, + { + "epoch": 0.2, + "grad_norm": 1.8419493847922543, + "learning_rate": 9.246264093066049e-06, + "loss": 0.3553, + "step": 6956 + }, + { + "epoch": 0.2, + "grad_norm": 1.5513862991983958, + "learning_rate": 9.246016075072884e-06, + "loss": 0.36, + "step": 6957 + }, + { + "epoch": 0.2, + "grad_norm": 1.4138714937291283, + "learning_rate": 9.245768019608579e-06, + "loss": 0.4243, + "step": 6958 + }, + { + "epoch": 0.2, + "grad_norm": 1.825083118503669, + "learning_rate": 9.245519926675324e-06, + "loss": 0.4167, + "step": 6959 + }, + { + "epoch": 0.2, + "grad_norm": 1.9439513078230903, + "learning_rate": 9.24527179627531e-06, + "loss": 0.4107, + "step": 6960 + }, + { + "epoch": 0.2, + "grad_norm": 3.8997679214611027, + "learning_rate": 9.245023628410723e-06, + "loss": 0.3797, + "step": 6961 + }, + { + "epoch": 0.2, + "grad_norm": 1.1194757452212591, + "learning_rate": 9.244775423083757e-06, + "loss": 0.6099, + "step": 6962 + }, + { + "epoch": 0.2, + "grad_norm": 1.8665217443171906, + "learning_rate": 9.2445271802966e-06, + "loss": 0.3704, + "step": 6963 + }, + { + "epoch": 0.2, + "grad_norm": 2.205540625620336, + "learning_rate": 9.244278900051444e-06, + "loss": 0.4133, + "step": 6964 + }, + { + "epoch": 0.2, + "grad_norm": 1.520582321965046, + "learning_rate": 9.244030582350481e-06, + "loss": 0.3709, + "step": 6965 + }, + { + "epoch": 0.2, + "grad_norm": 1.6809831894449545, + "learning_rate": 9.243782227195899e-06, + "loss": 0.3923, + "step": 6966 + }, + { + "epoch": 0.2, + "grad_norm": 1.9082286227029746, + "learning_rate": 9.243533834589893e-06, + "loss": 0.4067, + "step": 6967 + }, + { + "epoch": 0.2, + "grad_norm": 1.6372230143967132, + "learning_rate": 9.243285404534652e-06, + "loss": 0.366, + "step": 6968 + }, + { + "epoch": 0.2, + "grad_norm": 1.559643256699904, + "learning_rate": 9.243036937032373e-06, + "loss": 0.383, + "step": 6969 + }, + { + "epoch": 0.2, + "grad_norm": 1.6323104485318396, + "learning_rate": 9.242788432085243e-06, + "loss": 0.3879, + "step": 6970 + }, + { + "epoch": 0.2, + "grad_norm": 1.43657722826103, + "learning_rate": 9.24253988969546e-06, + "loss": 0.3548, + "step": 6971 + }, + { + "epoch": 0.2, + "grad_norm": 3.165909970908382, + "learning_rate": 9.242291309865216e-06, + "loss": 0.3911, + "step": 6972 + }, + { + "epoch": 0.2, + "grad_norm": 1.4413357158683993, + "learning_rate": 9.242042692596702e-06, + "loss": 0.3783, + "step": 6973 + }, + { + "epoch": 0.2, + "grad_norm": 1.7485922182952247, + "learning_rate": 9.241794037892116e-06, + "loss": 0.3813, + "step": 6974 + }, + { + "epoch": 0.2, + "grad_norm": 1.4219586268738063, + "learning_rate": 9.24154534575365e-06, + "loss": 0.3682, + "step": 6975 + }, + { + "epoch": 0.2, + "grad_norm": 1.391174788990699, + "learning_rate": 9.241296616183499e-06, + "loss": 0.3574, + "step": 6976 + }, + { + "epoch": 0.2, + "grad_norm": 2.99464323548315, + "learning_rate": 9.241047849183857e-06, + "loss": 0.3863, + "step": 6977 + }, + { + "epoch": 0.2, + "grad_norm": 1.5147733159584922, + "learning_rate": 9.240799044756923e-06, + "loss": 0.374, + "step": 6978 + }, + { + "epoch": 0.2, + "grad_norm": 1.6762820471656585, + "learning_rate": 9.24055020290489e-06, + "loss": 0.38, + "step": 6979 + }, + { + "epoch": 0.2, + "grad_norm": 1.6929312018264482, + "learning_rate": 9.240301323629954e-06, + "loss": 0.3917, + "step": 6980 + }, + { + "epoch": 0.2, + "grad_norm": 1.8789043159171699, + "learning_rate": 9.240052406934311e-06, + "loss": 0.3559, + "step": 6981 + }, + { + "epoch": 0.2, + "grad_norm": 1.7077704641267557, + "learning_rate": 9.23980345282016e-06, + "loss": 0.3829, + "step": 6982 + }, + { + "epoch": 0.2, + "grad_norm": 2.322487886968134, + "learning_rate": 9.239554461289694e-06, + "loss": 0.4049, + "step": 6983 + }, + { + "epoch": 0.2, + "grad_norm": 1.6560402180443266, + "learning_rate": 9.239305432345115e-06, + "loss": 0.3793, + "step": 6984 + }, + { + "epoch": 0.2, + "grad_norm": 1.5907613421593043, + "learning_rate": 9.239056365988619e-06, + "loss": 0.3741, + "step": 6985 + }, + { + "epoch": 0.2, + "grad_norm": 1.416636880915686, + "learning_rate": 9.238807262222403e-06, + "loss": 0.3846, + "step": 6986 + }, + { + "epoch": 0.2, + "grad_norm": 2.0954383867281083, + "learning_rate": 9.238558121048664e-06, + "loss": 0.3626, + "step": 6987 + }, + { + "epoch": 0.2, + "grad_norm": 1.1276254529364345, + "learning_rate": 9.238308942469604e-06, + "loss": 0.5694, + "step": 6988 + }, + { + "epoch": 0.2, + "grad_norm": 9.46318199782732, + "learning_rate": 9.238059726487421e-06, + "loss": 0.3787, + "step": 6989 + }, + { + "epoch": 0.2, + "grad_norm": 1.747368619138754, + "learning_rate": 9.237810473104314e-06, + "loss": 0.3755, + "step": 6990 + }, + { + "epoch": 0.2, + "grad_norm": 1.4248021845734213, + "learning_rate": 9.237561182322481e-06, + "loss": 0.3729, + "step": 6991 + }, + { + "epoch": 0.2, + "grad_norm": 1.5301419247501622, + "learning_rate": 9.237311854144125e-06, + "loss": 0.3783, + "step": 6992 + }, + { + "epoch": 0.2, + "grad_norm": 1.5698321778981759, + "learning_rate": 9.237062488571444e-06, + "loss": 0.3813, + "step": 6993 + }, + { + "epoch": 0.2, + "grad_norm": 1.7274166879220536, + "learning_rate": 9.23681308560664e-06, + "loss": 0.404, + "step": 6994 + }, + { + "epoch": 0.2, + "grad_norm": 1.7281453571441345, + "learning_rate": 9.236563645251914e-06, + "loss": 0.3985, + "step": 6995 + }, + { + "epoch": 0.2, + "grad_norm": 1.8160545331157683, + "learning_rate": 9.236314167509466e-06, + "loss": 0.3889, + "step": 6996 + }, + { + "epoch": 0.2, + "grad_norm": 1.5196545435295703, + "learning_rate": 9.2360646523815e-06, + "loss": 0.3574, + "step": 6997 + }, + { + "epoch": 0.2, + "grad_norm": 1.568581471547852, + "learning_rate": 9.235815099870214e-06, + "loss": 0.3872, + "step": 6998 + }, + { + "epoch": 0.2, + "grad_norm": 1.6456443105113618, + "learning_rate": 9.235565509977815e-06, + "loss": 0.3487, + "step": 6999 + }, + { + "epoch": 0.2, + "grad_norm": 1.531867915133788, + "learning_rate": 9.235315882706501e-06, + "loss": 0.3697, + "step": 7000 + }, + { + "epoch": 0.2, + "grad_norm": 1.6412015044181802, + "learning_rate": 9.235066218058479e-06, + "loss": 0.3683, + "step": 7001 + }, + { + "epoch": 0.2, + "grad_norm": 2.0097965404112803, + "learning_rate": 9.234816516035952e-06, + "loss": 0.3709, + "step": 7002 + }, + { + "epoch": 0.2, + "grad_norm": 1.5313706877554891, + "learning_rate": 9.23456677664112e-06, + "loss": 0.3836, + "step": 7003 + }, + { + "epoch": 0.2, + "grad_norm": 1.4881198294708247, + "learning_rate": 9.23431699987619e-06, + "loss": 0.3733, + "step": 7004 + }, + { + "epoch": 0.2, + "grad_norm": 1.7681699789428271, + "learning_rate": 9.234067185743365e-06, + "loss": 0.3741, + "step": 7005 + }, + { + "epoch": 0.2, + "grad_norm": 1.8033211503669142, + "learning_rate": 9.23381733424485e-06, + "loss": 0.3785, + "step": 7006 + }, + { + "epoch": 0.2, + "grad_norm": 1.5643307579595023, + "learning_rate": 9.233567445382852e-06, + "loss": 0.3867, + "step": 7007 + }, + { + "epoch": 0.2, + "grad_norm": 1.5878278043023415, + "learning_rate": 9.233317519159573e-06, + "loss": 0.3779, + "step": 7008 + }, + { + "epoch": 0.2, + "grad_norm": 1.7130762208072332, + "learning_rate": 9.233067555577219e-06, + "loss": 0.3573, + "step": 7009 + }, + { + "epoch": 0.2, + "grad_norm": 1.460285482994176, + "learning_rate": 9.232817554637998e-06, + "loss": 0.3631, + "step": 7010 + }, + { + "epoch": 0.2, + "grad_norm": 2.8101535864527416, + "learning_rate": 9.232567516344115e-06, + "loss": 0.3676, + "step": 7011 + }, + { + "epoch": 0.2, + "grad_norm": 1.7216534707458988, + "learning_rate": 9.232317440697776e-06, + "loss": 0.3553, + "step": 7012 + }, + { + "epoch": 0.2, + "grad_norm": 1.9730042330615227, + "learning_rate": 9.232067327701188e-06, + "loss": 0.3745, + "step": 7013 + }, + { + "epoch": 0.2, + "grad_norm": 2.3200762940335347, + "learning_rate": 9.231817177356561e-06, + "loss": 0.3747, + "step": 7014 + }, + { + "epoch": 0.2, + "grad_norm": 3.0443613123214495, + "learning_rate": 9.2315669896661e-06, + "loss": 0.3787, + "step": 7015 + }, + { + "epoch": 0.2, + "grad_norm": 1.435088073126304, + "learning_rate": 9.231316764632013e-06, + "loss": 0.3596, + "step": 7016 + }, + { + "epoch": 0.2, + "grad_norm": 1.5722518894084287, + "learning_rate": 9.231066502256508e-06, + "loss": 0.3878, + "step": 7017 + }, + { + "epoch": 0.2, + "grad_norm": 1.5747043688009812, + "learning_rate": 9.230816202541795e-06, + "loss": 0.3897, + "step": 7018 + }, + { + "epoch": 0.2, + "grad_norm": 1.5498282327547543, + "learning_rate": 9.230565865490082e-06, + "loss": 0.3801, + "step": 7019 + }, + { + "epoch": 0.2, + "grad_norm": 1.4974750020038443, + "learning_rate": 9.230315491103577e-06, + "loss": 0.3568, + "step": 7020 + }, + { + "epoch": 0.2, + "grad_norm": 1.5117784881859888, + "learning_rate": 9.230065079384493e-06, + "loss": 0.3878, + "step": 7021 + }, + { + "epoch": 0.2, + "grad_norm": 1.490022387858742, + "learning_rate": 9.229814630335037e-06, + "loss": 0.3851, + "step": 7022 + }, + { + "epoch": 0.2, + "grad_norm": 1.5034769360861646, + "learning_rate": 9.229564143957419e-06, + "loss": 0.3849, + "step": 7023 + }, + { + "epoch": 0.2, + "grad_norm": 3.0222132763003184, + "learning_rate": 9.22931362025385e-06, + "loss": 0.3764, + "step": 7024 + }, + { + "epoch": 0.2, + "grad_norm": 1.8186029579764416, + "learning_rate": 9.229063059226544e-06, + "loss": 0.3819, + "step": 7025 + }, + { + "epoch": 0.2, + "grad_norm": 1.587097536431477, + "learning_rate": 9.228812460877708e-06, + "loss": 0.3596, + "step": 7026 + }, + { + "epoch": 0.2, + "grad_norm": 1.0218801836495404, + "learning_rate": 9.228561825209555e-06, + "loss": 0.5847, + "step": 7027 + }, + { + "epoch": 0.2, + "grad_norm": 1.6814684088235161, + "learning_rate": 9.228311152224297e-06, + "loss": 0.3547, + "step": 7028 + }, + { + "epoch": 0.2, + "grad_norm": 2.225493626365544, + "learning_rate": 9.228060441924147e-06, + "loss": 0.3824, + "step": 7029 + }, + { + "epoch": 0.2, + "grad_norm": 1.5008650770514016, + "learning_rate": 9.227809694311316e-06, + "loss": 0.3949, + "step": 7030 + }, + { + "epoch": 0.2, + "grad_norm": 1.6731566645465186, + "learning_rate": 9.227558909388018e-06, + "loss": 0.3439, + "step": 7031 + }, + { + "epoch": 0.2, + "grad_norm": 1.8728602953339086, + "learning_rate": 9.227308087156464e-06, + "loss": 0.3901, + "step": 7032 + }, + { + "epoch": 0.2, + "grad_norm": 1.9371883618861105, + "learning_rate": 9.227057227618872e-06, + "loss": 0.3705, + "step": 7033 + }, + { + "epoch": 0.2, + "grad_norm": 1.5149869534222304, + "learning_rate": 9.226806330777451e-06, + "loss": 0.3701, + "step": 7034 + }, + { + "epoch": 0.2, + "grad_norm": 1.8217688974959052, + "learning_rate": 9.226555396634419e-06, + "loss": 0.3782, + "step": 7035 + }, + { + "epoch": 0.2, + "grad_norm": 1.6296834739241064, + "learning_rate": 9.226304425191987e-06, + "loss": 0.3845, + "step": 7036 + }, + { + "epoch": 0.2, + "grad_norm": 1.4241465111752547, + "learning_rate": 9.226053416452372e-06, + "loss": 0.3758, + "step": 7037 + }, + { + "epoch": 0.2, + "grad_norm": 4.291649782591441, + "learning_rate": 9.225802370417789e-06, + "loss": 0.3612, + "step": 7038 + }, + { + "epoch": 0.2, + "grad_norm": 1.6534092737828467, + "learning_rate": 9.225551287090453e-06, + "loss": 0.3847, + "step": 7039 + }, + { + "epoch": 0.2, + "grad_norm": 1.5613734455223574, + "learning_rate": 9.22530016647258e-06, + "loss": 0.3875, + "step": 7040 + }, + { + "epoch": 0.2, + "grad_norm": 1.8242272805246125, + "learning_rate": 9.225049008566386e-06, + "loss": 0.3918, + "step": 7041 + }, + { + "epoch": 0.2, + "grad_norm": 2.465285514735929, + "learning_rate": 9.22479781337409e-06, + "loss": 0.3672, + "step": 7042 + }, + { + "epoch": 0.2, + "grad_norm": 1.6362975096442034, + "learning_rate": 9.224546580897903e-06, + "loss": 0.3778, + "step": 7043 + }, + { + "epoch": 0.2, + "grad_norm": 1.9583811385825352, + "learning_rate": 9.224295311140047e-06, + "loss": 0.3682, + "step": 7044 + }, + { + "epoch": 0.2, + "grad_norm": 1.5308001173681713, + "learning_rate": 9.224044004102737e-06, + "loss": 0.4051, + "step": 7045 + }, + { + "epoch": 0.2, + "grad_norm": 1.803529567228599, + "learning_rate": 9.223792659788194e-06, + "loss": 0.3616, + "step": 7046 + }, + { + "epoch": 0.2, + "grad_norm": 1.7735334374516734, + "learning_rate": 9.223541278198632e-06, + "loss": 0.3406, + "step": 7047 + }, + { + "epoch": 0.2, + "grad_norm": 1.563998671966421, + "learning_rate": 9.223289859336272e-06, + "loss": 0.3646, + "step": 7048 + }, + { + "epoch": 0.2, + "grad_norm": 1.883646303870704, + "learning_rate": 9.223038403203333e-06, + "loss": 0.3609, + "step": 7049 + }, + { + "epoch": 0.2, + "grad_norm": 1.7281731139641048, + "learning_rate": 9.222786909802032e-06, + "loss": 0.3706, + "step": 7050 + }, + { + "epoch": 0.2, + "grad_norm": 1.731098112040117, + "learning_rate": 9.22253537913459e-06, + "loss": 0.3673, + "step": 7051 + }, + { + "epoch": 0.2, + "grad_norm": 2.690856751455843, + "learning_rate": 9.222283811203227e-06, + "loss": 0.3792, + "step": 7052 + }, + { + "epoch": 0.2, + "grad_norm": 2.0662682461786144, + "learning_rate": 9.222032206010161e-06, + "loss": 0.3732, + "step": 7053 + }, + { + "epoch": 0.2, + "grad_norm": 1.528915197226644, + "learning_rate": 9.221780563557616e-06, + "loss": 0.3912, + "step": 7054 + }, + { + "epoch": 0.2, + "grad_norm": 1.4668686366231558, + "learning_rate": 9.22152888384781e-06, + "loss": 0.3553, + "step": 7055 + }, + { + "epoch": 0.2, + "grad_norm": 1.8226936691085267, + "learning_rate": 9.221277166882963e-06, + "loss": 0.4221, + "step": 7056 + }, + { + "epoch": 0.2, + "grad_norm": 1.5718917467783766, + "learning_rate": 9.221025412665301e-06, + "loss": 0.3628, + "step": 7057 + }, + { + "epoch": 0.2, + "grad_norm": 1.6369497967854003, + "learning_rate": 9.22077362119704e-06, + "loss": 0.3743, + "step": 7058 + }, + { + "epoch": 0.2, + "grad_norm": 1.7952058573786858, + "learning_rate": 9.220521792480409e-06, + "loss": 0.3984, + "step": 7059 + }, + { + "epoch": 0.2, + "grad_norm": 1.7297527679454732, + "learning_rate": 9.220269926517622e-06, + "loss": 0.4158, + "step": 7060 + }, + { + "epoch": 0.2, + "grad_norm": 2.1097870803315986, + "learning_rate": 9.220018023310908e-06, + "loss": 0.3704, + "step": 7061 + }, + { + "epoch": 0.2, + "grad_norm": 2.0275598909964803, + "learning_rate": 9.219766082862489e-06, + "loss": 0.3826, + "step": 7062 + }, + { + "epoch": 0.2, + "grad_norm": 1.8067805819336802, + "learning_rate": 9.219514105174586e-06, + "loss": 0.3569, + "step": 7063 + }, + { + "epoch": 0.2, + "grad_norm": 2.6727638256765256, + "learning_rate": 9.219262090249425e-06, + "loss": 0.3725, + "step": 7064 + }, + { + "epoch": 0.2, + "grad_norm": 1.6525750834140678, + "learning_rate": 9.219010038089229e-06, + "loss": 0.3506, + "step": 7065 + }, + { + "epoch": 0.2, + "grad_norm": 1.7567200698284835, + "learning_rate": 9.218757948696221e-06, + "loss": 0.4235, + "step": 7066 + }, + { + "epoch": 0.2, + "grad_norm": 1.5134679350169828, + "learning_rate": 9.218505822072631e-06, + "loss": 0.3583, + "step": 7067 + }, + { + "epoch": 0.21, + "grad_norm": 1.728349126073742, + "learning_rate": 9.218253658220676e-06, + "loss": 0.3555, + "step": 7068 + }, + { + "epoch": 0.21, + "grad_norm": 1.5513251850885532, + "learning_rate": 9.218001457142589e-06, + "loss": 0.4101, + "step": 7069 + }, + { + "epoch": 0.21, + "grad_norm": 1.7619384209331008, + "learning_rate": 9.217749218840593e-06, + "loss": 0.3511, + "step": 7070 + }, + { + "epoch": 0.21, + "grad_norm": 1.555903660354483, + "learning_rate": 9.21749694331691e-06, + "loss": 0.3771, + "step": 7071 + }, + { + "epoch": 0.21, + "grad_norm": 1.8355955423183812, + "learning_rate": 9.217244630573772e-06, + "loss": 0.3629, + "step": 7072 + }, + { + "epoch": 0.21, + "grad_norm": 1.7875506540525563, + "learning_rate": 9.216992280613403e-06, + "loss": 0.3662, + "step": 7073 + }, + { + "epoch": 0.21, + "grad_norm": 1.0482665150712585, + "learning_rate": 9.21673989343803e-06, + "loss": 0.5856, + "step": 7074 + }, + { + "epoch": 0.21, + "grad_norm": 1.6237726123798457, + "learning_rate": 9.21648746904988e-06, + "loss": 0.3644, + "step": 7075 + }, + { + "epoch": 0.21, + "grad_norm": 2.517727804958999, + "learning_rate": 9.216235007451184e-06, + "loss": 0.3997, + "step": 7076 + }, + { + "epoch": 0.21, + "grad_norm": 2.7178164727587415, + "learning_rate": 9.215982508644163e-06, + "loss": 0.3577, + "step": 7077 + }, + { + "epoch": 0.21, + "grad_norm": 1.6839857297780578, + "learning_rate": 9.215729972631054e-06, + "loss": 0.4115, + "step": 7078 + }, + { + "epoch": 0.21, + "grad_norm": 2.008192591916228, + "learning_rate": 9.215477399414079e-06, + "loss": 0.3659, + "step": 7079 + }, + { + "epoch": 0.21, + "grad_norm": 1.6233456838502645, + "learning_rate": 9.215224788995468e-06, + "loss": 0.3725, + "step": 7080 + }, + { + "epoch": 0.21, + "grad_norm": 1.9857907889637727, + "learning_rate": 9.214972141377453e-06, + "loss": 0.3812, + "step": 7081 + }, + { + "epoch": 0.21, + "grad_norm": 1.863799836098526, + "learning_rate": 9.214719456562262e-06, + "loss": 0.3628, + "step": 7082 + }, + { + "epoch": 0.21, + "grad_norm": 1.7349209890562867, + "learning_rate": 9.214466734552125e-06, + "loss": 0.3544, + "step": 7083 + }, + { + "epoch": 0.21, + "grad_norm": 2.798934720917214, + "learning_rate": 9.214213975349272e-06, + "loss": 0.3842, + "step": 7084 + }, + { + "epoch": 0.21, + "grad_norm": 1.653085972270976, + "learning_rate": 9.213961178955934e-06, + "loss": 0.3548, + "step": 7085 + }, + { + "epoch": 0.21, + "grad_norm": 1.925206858994661, + "learning_rate": 9.213708345374343e-06, + "loss": 0.3678, + "step": 7086 + }, + { + "epoch": 0.21, + "grad_norm": 1.5000259815373254, + "learning_rate": 9.213455474606727e-06, + "loss": 0.3872, + "step": 7087 + }, + { + "epoch": 0.21, + "grad_norm": 1.6246898969612749, + "learning_rate": 9.213202566655322e-06, + "loss": 0.39, + "step": 7088 + }, + { + "epoch": 0.21, + "grad_norm": 1.705737021922213, + "learning_rate": 9.212949621522357e-06, + "loss": 0.3863, + "step": 7089 + }, + { + "epoch": 0.21, + "grad_norm": 3.5226981179405805, + "learning_rate": 9.212696639210064e-06, + "loss": 0.361, + "step": 7090 + }, + { + "epoch": 0.21, + "grad_norm": 2.190310367485849, + "learning_rate": 9.212443619720676e-06, + "loss": 0.3992, + "step": 7091 + }, + { + "epoch": 0.21, + "grad_norm": 1.9719684770305888, + "learning_rate": 9.212190563056426e-06, + "loss": 0.3876, + "step": 7092 + }, + { + "epoch": 0.21, + "grad_norm": 1.7485479880336134, + "learning_rate": 9.21193746921955e-06, + "loss": 0.3709, + "step": 7093 + }, + { + "epoch": 0.21, + "grad_norm": 1.610984159363754, + "learning_rate": 9.211684338212277e-06, + "loss": 0.3656, + "step": 7094 + }, + { + "epoch": 0.21, + "grad_norm": 1.4878972135141635, + "learning_rate": 9.211431170036844e-06, + "loss": 0.3688, + "step": 7095 + }, + { + "epoch": 0.21, + "grad_norm": 2.185408653712937, + "learning_rate": 9.211177964695483e-06, + "loss": 0.3576, + "step": 7096 + }, + { + "epoch": 0.21, + "grad_norm": 1.690710227620507, + "learning_rate": 9.21092472219043e-06, + "loss": 0.3821, + "step": 7097 + }, + { + "epoch": 0.21, + "grad_norm": 1.9297378590426173, + "learning_rate": 9.210671442523922e-06, + "loss": 0.4036, + "step": 7098 + }, + { + "epoch": 0.21, + "grad_norm": 1.507639631878265, + "learning_rate": 9.21041812569819e-06, + "loss": 0.355, + "step": 7099 + }, + { + "epoch": 0.21, + "grad_norm": 1.444041454019282, + "learning_rate": 9.21016477171547e-06, + "loss": 0.3688, + "step": 7100 + }, + { + "epoch": 0.21, + "grad_norm": 1.4927297444397705, + "learning_rate": 9.209911380578002e-06, + "loss": 0.4021, + "step": 7101 + }, + { + "epoch": 0.21, + "grad_norm": 1.596726129350717, + "learning_rate": 9.209657952288017e-06, + "loss": 0.3601, + "step": 7102 + }, + { + "epoch": 0.21, + "grad_norm": 1.7135284465074516, + "learning_rate": 9.209404486847754e-06, + "loss": 0.372, + "step": 7103 + }, + { + "epoch": 0.21, + "grad_norm": 1.8435422024492079, + "learning_rate": 9.209150984259452e-06, + "loss": 0.3626, + "step": 7104 + }, + { + "epoch": 0.21, + "grad_norm": 1.5813745218926756, + "learning_rate": 9.208897444525345e-06, + "loss": 0.3646, + "step": 7105 + }, + { + "epoch": 0.21, + "grad_norm": 1.6231846667146108, + "learning_rate": 9.20864386764767e-06, + "loss": 0.3725, + "step": 7106 + }, + { + "epoch": 0.21, + "grad_norm": 2.14572659553367, + "learning_rate": 9.208390253628667e-06, + "loss": 0.3621, + "step": 7107 + }, + { + "epoch": 0.21, + "grad_norm": 1.6759884729851666, + "learning_rate": 9.208136602470574e-06, + "loss": 0.3706, + "step": 7108 + }, + { + "epoch": 0.21, + "grad_norm": 2.0373474269996827, + "learning_rate": 9.207882914175627e-06, + "loss": 0.3313, + "step": 7109 + }, + { + "epoch": 0.21, + "grad_norm": 2.20777751609752, + "learning_rate": 9.207629188746068e-06, + "loss": 0.3604, + "step": 7110 + }, + { + "epoch": 0.21, + "grad_norm": 1.7869656659945992, + "learning_rate": 9.207375426184135e-06, + "loss": 0.3603, + "step": 7111 + }, + { + "epoch": 0.21, + "grad_norm": 1.772197916525323, + "learning_rate": 9.207121626492066e-06, + "loss": 0.3567, + "step": 7112 + }, + { + "epoch": 0.21, + "grad_norm": 2.352238353761793, + "learning_rate": 9.206867789672103e-06, + "loss": 0.3706, + "step": 7113 + }, + { + "epoch": 0.21, + "grad_norm": 1.804250973244036, + "learning_rate": 9.206613915726484e-06, + "loss": 0.3757, + "step": 7114 + }, + { + "epoch": 0.21, + "grad_norm": 1.8700927990110314, + "learning_rate": 9.20636000465745e-06, + "loss": 0.3745, + "step": 7115 + }, + { + "epoch": 0.21, + "grad_norm": 2.8690963839925607, + "learning_rate": 9.206106056467246e-06, + "loss": 0.3696, + "step": 7116 + }, + { + "epoch": 0.21, + "grad_norm": 4.168364152744223, + "learning_rate": 9.205852071158106e-06, + "loss": 0.3925, + "step": 7117 + }, + { + "epoch": 0.21, + "grad_norm": 2.364091204775608, + "learning_rate": 9.205598048732275e-06, + "loss": 0.3868, + "step": 7118 + }, + { + "epoch": 0.21, + "grad_norm": 2.2124692254516765, + "learning_rate": 9.205343989191995e-06, + "loss": 0.3523, + "step": 7119 + }, + { + "epoch": 0.21, + "grad_norm": 1.7270924056962238, + "learning_rate": 9.205089892539509e-06, + "loss": 0.3952, + "step": 7120 + }, + { + "epoch": 0.21, + "grad_norm": 2.3886804858033375, + "learning_rate": 9.204835758777057e-06, + "loss": 0.3828, + "step": 7121 + }, + { + "epoch": 0.21, + "grad_norm": 1.4858062946401966, + "learning_rate": 9.204581587906883e-06, + "loss": 0.3856, + "step": 7122 + }, + { + "epoch": 0.21, + "grad_norm": 1.7360117447418983, + "learning_rate": 9.204327379931228e-06, + "loss": 0.4125, + "step": 7123 + }, + { + "epoch": 0.21, + "grad_norm": 1.6450067043478223, + "learning_rate": 9.204073134852341e-06, + "loss": 0.3667, + "step": 7124 + }, + { + "epoch": 0.21, + "grad_norm": 1.7386256288737492, + "learning_rate": 9.203818852672458e-06, + "loss": 0.3795, + "step": 7125 + }, + { + "epoch": 0.21, + "grad_norm": 1.6760131046574314, + "learning_rate": 9.203564533393829e-06, + "loss": 0.388, + "step": 7126 + }, + { + "epoch": 0.21, + "grad_norm": 1.8953638359068536, + "learning_rate": 9.203310177018696e-06, + "loss": 0.3892, + "step": 7127 + }, + { + "epoch": 0.21, + "grad_norm": 1.7297859953214823, + "learning_rate": 9.203055783549304e-06, + "loss": 0.3614, + "step": 7128 + }, + { + "epoch": 0.21, + "grad_norm": 1.7623934218607629, + "learning_rate": 9.202801352987898e-06, + "loss": 0.351, + "step": 7129 + }, + { + "epoch": 0.21, + "grad_norm": 8.47405679584566, + "learning_rate": 9.202546885336725e-06, + "loss": 0.3661, + "step": 7130 + }, + { + "epoch": 0.21, + "grad_norm": 1.946369769016322, + "learning_rate": 9.202292380598027e-06, + "loss": 0.3683, + "step": 7131 + }, + { + "epoch": 0.21, + "grad_norm": 1.5020144781907174, + "learning_rate": 9.202037838774053e-06, + "loss": 0.3636, + "step": 7132 + }, + { + "epoch": 0.21, + "grad_norm": 1.6992308663009341, + "learning_rate": 9.201783259867048e-06, + "loss": 0.3701, + "step": 7133 + }, + { + "epoch": 0.21, + "grad_norm": 1.0309471009234588, + "learning_rate": 9.201528643879259e-06, + "loss": 0.5613, + "step": 7134 + }, + { + "epoch": 0.21, + "grad_norm": 1.685788903458198, + "learning_rate": 9.201273990812933e-06, + "loss": 0.4155, + "step": 7135 + }, + { + "epoch": 0.21, + "grad_norm": 1.4609085431495836, + "learning_rate": 9.20101930067032e-06, + "loss": 0.3606, + "step": 7136 + }, + { + "epoch": 0.21, + "grad_norm": 1.6034329545686374, + "learning_rate": 9.200764573453662e-06, + "loss": 0.3662, + "step": 7137 + }, + { + "epoch": 0.21, + "grad_norm": 1.9162581735170414, + "learning_rate": 9.200509809165212e-06, + "loss": 0.3655, + "step": 7138 + }, + { + "epoch": 0.21, + "grad_norm": 1.760977030496341, + "learning_rate": 9.200255007807215e-06, + "loss": 0.3813, + "step": 7139 + }, + { + "epoch": 0.21, + "grad_norm": 1.3869720635886071, + "learning_rate": 9.200000169381922e-06, + "loss": 0.3567, + "step": 7140 + }, + { + "epoch": 0.21, + "grad_norm": 2.338190405819497, + "learning_rate": 9.19974529389158e-06, + "loss": 0.3722, + "step": 7141 + }, + { + "epoch": 0.21, + "grad_norm": 1.8008432438520474, + "learning_rate": 9.199490381338439e-06, + "loss": 0.3693, + "step": 7142 + }, + { + "epoch": 0.21, + "grad_norm": 2.150033027601895, + "learning_rate": 9.199235431724749e-06, + "loss": 0.3734, + "step": 7143 + }, + { + "epoch": 0.21, + "grad_norm": 1.8229719727896605, + "learning_rate": 9.19898044505276e-06, + "loss": 0.3542, + "step": 7144 + }, + { + "epoch": 0.21, + "grad_norm": 1.597390666219037, + "learning_rate": 9.198725421324724e-06, + "loss": 0.3768, + "step": 7145 + }, + { + "epoch": 0.21, + "grad_norm": 2.3350422657989784, + "learning_rate": 9.198470360542886e-06, + "loss": 0.3881, + "step": 7146 + }, + { + "epoch": 0.21, + "grad_norm": 1.9533781476704257, + "learning_rate": 9.198215262709502e-06, + "loss": 0.3732, + "step": 7147 + }, + { + "epoch": 0.21, + "grad_norm": 1.5236548898790092, + "learning_rate": 9.19796012782682e-06, + "loss": 0.3587, + "step": 7148 + }, + { + "epoch": 0.21, + "grad_norm": 1.6488602784769744, + "learning_rate": 9.197704955897097e-06, + "loss": 0.3795, + "step": 7149 + }, + { + "epoch": 0.21, + "grad_norm": 1.8162330991615014, + "learning_rate": 9.197449746922579e-06, + "loss": 0.3702, + "step": 7150 + }, + { + "epoch": 0.21, + "grad_norm": 1.5345870807800166, + "learning_rate": 9.197194500905521e-06, + "loss": 0.3635, + "step": 7151 + }, + { + "epoch": 0.21, + "grad_norm": 1.8497366090526888, + "learning_rate": 9.196939217848175e-06, + "loss": 0.3836, + "step": 7152 + }, + { + "epoch": 0.21, + "grad_norm": 1.8631271307803599, + "learning_rate": 9.196683897752794e-06, + "loss": 0.3566, + "step": 7153 + }, + { + "epoch": 0.21, + "grad_norm": 1.9364826946587461, + "learning_rate": 9.196428540621632e-06, + "loss": 0.3977, + "step": 7154 + }, + { + "epoch": 0.21, + "grad_norm": 1.8431447436883497, + "learning_rate": 9.19617314645694e-06, + "loss": 0.3985, + "step": 7155 + }, + { + "epoch": 0.21, + "grad_norm": 2.038050299297245, + "learning_rate": 9.195917715260973e-06, + "loss": 0.36, + "step": 7156 + }, + { + "epoch": 0.21, + "grad_norm": 2.2072182437877044, + "learning_rate": 9.195662247035985e-06, + "loss": 0.4113, + "step": 7157 + }, + { + "epoch": 0.21, + "grad_norm": 1.732212782801728, + "learning_rate": 9.195406741784233e-06, + "loss": 0.37, + "step": 7158 + }, + { + "epoch": 0.21, + "grad_norm": 1.5851931582617165, + "learning_rate": 9.195151199507971e-06, + "loss": 0.382, + "step": 7159 + }, + { + "epoch": 0.21, + "grad_norm": 1.5943205689956461, + "learning_rate": 9.19489562020945e-06, + "loss": 0.3781, + "step": 7160 + }, + { + "epoch": 0.21, + "grad_norm": 1.425526123528467, + "learning_rate": 9.19464000389093e-06, + "loss": 0.3358, + "step": 7161 + }, + { + "epoch": 0.21, + "grad_norm": 2.2636996158172034, + "learning_rate": 9.194384350554666e-06, + "loss": 0.3719, + "step": 7162 + }, + { + "epoch": 0.21, + "grad_norm": 1.4747675172028079, + "learning_rate": 9.194128660202913e-06, + "loss": 0.36, + "step": 7163 + }, + { + "epoch": 0.21, + "grad_norm": 1.6867531658903006, + "learning_rate": 9.193872932837927e-06, + "loss": 0.3838, + "step": 7164 + }, + { + "epoch": 0.21, + "grad_norm": 1.589145874631122, + "learning_rate": 9.193617168461968e-06, + "loss": 0.3841, + "step": 7165 + }, + { + "epoch": 0.21, + "grad_norm": 1.6620495204142758, + "learning_rate": 9.193361367077288e-06, + "loss": 0.3577, + "step": 7166 + }, + { + "epoch": 0.21, + "grad_norm": 1.4035768180325743, + "learning_rate": 9.19310552868615e-06, + "loss": 0.3384, + "step": 7167 + }, + { + "epoch": 0.21, + "grad_norm": 1.58990084571886, + "learning_rate": 9.19284965329081e-06, + "loss": 0.3598, + "step": 7168 + }, + { + "epoch": 0.21, + "grad_norm": 1.51536694980111, + "learning_rate": 9.192593740893524e-06, + "loss": 0.365, + "step": 7169 + }, + { + "epoch": 0.21, + "grad_norm": 1.583184840957754, + "learning_rate": 9.19233779149655e-06, + "loss": 0.3646, + "step": 7170 + }, + { + "epoch": 0.21, + "grad_norm": 1.5633877658826454, + "learning_rate": 9.192081805102151e-06, + "loss": 0.398, + "step": 7171 + }, + { + "epoch": 0.21, + "grad_norm": 1.5830717240636194, + "learning_rate": 9.191825781712583e-06, + "loss": 0.3884, + "step": 7172 + }, + { + "epoch": 0.21, + "grad_norm": 1.52797631199018, + "learning_rate": 9.191569721330104e-06, + "loss": 0.3548, + "step": 7173 + }, + { + "epoch": 0.21, + "grad_norm": 1.4688879254790965, + "learning_rate": 9.191313623956978e-06, + "loss": 0.3636, + "step": 7174 + }, + { + "epoch": 0.21, + "grad_norm": 1.5395496925257695, + "learning_rate": 9.191057489595464e-06, + "loss": 0.3654, + "step": 7175 + }, + { + "epoch": 0.21, + "grad_norm": 1.4686506187780846, + "learning_rate": 9.190801318247817e-06, + "loss": 0.3671, + "step": 7176 + }, + { + "epoch": 0.21, + "grad_norm": 1.762409941496756, + "learning_rate": 9.190545109916305e-06, + "loss": 0.3958, + "step": 7177 + }, + { + "epoch": 0.21, + "grad_norm": 1.7245970328011893, + "learning_rate": 9.190288864603185e-06, + "loss": 0.3797, + "step": 7178 + }, + { + "epoch": 0.21, + "grad_norm": 2.524350055269702, + "learning_rate": 9.190032582310719e-06, + "loss": 0.3938, + "step": 7179 + }, + { + "epoch": 0.21, + "grad_norm": 1.8602039234894778, + "learning_rate": 9.18977626304117e-06, + "loss": 0.369, + "step": 7180 + }, + { + "epoch": 0.21, + "grad_norm": 1.5279372671400107, + "learning_rate": 9.189519906796798e-06, + "loss": 0.3607, + "step": 7181 + }, + { + "epoch": 0.21, + "grad_norm": 1.5750572889393515, + "learning_rate": 9.189263513579866e-06, + "loss": 0.364, + "step": 7182 + }, + { + "epoch": 0.21, + "grad_norm": 1.5326775507068158, + "learning_rate": 9.189007083392639e-06, + "loss": 0.359, + "step": 7183 + }, + { + "epoch": 0.21, + "grad_norm": 1.367398868871133, + "learning_rate": 9.188750616237376e-06, + "loss": 0.3498, + "step": 7184 + }, + { + "epoch": 0.21, + "grad_norm": 2.4668565500645783, + "learning_rate": 9.188494112116343e-06, + "loss": 0.3662, + "step": 7185 + }, + { + "epoch": 0.21, + "grad_norm": 1.57533580033637, + "learning_rate": 9.188237571031804e-06, + "loss": 0.3777, + "step": 7186 + }, + { + "epoch": 0.21, + "grad_norm": 1.679488183417649, + "learning_rate": 9.18798099298602e-06, + "loss": 0.3525, + "step": 7187 + }, + { + "epoch": 0.21, + "grad_norm": 1.423515750879761, + "learning_rate": 9.187724377981259e-06, + "loss": 0.44, + "step": 7188 + }, + { + "epoch": 0.21, + "grad_norm": 1.4793009240465294, + "learning_rate": 9.187467726019784e-06, + "loss": 0.3566, + "step": 7189 + }, + { + "epoch": 0.21, + "grad_norm": 2.380613128420746, + "learning_rate": 9.187211037103857e-06, + "loss": 0.3921, + "step": 7190 + }, + { + "epoch": 0.21, + "grad_norm": 1.4344452450755523, + "learning_rate": 9.186954311235749e-06, + "loss": 0.3487, + "step": 7191 + }, + { + "epoch": 0.21, + "grad_norm": 1.5663851315919495, + "learning_rate": 9.186697548417722e-06, + "loss": 0.3556, + "step": 7192 + }, + { + "epoch": 0.21, + "grad_norm": 3.388167505535813, + "learning_rate": 9.186440748652042e-06, + "loss": 0.3439, + "step": 7193 + }, + { + "epoch": 0.21, + "grad_norm": 1.5186512725335124, + "learning_rate": 9.186183911940977e-06, + "loss": 0.3635, + "step": 7194 + }, + { + "epoch": 0.21, + "grad_norm": 1.6210606631274278, + "learning_rate": 9.185927038286792e-06, + "loss": 0.3455, + "step": 7195 + }, + { + "epoch": 0.21, + "grad_norm": 1.9926948213930389, + "learning_rate": 9.185670127691755e-06, + "loss": 0.3864, + "step": 7196 + }, + { + "epoch": 0.21, + "grad_norm": 1.439330810996226, + "learning_rate": 9.185413180158132e-06, + "loss": 0.3881, + "step": 7197 + }, + { + "epoch": 0.21, + "grad_norm": 1.85822012189634, + "learning_rate": 9.185156195688192e-06, + "loss": 0.4223, + "step": 7198 + }, + { + "epoch": 0.21, + "grad_norm": 2.772612750352193, + "learning_rate": 9.184899174284201e-06, + "loss": 0.3918, + "step": 7199 + }, + { + "epoch": 0.21, + "grad_norm": 1.4192145838106625, + "learning_rate": 9.18464211594843e-06, + "loss": 0.3768, + "step": 7200 + }, + { + "epoch": 0.21, + "grad_norm": 1.5689092892635939, + "learning_rate": 9.184385020683144e-06, + "loss": 0.3734, + "step": 7201 + }, + { + "epoch": 0.21, + "grad_norm": 1.9929510174572618, + "learning_rate": 9.184127888490615e-06, + "loss": 0.4374, + "step": 7202 + }, + { + "epoch": 0.21, + "grad_norm": 1.4688842555783523, + "learning_rate": 9.183870719373112e-06, + "loss": 0.4018, + "step": 7203 + }, + { + "epoch": 0.21, + "grad_norm": 1.7843489273657582, + "learning_rate": 9.183613513332902e-06, + "loss": 0.3859, + "step": 7204 + }, + { + "epoch": 0.21, + "grad_norm": 1.435358260234249, + "learning_rate": 9.183356270372255e-06, + "loss": 0.3855, + "step": 7205 + }, + { + "epoch": 0.21, + "grad_norm": 1.5938263639264991, + "learning_rate": 9.183098990493443e-06, + "loss": 0.3597, + "step": 7206 + }, + { + "epoch": 0.21, + "grad_norm": 1.4820977945192968, + "learning_rate": 9.182841673698737e-06, + "loss": 0.3823, + "step": 7207 + }, + { + "epoch": 0.21, + "grad_norm": 1.535316514577133, + "learning_rate": 9.182584319990408e-06, + "loss": 0.4176, + "step": 7208 + }, + { + "epoch": 0.21, + "grad_norm": 1.6085952615525625, + "learning_rate": 9.182326929370726e-06, + "loss": 0.3562, + "step": 7209 + }, + { + "epoch": 0.21, + "grad_norm": 1.48292230990871, + "learning_rate": 9.182069501841961e-06, + "loss": 0.3879, + "step": 7210 + }, + { + "epoch": 0.21, + "grad_norm": 2.238302383633747, + "learning_rate": 9.181812037406386e-06, + "loss": 0.3561, + "step": 7211 + }, + { + "epoch": 0.21, + "grad_norm": 2.0070706857609752, + "learning_rate": 9.181554536066274e-06, + "loss": 0.3573, + "step": 7212 + }, + { + "epoch": 0.21, + "grad_norm": 1.6948894565730819, + "learning_rate": 9.181296997823896e-06, + "loss": 0.391, + "step": 7213 + }, + { + "epoch": 0.21, + "grad_norm": 1.377668185544434, + "learning_rate": 9.181039422681527e-06, + "loss": 0.3518, + "step": 7214 + }, + { + "epoch": 0.21, + "grad_norm": 1.749726548348457, + "learning_rate": 9.180781810641437e-06, + "loss": 0.3685, + "step": 7215 + }, + { + "epoch": 0.21, + "grad_norm": 1.450873952006027, + "learning_rate": 9.180524161705903e-06, + "loss": 0.3645, + "step": 7216 + }, + { + "epoch": 0.21, + "grad_norm": 1.377532344890524, + "learning_rate": 9.180266475877195e-06, + "loss": 0.3606, + "step": 7217 + }, + { + "epoch": 0.21, + "grad_norm": 1.5265209373181157, + "learning_rate": 9.180008753157592e-06, + "loss": 0.3842, + "step": 7218 + }, + { + "epoch": 0.21, + "grad_norm": 1.4510159973155505, + "learning_rate": 9.179750993549364e-06, + "loss": 0.3529, + "step": 7219 + }, + { + "epoch": 0.21, + "grad_norm": 1.6005418111267837, + "learning_rate": 9.179493197054785e-06, + "loss": 0.401, + "step": 7220 + }, + { + "epoch": 0.21, + "grad_norm": 2.13079935168049, + "learning_rate": 9.179235363676133e-06, + "loss": 0.3714, + "step": 7221 + }, + { + "epoch": 0.21, + "grad_norm": 2.562916048541077, + "learning_rate": 9.178977493415684e-06, + "loss": 0.3643, + "step": 7222 + }, + { + "epoch": 0.21, + "grad_norm": 1.518789966786515, + "learning_rate": 9.178719586275712e-06, + "loss": 0.3273, + "step": 7223 + }, + { + "epoch": 0.21, + "grad_norm": 1.7331499781857953, + "learning_rate": 9.178461642258492e-06, + "loss": 0.3982, + "step": 7224 + }, + { + "epoch": 0.21, + "grad_norm": 1.3867292022625402, + "learning_rate": 9.178203661366303e-06, + "loss": 0.3538, + "step": 7225 + }, + { + "epoch": 0.21, + "grad_norm": 1.44316115232875, + "learning_rate": 9.17794564360142e-06, + "loss": 0.3823, + "step": 7226 + }, + { + "epoch": 0.21, + "grad_norm": 1.4623956118935146, + "learning_rate": 9.17768758896612e-06, + "loss": 0.3942, + "step": 7227 + }, + { + "epoch": 0.21, + "grad_norm": 1.5971033356122217, + "learning_rate": 9.177429497462681e-06, + "loss": 0.3592, + "step": 7228 + }, + { + "epoch": 0.21, + "grad_norm": 1.4846742477499617, + "learning_rate": 9.17717136909338e-06, + "loss": 0.3648, + "step": 7229 + }, + { + "epoch": 0.21, + "grad_norm": 1.6880742553335684, + "learning_rate": 9.176913203860496e-06, + "loss": 0.3714, + "step": 7230 + }, + { + "epoch": 0.21, + "grad_norm": 1.4398479773058546, + "learning_rate": 9.176655001766306e-06, + "loss": 0.3511, + "step": 7231 + }, + { + "epoch": 0.21, + "grad_norm": 1.3576343444908978, + "learning_rate": 9.17639676281309e-06, + "loss": 0.3758, + "step": 7232 + }, + { + "epoch": 0.21, + "grad_norm": 1.4408563036370363, + "learning_rate": 9.176138487003125e-06, + "loss": 0.3328, + "step": 7233 + }, + { + "epoch": 0.21, + "grad_norm": 1.5124762252393131, + "learning_rate": 9.175880174338693e-06, + "loss": 0.3535, + "step": 7234 + }, + { + "epoch": 0.21, + "grad_norm": 1.5443299659660974, + "learning_rate": 9.175621824822071e-06, + "loss": 0.3611, + "step": 7235 + }, + { + "epoch": 0.21, + "grad_norm": 1.8481096430948687, + "learning_rate": 9.17536343845554e-06, + "loss": 0.3999, + "step": 7236 + }, + { + "epoch": 0.21, + "grad_norm": 1.4954917878843272, + "learning_rate": 9.17510501524138e-06, + "loss": 0.3832, + "step": 7237 + }, + { + "epoch": 0.21, + "grad_norm": 1.6146233356427417, + "learning_rate": 9.174846555181873e-06, + "loss": 0.3942, + "step": 7238 + }, + { + "epoch": 0.21, + "grad_norm": 1.7408480357931242, + "learning_rate": 9.1745880582793e-06, + "loss": 0.3766, + "step": 7239 + }, + { + "epoch": 0.21, + "grad_norm": 1.7610970797753038, + "learning_rate": 9.17432952453594e-06, + "loss": 0.3707, + "step": 7240 + }, + { + "epoch": 0.21, + "grad_norm": 1.5151608137987134, + "learning_rate": 9.174070953954074e-06, + "loss": 0.3734, + "step": 7241 + }, + { + "epoch": 0.21, + "grad_norm": 2.5301057171482317, + "learning_rate": 9.173812346535986e-06, + "loss": 0.3313, + "step": 7242 + }, + { + "epoch": 0.21, + "grad_norm": 2.7864915466243203, + "learning_rate": 9.173553702283959e-06, + "loss": 0.3639, + "step": 7243 + }, + { + "epoch": 0.21, + "grad_norm": 2.299814528350977, + "learning_rate": 9.173295021200275e-06, + "loss": 0.3718, + "step": 7244 + }, + { + "epoch": 0.21, + "grad_norm": 1.638439284614173, + "learning_rate": 9.173036303287215e-06, + "loss": 0.3677, + "step": 7245 + }, + { + "epoch": 0.21, + "grad_norm": 2.534763979710926, + "learning_rate": 9.172777548547063e-06, + "loss": 0.3699, + "step": 7246 + }, + { + "epoch": 0.21, + "grad_norm": 1.9226286751517825, + "learning_rate": 9.172518756982103e-06, + "loss": 0.3543, + "step": 7247 + }, + { + "epoch": 0.21, + "grad_norm": 1.3889587455857244, + "learning_rate": 9.172259928594619e-06, + "loss": 0.362, + "step": 7248 + }, + { + "epoch": 0.21, + "grad_norm": 1.4751240626537554, + "learning_rate": 9.172001063386893e-06, + "loss": 0.3584, + "step": 7249 + }, + { + "epoch": 0.21, + "grad_norm": 1.760268219520533, + "learning_rate": 9.171742161361215e-06, + "loss": 0.3696, + "step": 7250 + }, + { + "epoch": 0.21, + "grad_norm": 1.3297449533272414, + "learning_rate": 9.171483222519864e-06, + "loss": 0.3529, + "step": 7251 + }, + { + "epoch": 0.21, + "grad_norm": 1.8824302396224488, + "learning_rate": 9.171224246865127e-06, + "loss": 0.3455, + "step": 7252 + }, + { + "epoch": 0.21, + "grad_norm": 2.112698363912008, + "learning_rate": 9.170965234399289e-06, + "loss": 0.352, + "step": 7253 + }, + { + "epoch": 0.21, + "grad_norm": 1.5412064476099963, + "learning_rate": 9.170706185124638e-06, + "loss": 0.391, + "step": 7254 + }, + { + "epoch": 0.21, + "grad_norm": 1.5164664395802643, + "learning_rate": 9.170447099043457e-06, + "loss": 0.3739, + "step": 7255 + }, + { + "epoch": 0.21, + "grad_norm": 1.7183060887755859, + "learning_rate": 9.170187976158036e-06, + "loss": 0.3554, + "step": 7256 + }, + { + "epoch": 0.21, + "grad_norm": 1.6044631937736606, + "learning_rate": 9.16992881647066e-06, + "loss": 0.3557, + "step": 7257 + }, + { + "epoch": 0.21, + "grad_norm": 1.4430827921812945, + "learning_rate": 9.169669619983616e-06, + "loss": 0.3712, + "step": 7258 + }, + { + "epoch": 0.21, + "grad_norm": 1.6222244599021716, + "learning_rate": 9.16941038669919e-06, + "loss": 0.3624, + "step": 7259 + }, + { + "epoch": 0.21, + "grad_norm": 1.4674151068129508, + "learning_rate": 9.169151116619672e-06, + "loss": 0.3658, + "step": 7260 + }, + { + "epoch": 0.21, + "grad_norm": 3.1849695181502744, + "learning_rate": 9.16889180974735e-06, + "loss": 0.3565, + "step": 7261 + }, + { + "epoch": 0.21, + "grad_norm": 2.0207244184808486, + "learning_rate": 9.168632466084509e-06, + "loss": 0.3778, + "step": 7262 + }, + { + "epoch": 0.21, + "grad_norm": 1.7779836030882088, + "learning_rate": 9.168373085633442e-06, + "loss": 0.3652, + "step": 7263 + }, + { + "epoch": 0.21, + "grad_norm": 1.4691424128781512, + "learning_rate": 9.168113668396435e-06, + "loss": 0.3778, + "step": 7264 + }, + { + "epoch": 0.21, + "grad_norm": 1.5028242589739509, + "learning_rate": 9.16785421437578e-06, + "loss": 0.3886, + "step": 7265 + }, + { + "epoch": 0.21, + "grad_norm": 1.5491007510005403, + "learning_rate": 9.167594723573765e-06, + "loss": 0.3628, + "step": 7266 + }, + { + "epoch": 0.21, + "grad_norm": 1.549500577306857, + "learning_rate": 9.167335195992679e-06, + "loss": 0.3622, + "step": 7267 + }, + { + "epoch": 0.21, + "grad_norm": 2.3228891642609004, + "learning_rate": 9.167075631634816e-06, + "loss": 0.3756, + "step": 7268 + }, + { + "epoch": 0.21, + "grad_norm": 1.6637949305012736, + "learning_rate": 9.166816030502462e-06, + "loss": 0.3843, + "step": 7269 + }, + { + "epoch": 0.21, + "grad_norm": 3.050790701073695, + "learning_rate": 9.166556392597912e-06, + "loss": 0.391, + "step": 7270 + }, + { + "epoch": 0.21, + "grad_norm": 1.8562213996442165, + "learning_rate": 9.166296717923455e-06, + "loss": 0.3671, + "step": 7271 + }, + { + "epoch": 0.21, + "grad_norm": 1.6405038490496788, + "learning_rate": 9.166037006481384e-06, + "loss": 0.3689, + "step": 7272 + }, + { + "epoch": 0.21, + "grad_norm": 1.8028225345056441, + "learning_rate": 9.16577725827399e-06, + "loss": 0.3451, + "step": 7273 + }, + { + "epoch": 0.21, + "grad_norm": 1.0386510227450703, + "learning_rate": 9.165517473303563e-06, + "loss": 0.5853, + "step": 7274 + }, + { + "epoch": 0.21, + "grad_norm": 1.5339841709341988, + "learning_rate": 9.165257651572401e-06, + "loss": 0.3772, + "step": 7275 + }, + { + "epoch": 0.21, + "grad_norm": 1.7962958947164536, + "learning_rate": 9.164997793082793e-06, + "loss": 0.3875, + "step": 7276 + }, + { + "epoch": 0.21, + "grad_norm": 1.5102987450280756, + "learning_rate": 9.164737897837033e-06, + "loss": 0.3729, + "step": 7277 + }, + { + "epoch": 0.21, + "grad_norm": 1.6197931331359425, + "learning_rate": 9.164477965837415e-06, + "loss": 0.3468, + "step": 7278 + }, + { + "epoch": 0.21, + "grad_norm": 2.1934965806512223, + "learning_rate": 9.164217997086234e-06, + "loss": 0.3466, + "step": 7279 + }, + { + "epoch": 0.21, + "grad_norm": 1.5017989604230149, + "learning_rate": 9.16395799158578e-06, + "loss": 0.3845, + "step": 7280 + }, + { + "epoch": 0.21, + "grad_norm": 2.392701676839056, + "learning_rate": 9.163697949338353e-06, + "loss": 0.385, + "step": 7281 + }, + { + "epoch": 0.21, + "grad_norm": 1.4357893664010166, + "learning_rate": 9.163437870346246e-06, + "loss": 0.39, + "step": 7282 + }, + { + "epoch": 0.21, + "grad_norm": 1.9254499662957154, + "learning_rate": 9.163177754611752e-06, + "loss": 0.3543, + "step": 7283 + }, + { + "epoch": 0.21, + "grad_norm": 1.6835896084978461, + "learning_rate": 9.162917602137169e-06, + "loss": 0.357, + "step": 7284 + }, + { + "epoch": 0.21, + "grad_norm": 1.726553649276968, + "learning_rate": 9.162657412924791e-06, + "loss": 0.3791, + "step": 7285 + }, + { + "epoch": 0.21, + "grad_norm": 1.7157041376917987, + "learning_rate": 9.162397186976916e-06, + "loss": 0.3779, + "step": 7286 + }, + { + "epoch": 0.21, + "grad_norm": 2.9527396036762266, + "learning_rate": 9.162136924295839e-06, + "loss": 0.3904, + "step": 7287 + }, + { + "epoch": 0.21, + "grad_norm": 1.7099379975066564, + "learning_rate": 9.161876624883858e-06, + "loss": 0.3809, + "step": 7288 + }, + { + "epoch": 0.21, + "grad_norm": 1.7481069327318473, + "learning_rate": 9.161616288743268e-06, + "loss": 0.3592, + "step": 7289 + }, + { + "epoch": 0.21, + "grad_norm": 1.5325614876762896, + "learning_rate": 9.16135591587637e-06, + "loss": 0.377, + "step": 7290 + }, + { + "epoch": 0.21, + "grad_norm": 1.0269815645349492, + "learning_rate": 9.16109550628546e-06, + "loss": 0.6279, + "step": 7291 + }, + { + "epoch": 0.21, + "grad_norm": 0.9647273104520426, + "learning_rate": 9.160835059972837e-06, + "loss": 0.5794, + "step": 7292 + }, + { + "epoch": 0.21, + "grad_norm": 1.6998345015014307, + "learning_rate": 9.160574576940795e-06, + "loss": 0.3806, + "step": 7293 + }, + { + "epoch": 0.21, + "grad_norm": 1.7005715522991407, + "learning_rate": 9.16031405719164e-06, + "loss": 0.4079, + "step": 7294 + }, + { + "epoch": 0.21, + "grad_norm": 2.3212621002877247, + "learning_rate": 9.160053500727664e-06, + "loss": 0.3961, + "step": 7295 + }, + { + "epoch": 0.21, + "grad_norm": 1.6314449279076417, + "learning_rate": 9.159792907551171e-06, + "loss": 0.3886, + "step": 7296 + }, + { + "epoch": 0.21, + "grad_norm": 1.7729968028404026, + "learning_rate": 9.15953227766446e-06, + "loss": 0.387, + "step": 7297 + }, + { + "epoch": 0.21, + "grad_norm": 1.5337291614140536, + "learning_rate": 9.15927161106983e-06, + "loss": 0.3979, + "step": 7298 + }, + { + "epoch": 0.21, + "grad_norm": 1.6558839487458874, + "learning_rate": 9.159010907769583e-06, + "loss": 0.3891, + "step": 7299 + }, + { + "epoch": 0.21, + "grad_norm": 1.4071036824323808, + "learning_rate": 9.158750167766018e-06, + "loss": 0.375, + "step": 7300 + }, + { + "epoch": 0.21, + "grad_norm": 2.016710268420628, + "learning_rate": 9.158489391061434e-06, + "loss": 0.3634, + "step": 7301 + }, + { + "epoch": 0.21, + "grad_norm": 2.6528130754218795, + "learning_rate": 9.158228577658137e-06, + "loss": 0.4091, + "step": 7302 + }, + { + "epoch": 0.21, + "grad_norm": 2.11000188017495, + "learning_rate": 9.157967727558427e-06, + "loss": 0.3783, + "step": 7303 + }, + { + "epoch": 0.21, + "grad_norm": 1.7338538247324757, + "learning_rate": 9.157706840764607e-06, + "loss": 0.4265, + "step": 7304 + }, + { + "epoch": 0.21, + "grad_norm": 1.684711500460785, + "learning_rate": 9.157445917278975e-06, + "loss": 0.3875, + "step": 7305 + }, + { + "epoch": 0.21, + "grad_norm": 1.8079577497620591, + "learning_rate": 9.15718495710384e-06, + "loss": 0.3577, + "step": 7306 + }, + { + "epoch": 0.21, + "grad_norm": 1.5118901264221412, + "learning_rate": 9.1569239602415e-06, + "loss": 0.3646, + "step": 7307 + }, + { + "epoch": 0.21, + "grad_norm": 1.5010133969650497, + "learning_rate": 9.156662926694257e-06, + "loss": 0.3795, + "step": 7308 + }, + { + "epoch": 0.21, + "grad_norm": 1.4071786220699676, + "learning_rate": 9.15640185646442e-06, + "loss": 0.3743, + "step": 7309 + }, + { + "epoch": 0.21, + "grad_norm": 1.4647272834550829, + "learning_rate": 9.156140749554291e-06, + "loss": 0.3587, + "step": 7310 + }, + { + "epoch": 0.21, + "grad_norm": 1.3700971069353525, + "learning_rate": 9.155879605966175e-06, + "loss": 0.3986, + "step": 7311 + }, + { + "epoch": 0.21, + "grad_norm": 1.5388011053287023, + "learning_rate": 9.155618425702374e-06, + "loss": 0.3558, + "step": 7312 + }, + { + "epoch": 0.21, + "grad_norm": 1.3134279046946236, + "learning_rate": 9.155357208765195e-06, + "loss": 0.6467, + "step": 7313 + }, + { + "epoch": 0.21, + "grad_norm": 1.933463355886289, + "learning_rate": 9.155095955156941e-06, + "loss": 0.3522, + "step": 7314 + }, + { + "epoch": 0.21, + "grad_norm": 1.7705217506286333, + "learning_rate": 9.154834664879919e-06, + "loss": 0.3801, + "step": 7315 + }, + { + "epoch": 0.21, + "grad_norm": 1.6691983353947013, + "learning_rate": 9.154573337936437e-06, + "loss": 0.3609, + "step": 7316 + }, + { + "epoch": 0.21, + "grad_norm": 1.4389468624453976, + "learning_rate": 9.154311974328797e-06, + "loss": 0.3796, + "step": 7317 + }, + { + "epoch": 0.21, + "grad_norm": 1.5103434968218692, + "learning_rate": 9.15405057405931e-06, + "loss": 0.3776, + "step": 7318 + }, + { + "epoch": 0.21, + "grad_norm": 1.5831200665153518, + "learning_rate": 9.153789137130279e-06, + "loss": 0.3739, + "step": 7319 + }, + { + "epoch": 0.21, + "grad_norm": 1.6270713117166342, + "learning_rate": 9.153527663544013e-06, + "loss": 0.4167, + "step": 7320 + }, + { + "epoch": 0.21, + "grad_norm": 3.320304809622989, + "learning_rate": 9.15326615330282e-06, + "loss": 0.4013, + "step": 7321 + }, + { + "epoch": 0.21, + "grad_norm": 1.5010335709365028, + "learning_rate": 9.153004606409006e-06, + "loss": 0.3697, + "step": 7322 + }, + { + "epoch": 0.21, + "grad_norm": 1.4639398835689998, + "learning_rate": 9.15274302286488e-06, + "loss": 0.3672, + "step": 7323 + }, + { + "epoch": 0.21, + "grad_norm": 1.4222417527877118, + "learning_rate": 9.152481402672752e-06, + "loss": 0.3544, + "step": 7324 + }, + { + "epoch": 0.21, + "grad_norm": 1.834907131514894, + "learning_rate": 9.15221974583493e-06, + "loss": 0.3873, + "step": 7325 + }, + { + "epoch": 0.21, + "grad_norm": 1.5257403895246024, + "learning_rate": 9.151958052353722e-06, + "loss": 0.373, + "step": 7326 + }, + { + "epoch": 0.21, + "grad_norm": 1.6801007604400784, + "learning_rate": 9.151696322231438e-06, + "loss": 0.3485, + "step": 7327 + }, + { + "epoch": 0.21, + "grad_norm": 1.5685798539801614, + "learning_rate": 9.151434555470388e-06, + "loss": 0.3709, + "step": 7328 + }, + { + "epoch": 0.21, + "grad_norm": 1.4337602028736014, + "learning_rate": 9.151172752072881e-06, + "loss": 0.3596, + "step": 7329 + }, + { + "epoch": 0.21, + "grad_norm": 2.0658314921943264, + "learning_rate": 9.150910912041232e-06, + "loss": 0.3847, + "step": 7330 + }, + { + "epoch": 0.21, + "grad_norm": 2.8749082647795174, + "learning_rate": 9.150649035377743e-06, + "loss": 0.3851, + "step": 7331 + }, + { + "epoch": 0.21, + "grad_norm": 1.7601214376267886, + "learning_rate": 9.150387122084735e-06, + "loss": 0.4417, + "step": 7332 + }, + { + "epoch": 0.21, + "grad_norm": 1.3959980594886285, + "learning_rate": 9.150125172164511e-06, + "loss": 0.3825, + "step": 7333 + }, + { + "epoch": 0.21, + "grad_norm": 1.673257275621625, + "learning_rate": 9.149863185619387e-06, + "loss": 0.3855, + "step": 7334 + }, + { + "epoch": 0.21, + "grad_norm": 1.7541660135125403, + "learning_rate": 9.149601162451675e-06, + "loss": 0.3455, + "step": 7335 + }, + { + "epoch": 0.21, + "grad_norm": 1.56037044510562, + "learning_rate": 9.149339102663687e-06, + "loss": 0.348, + "step": 7336 + }, + { + "epoch": 0.21, + "grad_norm": 1.723412482184738, + "learning_rate": 9.149077006257734e-06, + "loss": 0.3651, + "step": 7337 + }, + { + "epoch": 0.21, + "grad_norm": 1.5999206295397157, + "learning_rate": 9.148814873236133e-06, + "loss": 0.3604, + "step": 7338 + }, + { + "epoch": 0.21, + "grad_norm": 2.19006182684726, + "learning_rate": 9.148552703601192e-06, + "loss": 0.3938, + "step": 7339 + }, + { + "epoch": 0.21, + "grad_norm": 1.3638698551465513, + "learning_rate": 9.148290497355229e-06, + "loss": 0.3844, + "step": 7340 + }, + { + "epoch": 0.21, + "grad_norm": 1.7064843104648835, + "learning_rate": 9.148028254500556e-06, + "loss": 0.3753, + "step": 7341 + }, + { + "epoch": 0.21, + "grad_norm": 1.6723899447522597, + "learning_rate": 9.147765975039487e-06, + "loss": 0.3943, + "step": 7342 + }, + { + "epoch": 0.21, + "grad_norm": 1.6491381900657691, + "learning_rate": 9.147503658974338e-06, + "loss": 0.3666, + "step": 7343 + }, + { + "epoch": 0.21, + "grad_norm": 1.4029798521438606, + "learning_rate": 9.147241306307424e-06, + "loss": 0.3255, + "step": 7344 + }, + { + "epoch": 0.21, + "grad_norm": 1.6465737473860504, + "learning_rate": 9.146978917041058e-06, + "loss": 0.3869, + "step": 7345 + }, + { + "epoch": 0.21, + "grad_norm": 1.5640761045673957, + "learning_rate": 9.146716491177557e-06, + "loss": 0.408, + "step": 7346 + }, + { + "epoch": 0.21, + "grad_norm": 1.499135744206675, + "learning_rate": 9.146454028719239e-06, + "loss": 0.351, + "step": 7347 + }, + { + "epoch": 0.21, + "grad_norm": 1.0558953895896965, + "learning_rate": 9.146191529668416e-06, + "loss": 0.6226, + "step": 7348 + }, + { + "epoch": 0.21, + "grad_norm": 2.6363444089580184, + "learning_rate": 9.145928994027407e-06, + "loss": 0.3697, + "step": 7349 + }, + { + "epoch": 0.21, + "grad_norm": 0.9660102284865393, + "learning_rate": 9.14566642179853e-06, + "loss": 0.5897, + "step": 7350 + }, + { + "epoch": 0.21, + "grad_norm": 1.4575715038902282, + "learning_rate": 9.1454038129841e-06, + "loss": 0.3968, + "step": 7351 + }, + { + "epoch": 0.21, + "grad_norm": 1.5632314864160546, + "learning_rate": 9.145141167586433e-06, + "loss": 0.3703, + "step": 7352 + }, + { + "epoch": 0.21, + "grad_norm": 1.7375443691071608, + "learning_rate": 9.144878485607854e-06, + "loss": 0.3899, + "step": 7353 + }, + { + "epoch": 0.21, + "grad_norm": 1.5374574730005774, + "learning_rate": 9.144615767050674e-06, + "loss": 0.3501, + "step": 7354 + }, + { + "epoch": 0.21, + "grad_norm": 1.5481125101274407, + "learning_rate": 9.144353011917213e-06, + "loss": 0.3594, + "step": 7355 + }, + { + "epoch": 0.21, + "grad_norm": 1.515877303037089, + "learning_rate": 9.144090220209791e-06, + "loss": 0.3967, + "step": 7356 + }, + { + "epoch": 0.21, + "grad_norm": 1.705934973842806, + "learning_rate": 9.143827391930726e-06, + "loss": 0.3744, + "step": 7357 + }, + { + "epoch": 0.21, + "grad_norm": 1.549483774630377, + "learning_rate": 9.14356452708234e-06, + "loss": 0.3908, + "step": 7358 + }, + { + "epoch": 0.21, + "grad_norm": 1.685081296835253, + "learning_rate": 9.143301625666952e-06, + "loss": 0.3524, + "step": 7359 + }, + { + "epoch": 0.21, + "grad_norm": 1.6571538043570098, + "learning_rate": 9.143038687686877e-06, + "loss": 0.3724, + "step": 7360 + }, + { + "epoch": 0.21, + "grad_norm": 1.4586473303243077, + "learning_rate": 9.142775713144443e-06, + "loss": 0.3898, + "step": 7361 + }, + { + "epoch": 0.21, + "grad_norm": 1.6213100401379887, + "learning_rate": 9.142512702041968e-06, + "loss": 0.3596, + "step": 7362 + }, + { + "epoch": 0.21, + "grad_norm": 1.4961764847321743, + "learning_rate": 9.142249654381772e-06, + "loss": 0.3739, + "step": 7363 + }, + { + "epoch": 0.21, + "grad_norm": 4.179313495080194, + "learning_rate": 9.141986570166176e-06, + "loss": 0.3629, + "step": 7364 + }, + { + "epoch": 0.21, + "grad_norm": 1.8013455714586817, + "learning_rate": 9.141723449397501e-06, + "loss": 0.3584, + "step": 7365 + }, + { + "epoch": 0.21, + "grad_norm": 1.6059896429304126, + "learning_rate": 9.141460292078074e-06, + "loss": 0.345, + "step": 7366 + }, + { + "epoch": 0.21, + "grad_norm": 2.60830721345826, + "learning_rate": 9.14119709821021e-06, + "loss": 0.3589, + "step": 7367 + }, + { + "epoch": 0.21, + "grad_norm": 1.7086145488023852, + "learning_rate": 9.14093386779624e-06, + "loss": 0.36, + "step": 7368 + }, + { + "epoch": 0.21, + "grad_norm": 1.4940586079212237, + "learning_rate": 9.140670600838481e-06, + "loss": 0.3517, + "step": 7369 + }, + { + "epoch": 0.21, + "grad_norm": 1.5872750145559813, + "learning_rate": 9.140407297339258e-06, + "loss": 0.3901, + "step": 7370 + }, + { + "epoch": 0.21, + "grad_norm": 1.61083773510317, + "learning_rate": 9.140143957300892e-06, + "loss": 0.3822, + "step": 7371 + }, + { + "epoch": 0.21, + "grad_norm": 1.4719770571887707, + "learning_rate": 9.139880580725713e-06, + "loss": 0.4134, + "step": 7372 + }, + { + "epoch": 0.21, + "grad_norm": 1.464975508641224, + "learning_rate": 9.139617167616041e-06, + "loss": 0.3403, + "step": 7373 + }, + { + "epoch": 0.21, + "grad_norm": 1.5010248015014374, + "learning_rate": 9.139353717974202e-06, + "loss": 0.3756, + "step": 7374 + }, + { + "epoch": 0.21, + "grad_norm": 1.6543410916849735, + "learning_rate": 9.13909023180252e-06, + "loss": 0.3955, + "step": 7375 + }, + { + "epoch": 0.21, + "grad_norm": 1.3701030774423202, + "learning_rate": 9.138826709103321e-06, + "loss": 0.3844, + "step": 7376 + }, + { + "epoch": 0.21, + "grad_norm": 2.000746951072411, + "learning_rate": 9.138563149878929e-06, + "loss": 0.3484, + "step": 7377 + }, + { + "epoch": 0.21, + "grad_norm": 1.3756213896717249, + "learning_rate": 9.138299554131674e-06, + "loss": 0.3497, + "step": 7378 + }, + { + "epoch": 0.21, + "grad_norm": 1.547159226595432, + "learning_rate": 9.138035921863877e-06, + "loss": 0.3744, + "step": 7379 + }, + { + "epoch": 0.21, + "grad_norm": 1.821026118367407, + "learning_rate": 9.137772253077869e-06, + "loss": 0.3825, + "step": 7380 + }, + { + "epoch": 0.21, + "grad_norm": 1.9245353613432636, + "learning_rate": 9.137508547775974e-06, + "loss": 0.3571, + "step": 7381 + }, + { + "epoch": 0.21, + "grad_norm": 1.5955851886472359, + "learning_rate": 9.13724480596052e-06, + "loss": 0.3466, + "step": 7382 + }, + { + "epoch": 0.21, + "grad_norm": 2.136692423788791, + "learning_rate": 9.136981027633834e-06, + "loss": 0.395, + "step": 7383 + }, + { + "epoch": 0.21, + "grad_norm": 1.4553952168502204, + "learning_rate": 9.136717212798246e-06, + "loss": 0.3445, + "step": 7384 + }, + { + "epoch": 0.21, + "grad_norm": 1.4886996202538925, + "learning_rate": 9.136453361456082e-06, + "loss": 0.3817, + "step": 7385 + }, + { + "epoch": 0.21, + "grad_norm": 1.8011031873877879, + "learning_rate": 9.13618947360967e-06, + "loss": 0.3767, + "step": 7386 + }, + { + "epoch": 0.21, + "grad_norm": 1.6328977473484194, + "learning_rate": 9.135925549261344e-06, + "loss": 0.372, + "step": 7387 + }, + { + "epoch": 0.21, + "grad_norm": 1.406640211946076, + "learning_rate": 9.135661588413425e-06, + "loss": 0.3631, + "step": 7388 + }, + { + "epoch": 0.21, + "grad_norm": 1.5011650812719861, + "learning_rate": 9.13539759106825e-06, + "loss": 0.3619, + "step": 7389 + }, + { + "epoch": 0.21, + "grad_norm": 1.4195409475380607, + "learning_rate": 9.135133557228141e-06, + "loss": 0.3422, + "step": 7390 + }, + { + "epoch": 0.21, + "grad_norm": 1.7988786571544046, + "learning_rate": 9.134869486895436e-06, + "loss": 0.3659, + "step": 7391 + }, + { + "epoch": 0.21, + "grad_norm": 1.5731081316445994, + "learning_rate": 9.134605380072461e-06, + "loss": 0.3759, + "step": 7392 + }, + { + "epoch": 0.21, + "grad_norm": 1.6844071806241, + "learning_rate": 9.134341236761547e-06, + "loss": 0.3481, + "step": 7393 + }, + { + "epoch": 0.21, + "grad_norm": 1.5685638550264613, + "learning_rate": 9.134077056965027e-06, + "loss": 0.3431, + "step": 7394 + }, + { + "epoch": 0.21, + "grad_norm": 2.0543523531183063, + "learning_rate": 9.13381284068523e-06, + "loss": 0.3691, + "step": 7395 + }, + { + "epoch": 0.21, + "grad_norm": 1.518689438714447, + "learning_rate": 9.133548587924489e-06, + "loss": 0.3659, + "step": 7396 + }, + { + "epoch": 0.21, + "grad_norm": 2.384317643045803, + "learning_rate": 9.133284298685136e-06, + "loss": 0.3892, + "step": 7397 + }, + { + "epoch": 0.21, + "grad_norm": 2.113773613909411, + "learning_rate": 9.133019972969503e-06, + "loss": 0.3941, + "step": 7398 + }, + { + "epoch": 0.21, + "grad_norm": 2.819846868178921, + "learning_rate": 9.132755610779924e-06, + "loss": 0.3605, + "step": 7399 + }, + { + "epoch": 0.21, + "grad_norm": 1.3817546337118933, + "learning_rate": 9.132491212118729e-06, + "loss": 0.3582, + "step": 7400 + }, + { + "epoch": 0.21, + "grad_norm": 2.2570569725085887, + "learning_rate": 9.132226776988256e-06, + "loss": 0.3646, + "step": 7401 + }, + { + "epoch": 0.21, + "grad_norm": 1.4146080002824988, + "learning_rate": 9.131962305390832e-06, + "loss": 0.3536, + "step": 7402 + }, + { + "epoch": 0.21, + "grad_norm": 1.4876965883343003, + "learning_rate": 9.131697797328798e-06, + "loss": 0.3806, + "step": 7403 + }, + { + "epoch": 0.21, + "grad_norm": 1.3565163696042, + "learning_rate": 9.131433252804483e-06, + "loss": 0.3534, + "step": 7404 + }, + { + "epoch": 0.21, + "grad_norm": 2.8833404591061527, + "learning_rate": 9.131168671820226e-06, + "loss": 0.3716, + "step": 7405 + }, + { + "epoch": 0.21, + "grad_norm": 1.4373942255733345, + "learning_rate": 9.130904054378358e-06, + "loss": 0.3675, + "step": 7406 + }, + { + "epoch": 0.21, + "grad_norm": 1.5971506388203518, + "learning_rate": 9.130639400481216e-06, + "loss": 0.3776, + "step": 7407 + }, + { + "epoch": 0.21, + "grad_norm": 1.53769454370576, + "learning_rate": 9.130374710131136e-06, + "loss": 0.3474, + "step": 7408 + }, + { + "epoch": 0.21, + "grad_norm": 1.4939877374351058, + "learning_rate": 9.130109983330452e-06, + "loss": 0.6937, + "step": 7409 + }, + { + "epoch": 0.21, + "grad_norm": 1.5705364338884873, + "learning_rate": 9.129845220081503e-06, + "loss": 0.3735, + "step": 7410 + }, + { + "epoch": 0.21, + "grad_norm": 1.1506857446088457, + "learning_rate": 9.129580420386624e-06, + "loss": 0.5445, + "step": 7411 + }, + { + "epoch": 0.21, + "grad_norm": 1.5588851221753783, + "learning_rate": 9.129315584248153e-06, + "loss": 0.3646, + "step": 7412 + }, + { + "epoch": 0.22, + "grad_norm": 1.7864512729882096, + "learning_rate": 9.129050711668424e-06, + "loss": 0.3683, + "step": 7413 + }, + { + "epoch": 0.22, + "grad_norm": 1.475612430003913, + "learning_rate": 9.128785802649778e-06, + "loss": 0.352, + "step": 7414 + }, + { + "epoch": 0.22, + "grad_norm": 2.513261506723538, + "learning_rate": 9.128520857194551e-06, + "loss": 0.3827, + "step": 7415 + }, + { + "epoch": 0.22, + "grad_norm": 1.4228868873147515, + "learning_rate": 9.128255875305081e-06, + "loss": 0.3519, + "step": 7416 + }, + { + "epoch": 0.22, + "grad_norm": 1.7767715322327042, + "learning_rate": 9.127990856983708e-06, + "loss": 0.3622, + "step": 7417 + }, + { + "epoch": 0.22, + "grad_norm": 1.7581717730800144, + "learning_rate": 9.127725802232769e-06, + "loss": 0.3982, + "step": 7418 + }, + { + "epoch": 0.22, + "grad_norm": 1.9789720725572633, + "learning_rate": 9.127460711054605e-06, + "loss": 0.3999, + "step": 7419 + }, + { + "epoch": 0.22, + "grad_norm": 1.4035958135044269, + "learning_rate": 9.127195583451555e-06, + "loss": 0.3856, + "step": 7420 + }, + { + "epoch": 0.22, + "grad_norm": 1.50516395318059, + "learning_rate": 9.126930419425956e-06, + "loss": 0.365, + "step": 7421 + }, + { + "epoch": 0.22, + "grad_norm": 1.7382534522751092, + "learning_rate": 9.126665218980153e-06, + "loss": 0.3585, + "step": 7422 + }, + { + "epoch": 0.22, + "grad_norm": 1.4290425169590997, + "learning_rate": 9.126399982116483e-06, + "loss": 0.3801, + "step": 7423 + }, + { + "epoch": 0.22, + "grad_norm": 1.7323465101792523, + "learning_rate": 9.126134708837286e-06, + "loss": 0.3767, + "step": 7424 + }, + { + "epoch": 0.22, + "grad_norm": 1.5998679799917286, + "learning_rate": 9.125869399144905e-06, + "loss": 0.3652, + "step": 7425 + }, + { + "epoch": 0.22, + "grad_norm": 1.5175172402501835, + "learning_rate": 9.125604053041681e-06, + "loss": 0.3785, + "step": 7426 + }, + { + "epoch": 0.22, + "grad_norm": 1.624502359372475, + "learning_rate": 9.125338670529956e-06, + "loss": 0.395, + "step": 7427 + }, + { + "epoch": 0.22, + "grad_norm": 1.5095692328310466, + "learning_rate": 9.12507325161207e-06, + "loss": 0.3624, + "step": 7428 + }, + { + "epoch": 0.22, + "grad_norm": 1.4158337271845516, + "learning_rate": 9.124807796290366e-06, + "loss": 0.3698, + "step": 7429 + }, + { + "epoch": 0.22, + "grad_norm": 2.6837541183615827, + "learning_rate": 9.12454230456719e-06, + "loss": 0.4151, + "step": 7430 + }, + { + "epoch": 0.22, + "grad_norm": 1.7238031394082198, + "learning_rate": 9.124276776444882e-06, + "loss": 0.3671, + "step": 7431 + }, + { + "epoch": 0.22, + "grad_norm": 2.237643194365962, + "learning_rate": 9.124011211925784e-06, + "loss": 0.4131, + "step": 7432 + }, + { + "epoch": 0.22, + "grad_norm": 1.9133708816180102, + "learning_rate": 9.123745611012243e-06, + "loss": 0.3883, + "step": 7433 + }, + { + "epoch": 0.22, + "grad_norm": 1.6423381290080292, + "learning_rate": 9.123479973706602e-06, + "loss": 0.3744, + "step": 7434 + }, + { + "epoch": 0.22, + "grad_norm": 1.686737428052629, + "learning_rate": 9.123214300011202e-06, + "loss": 0.361, + "step": 7435 + }, + { + "epoch": 0.22, + "grad_norm": 1.7481060478696342, + "learning_rate": 9.122948589928392e-06, + "loss": 0.4091, + "step": 7436 + }, + { + "epoch": 0.22, + "grad_norm": 1.5729817274803795, + "learning_rate": 9.122682843460514e-06, + "loss": 0.3852, + "step": 7437 + }, + { + "epoch": 0.22, + "grad_norm": 1.525222111213405, + "learning_rate": 9.122417060609914e-06, + "loss": 0.3569, + "step": 7438 + }, + { + "epoch": 0.22, + "grad_norm": 2.1233608001906528, + "learning_rate": 9.122151241378938e-06, + "loss": 0.3818, + "step": 7439 + }, + { + "epoch": 0.22, + "grad_norm": 1.9748475905238847, + "learning_rate": 9.121885385769931e-06, + "loss": 0.3544, + "step": 7440 + }, + { + "epoch": 0.22, + "grad_norm": 1.6812317181751495, + "learning_rate": 9.12161949378524e-06, + "loss": 0.3731, + "step": 7441 + }, + { + "epoch": 0.22, + "grad_norm": 1.666998075862649, + "learning_rate": 9.121353565427213e-06, + "loss": 0.3945, + "step": 7442 + }, + { + "epoch": 0.22, + "grad_norm": 1.7706629443588409, + "learning_rate": 9.121087600698193e-06, + "loss": 0.3595, + "step": 7443 + }, + { + "epoch": 0.22, + "grad_norm": 1.819762554138382, + "learning_rate": 9.12082159960053e-06, + "loss": 0.3936, + "step": 7444 + }, + { + "epoch": 0.22, + "grad_norm": 1.6535394694252534, + "learning_rate": 9.12055556213657e-06, + "loss": 0.3614, + "step": 7445 + }, + { + "epoch": 0.22, + "grad_norm": 1.4776073333071214, + "learning_rate": 9.120289488308663e-06, + "loss": 0.3679, + "step": 7446 + }, + { + "epoch": 0.22, + "grad_norm": 1.411941886573366, + "learning_rate": 9.120023378119155e-06, + "loss": 0.3844, + "step": 7447 + }, + { + "epoch": 0.22, + "grad_norm": 1.5039077350059007, + "learning_rate": 9.119757231570394e-06, + "loss": 0.3724, + "step": 7448 + }, + { + "epoch": 0.22, + "grad_norm": 1.7733757549643185, + "learning_rate": 9.11949104866473e-06, + "loss": 0.3625, + "step": 7449 + }, + { + "epoch": 0.22, + "grad_norm": 1.6762787167767959, + "learning_rate": 9.119224829404512e-06, + "loss": 0.3539, + "step": 7450 + }, + { + "epoch": 0.22, + "grad_norm": 1.847902106978823, + "learning_rate": 9.118958573792089e-06, + "loss": 0.3937, + "step": 7451 + }, + { + "epoch": 0.22, + "grad_norm": 1.4788474104715226, + "learning_rate": 9.118692281829813e-06, + "loss": 0.3982, + "step": 7452 + }, + { + "epoch": 0.22, + "grad_norm": 1.6344272613289166, + "learning_rate": 9.11842595352003e-06, + "loss": 0.3692, + "step": 7453 + }, + { + "epoch": 0.22, + "grad_norm": 1.4937264235587446, + "learning_rate": 9.118159588865092e-06, + "loss": 0.3914, + "step": 7454 + }, + { + "epoch": 0.22, + "grad_norm": 1.678291446220876, + "learning_rate": 9.117893187867351e-06, + "loss": 0.3651, + "step": 7455 + }, + { + "epoch": 0.22, + "grad_norm": 4.350598207853003, + "learning_rate": 9.117626750529155e-06, + "loss": 0.3625, + "step": 7456 + }, + { + "epoch": 0.22, + "grad_norm": 1.4668268782125216, + "learning_rate": 9.117360276852858e-06, + "loss": 0.3583, + "step": 7457 + }, + { + "epoch": 0.22, + "grad_norm": 1.5083020586319447, + "learning_rate": 9.117093766840811e-06, + "loss": 0.3772, + "step": 7458 + }, + { + "epoch": 0.22, + "grad_norm": 1.5794353523595428, + "learning_rate": 9.116827220495367e-06, + "loss": 0.3972, + "step": 7459 + }, + { + "epoch": 0.22, + "grad_norm": 1.386267988665228, + "learning_rate": 9.116560637818876e-06, + "loss": 0.3634, + "step": 7460 + }, + { + "epoch": 0.22, + "grad_norm": 1.5088644652762495, + "learning_rate": 9.116294018813692e-06, + "loss": 0.3535, + "step": 7461 + }, + { + "epoch": 0.22, + "grad_norm": 1.816004272577383, + "learning_rate": 9.11602736348217e-06, + "loss": 0.3592, + "step": 7462 + }, + { + "epoch": 0.22, + "grad_norm": 1.7538384852220337, + "learning_rate": 9.115760671826656e-06, + "loss": 0.3598, + "step": 7463 + }, + { + "epoch": 0.22, + "grad_norm": 1.8353495169486354, + "learning_rate": 9.115493943849511e-06, + "loss": 0.3553, + "step": 7464 + }, + { + "epoch": 0.22, + "grad_norm": 1.5401960098112484, + "learning_rate": 9.115227179553086e-06, + "loss": 0.3751, + "step": 7465 + }, + { + "epoch": 0.22, + "grad_norm": 1.7821167150370976, + "learning_rate": 9.114960378939737e-06, + "loss": 0.3799, + "step": 7466 + }, + { + "epoch": 0.22, + "grad_norm": 1.5441045701703573, + "learning_rate": 9.114693542011816e-06, + "loss": 0.357, + "step": 7467 + }, + { + "epoch": 0.22, + "grad_norm": 1.7205027075725645, + "learning_rate": 9.114426668771677e-06, + "loss": 0.398, + "step": 7468 + }, + { + "epoch": 0.22, + "grad_norm": 1.5422641087748585, + "learning_rate": 9.11415975922168e-06, + "loss": 0.6644, + "step": 7469 + }, + { + "epoch": 0.22, + "grad_norm": 1.9348080462628388, + "learning_rate": 9.113892813364176e-06, + "loss": 0.3532, + "step": 7470 + }, + { + "epoch": 0.22, + "grad_norm": 1.5029099078528732, + "learning_rate": 9.11362583120152e-06, + "loss": 0.3727, + "step": 7471 + }, + { + "epoch": 0.22, + "grad_norm": 1.1192548452139357, + "learning_rate": 9.113358812736073e-06, + "loss": 0.6162, + "step": 7472 + }, + { + "epoch": 0.22, + "grad_norm": 1.6675281343894286, + "learning_rate": 9.113091757970189e-06, + "loss": 0.3813, + "step": 7473 + }, + { + "epoch": 0.22, + "grad_norm": 1.388445867200065, + "learning_rate": 9.112824666906225e-06, + "loss": 0.3855, + "step": 7474 + }, + { + "epoch": 0.22, + "grad_norm": 1.443402676222287, + "learning_rate": 9.112557539546535e-06, + "loss": 0.3547, + "step": 7475 + }, + { + "epoch": 0.22, + "grad_norm": 0.9933846332432641, + "learning_rate": 9.112290375893483e-06, + "loss": 0.6567, + "step": 7476 + }, + { + "epoch": 0.22, + "grad_norm": 1.4056717889805683, + "learning_rate": 9.11202317594942e-06, + "loss": 0.3651, + "step": 7477 + }, + { + "epoch": 0.22, + "grad_norm": 1.5207500620615408, + "learning_rate": 9.111755939716709e-06, + "loss": 0.3953, + "step": 7478 + }, + { + "epoch": 0.22, + "grad_norm": 1.497476309734435, + "learning_rate": 9.111488667197703e-06, + "loss": 0.3641, + "step": 7479 + }, + { + "epoch": 0.22, + "grad_norm": 1.532529109286481, + "learning_rate": 9.111221358394766e-06, + "loss": 0.3409, + "step": 7480 + }, + { + "epoch": 0.22, + "grad_norm": 1.7987111450916806, + "learning_rate": 9.110954013310256e-06, + "loss": 0.4028, + "step": 7481 + }, + { + "epoch": 0.22, + "grad_norm": 1.505002668140012, + "learning_rate": 9.110686631946529e-06, + "loss": 0.3846, + "step": 7482 + }, + { + "epoch": 0.22, + "grad_norm": 1.3845534494103648, + "learning_rate": 9.110419214305948e-06, + "loss": 0.3596, + "step": 7483 + }, + { + "epoch": 0.22, + "grad_norm": 1.4032381977081319, + "learning_rate": 9.110151760390871e-06, + "loss": 0.3655, + "step": 7484 + }, + { + "epoch": 0.22, + "grad_norm": 1.7081561742866243, + "learning_rate": 9.10988427020366e-06, + "loss": 0.3596, + "step": 7485 + }, + { + "epoch": 0.22, + "grad_norm": 1.6841604518287698, + "learning_rate": 9.109616743746675e-06, + "loss": 0.3659, + "step": 7486 + }, + { + "epoch": 0.22, + "grad_norm": 1.4637953253738643, + "learning_rate": 9.109349181022276e-06, + "loss": 0.3586, + "step": 7487 + }, + { + "epoch": 0.22, + "grad_norm": 1.3988972204659014, + "learning_rate": 9.109081582032823e-06, + "loss": 0.3613, + "step": 7488 + }, + { + "epoch": 0.22, + "grad_norm": 1.1923381088974387, + "learning_rate": 9.108813946780684e-06, + "loss": 0.6245, + "step": 7489 + }, + { + "epoch": 0.22, + "grad_norm": 1.442349993821519, + "learning_rate": 9.108546275268213e-06, + "loss": 0.37, + "step": 7490 + }, + { + "epoch": 0.22, + "grad_norm": 1.975585488812905, + "learning_rate": 9.108278567497775e-06, + "loss": 0.3579, + "step": 7491 + }, + { + "epoch": 0.22, + "grad_norm": 1.5747835512927337, + "learning_rate": 9.108010823471735e-06, + "loss": 0.3545, + "step": 7492 + }, + { + "epoch": 0.22, + "grad_norm": 1.0707532299798181, + "learning_rate": 9.107743043192455e-06, + "loss": 0.6185, + "step": 7493 + }, + { + "epoch": 0.22, + "grad_norm": 1.4847142428532492, + "learning_rate": 9.107475226662295e-06, + "loss": 0.3711, + "step": 7494 + }, + { + "epoch": 0.22, + "grad_norm": 2.1318195185225552, + "learning_rate": 9.10720737388362e-06, + "loss": 0.401, + "step": 7495 + }, + { + "epoch": 0.22, + "grad_norm": 1.3170290465803478, + "learning_rate": 9.106939484858794e-06, + "loss": 0.3678, + "step": 7496 + }, + { + "epoch": 0.22, + "grad_norm": 1.4654514269038164, + "learning_rate": 9.106671559590183e-06, + "loss": 0.3831, + "step": 7497 + }, + { + "epoch": 0.22, + "grad_norm": 1.3304071713849914, + "learning_rate": 9.10640359808015e-06, + "loss": 0.3695, + "step": 7498 + }, + { + "epoch": 0.22, + "grad_norm": 1.3568385156179577, + "learning_rate": 9.106135600331059e-06, + "loss": 0.3483, + "step": 7499 + }, + { + "epoch": 0.22, + "grad_norm": 1.337654174817152, + "learning_rate": 9.105867566345277e-06, + "loss": 0.3551, + "step": 7500 + }, + { + "epoch": 0.22, + "grad_norm": 1.5052957635232755, + "learning_rate": 9.105599496125168e-06, + "loss": 0.3642, + "step": 7501 + }, + { + "epoch": 0.22, + "grad_norm": 1.393810583125049, + "learning_rate": 9.105331389673097e-06, + "loss": 0.3619, + "step": 7502 + }, + { + "epoch": 0.22, + "grad_norm": 1.4001637068122352, + "learning_rate": 9.10506324699143e-06, + "loss": 0.3548, + "step": 7503 + }, + { + "epoch": 0.22, + "grad_norm": 1.3878103930168382, + "learning_rate": 9.104795068082537e-06, + "loss": 0.3458, + "step": 7504 + }, + { + "epoch": 0.22, + "grad_norm": 1.4601529790622114, + "learning_rate": 9.10452685294878e-06, + "loss": 0.3747, + "step": 7505 + }, + { + "epoch": 0.22, + "grad_norm": 2.297400829227269, + "learning_rate": 9.104258601592529e-06, + "loss": 0.346, + "step": 7506 + }, + { + "epoch": 0.22, + "grad_norm": 2.2610900015114743, + "learning_rate": 9.10399031401615e-06, + "loss": 0.3739, + "step": 7507 + }, + { + "epoch": 0.22, + "grad_norm": 1.5867317229844173, + "learning_rate": 9.10372199022201e-06, + "loss": 0.3657, + "step": 7508 + }, + { + "epoch": 0.22, + "grad_norm": 1.8356479471050757, + "learning_rate": 9.103453630212477e-06, + "loss": 0.3682, + "step": 7509 + }, + { + "epoch": 0.22, + "grad_norm": 1.6413447205361935, + "learning_rate": 9.103185233989923e-06, + "loss": 0.3927, + "step": 7510 + }, + { + "epoch": 0.22, + "grad_norm": 1.5235575291825372, + "learning_rate": 9.102916801556713e-06, + "loss": 0.3513, + "step": 7511 + }, + { + "epoch": 0.22, + "grad_norm": 1.4577910796450828, + "learning_rate": 9.102648332915215e-06, + "loss": 0.3629, + "step": 7512 + }, + { + "epoch": 0.22, + "grad_norm": 1.362490469072025, + "learning_rate": 9.102379828067802e-06, + "loss": 0.3955, + "step": 7513 + }, + { + "epoch": 0.22, + "grad_norm": 1.7913367809658778, + "learning_rate": 9.102111287016842e-06, + "loss": 0.3635, + "step": 7514 + }, + { + "epoch": 0.22, + "grad_norm": 1.4896852297033756, + "learning_rate": 9.101842709764702e-06, + "loss": 0.3638, + "step": 7515 + }, + { + "epoch": 0.22, + "grad_norm": 3.1182307625272805, + "learning_rate": 9.101574096313755e-06, + "loss": 0.3727, + "step": 7516 + }, + { + "epoch": 0.22, + "grad_norm": 1.3662288232539155, + "learning_rate": 9.101305446666373e-06, + "loss": 0.3335, + "step": 7517 + }, + { + "epoch": 0.22, + "grad_norm": 1.4501436192710107, + "learning_rate": 9.101036760824924e-06, + "loss": 0.3667, + "step": 7518 + }, + { + "epoch": 0.22, + "grad_norm": 1.382322634556521, + "learning_rate": 9.10076803879178e-06, + "loss": 0.3674, + "step": 7519 + }, + { + "epoch": 0.22, + "grad_norm": 9.178435793158998, + "learning_rate": 9.100499280569312e-06, + "loss": 0.3734, + "step": 7520 + }, + { + "epoch": 0.22, + "grad_norm": 1.6365631332239072, + "learning_rate": 9.100230486159893e-06, + "loss": 0.3782, + "step": 7521 + }, + { + "epoch": 0.22, + "grad_norm": 1.5928998451072318, + "learning_rate": 9.099961655565894e-06, + "loss": 0.3765, + "step": 7522 + }, + { + "epoch": 0.22, + "grad_norm": 1.3709877975240845, + "learning_rate": 9.099692788789688e-06, + "loss": 0.4039, + "step": 7523 + }, + { + "epoch": 0.22, + "grad_norm": 1.6822743871093093, + "learning_rate": 9.099423885833649e-06, + "loss": 0.3762, + "step": 7524 + }, + { + "epoch": 0.22, + "grad_norm": 1.4663600229101417, + "learning_rate": 9.099154946700146e-06, + "loss": 0.3903, + "step": 7525 + }, + { + "epoch": 0.22, + "grad_norm": 1.8779675684041803, + "learning_rate": 9.098885971391557e-06, + "loss": 0.4215, + "step": 7526 + }, + { + "epoch": 0.22, + "grad_norm": 1.5495378444187407, + "learning_rate": 9.098616959910254e-06, + "loss": 0.3861, + "step": 7527 + }, + { + "epoch": 0.22, + "grad_norm": 1.4197830529750284, + "learning_rate": 9.09834791225861e-06, + "loss": 0.375, + "step": 7528 + }, + { + "epoch": 0.22, + "grad_norm": 1.5962821567550463, + "learning_rate": 9.098078828439e-06, + "loss": 0.4057, + "step": 7529 + }, + { + "epoch": 0.22, + "grad_norm": 1.6070908044350256, + "learning_rate": 9.0978097084538e-06, + "loss": 0.3737, + "step": 7530 + }, + { + "epoch": 0.22, + "grad_norm": 1.4132740435367492, + "learning_rate": 9.097540552305384e-06, + "loss": 0.3649, + "step": 7531 + }, + { + "epoch": 0.22, + "grad_norm": 2.613857847876138, + "learning_rate": 9.097271359996125e-06, + "loss": 0.3922, + "step": 7532 + }, + { + "epoch": 0.22, + "grad_norm": 1.4973891496209568, + "learning_rate": 9.097002131528403e-06, + "loss": 0.3774, + "step": 7533 + }, + { + "epoch": 0.22, + "grad_norm": 1.4692266086327275, + "learning_rate": 9.096732866904592e-06, + "loss": 0.3615, + "step": 7534 + }, + { + "epoch": 0.22, + "grad_norm": 1.6852194520872341, + "learning_rate": 9.096463566127068e-06, + "loss": 0.3922, + "step": 7535 + }, + { + "epoch": 0.22, + "grad_norm": 1.7416401659597702, + "learning_rate": 9.096194229198206e-06, + "loss": 0.3978, + "step": 7536 + }, + { + "epoch": 0.22, + "grad_norm": 1.6004276746176107, + "learning_rate": 9.095924856120384e-06, + "loss": 0.393, + "step": 7537 + }, + { + "epoch": 0.22, + "grad_norm": 1.5214032803091477, + "learning_rate": 9.09565544689598e-06, + "loss": 0.3864, + "step": 7538 + }, + { + "epoch": 0.22, + "grad_norm": 1.6450098246208449, + "learning_rate": 9.095386001527373e-06, + "loss": 0.357, + "step": 7539 + }, + { + "epoch": 0.22, + "grad_norm": 1.4902304681410572, + "learning_rate": 9.095116520016937e-06, + "loss": 0.3802, + "step": 7540 + }, + { + "epoch": 0.22, + "grad_norm": 1.7610119573091947, + "learning_rate": 9.094847002367056e-06, + "loss": 0.3881, + "step": 7541 + }, + { + "epoch": 0.22, + "grad_norm": 1.4407953181033715, + "learning_rate": 9.094577448580102e-06, + "loss": 0.3536, + "step": 7542 + }, + { + "epoch": 0.22, + "grad_norm": 1.3690068162934745, + "learning_rate": 9.094307858658455e-06, + "loss": 0.3614, + "step": 7543 + }, + { + "epoch": 0.22, + "grad_norm": 1.3718394061348176, + "learning_rate": 9.094038232604499e-06, + "loss": 0.3544, + "step": 7544 + }, + { + "epoch": 0.22, + "grad_norm": 2.6686868790685594, + "learning_rate": 9.09376857042061e-06, + "loss": 0.3588, + "step": 7545 + }, + { + "epoch": 0.22, + "grad_norm": 1.4085179742542944, + "learning_rate": 9.093498872109166e-06, + "loss": 0.3658, + "step": 7546 + }, + { + "epoch": 0.22, + "grad_norm": 1.720922468643232, + "learning_rate": 9.09322913767255e-06, + "loss": 0.3628, + "step": 7547 + }, + { + "epoch": 0.22, + "grad_norm": 1.6608941941515738, + "learning_rate": 9.09295936711314e-06, + "loss": 0.3739, + "step": 7548 + }, + { + "epoch": 0.22, + "grad_norm": 1.4348660179328, + "learning_rate": 9.092689560433319e-06, + "loss": 0.3667, + "step": 7549 + }, + { + "epoch": 0.22, + "grad_norm": 1.7585587846849342, + "learning_rate": 9.092419717635468e-06, + "loss": 0.4452, + "step": 7550 + }, + { + "epoch": 0.22, + "grad_norm": 1.3594730076937669, + "learning_rate": 9.092149838721966e-06, + "loss": 0.3685, + "step": 7551 + }, + { + "epoch": 0.22, + "grad_norm": 1.4629876169929392, + "learning_rate": 9.0918799236952e-06, + "loss": 0.3938, + "step": 7552 + }, + { + "epoch": 0.22, + "grad_norm": 1.4365666804109218, + "learning_rate": 9.091609972557544e-06, + "loss": 0.363, + "step": 7553 + }, + { + "epoch": 0.22, + "grad_norm": 1.4841897172954415, + "learning_rate": 9.091339985311386e-06, + "loss": 0.393, + "step": 7554 + }, + { + "epoch": 0.22, + "grad_norm": 1.4155521278490881, + "learning_rate": 9.091069961959108e-06, + "loss": 0.3595, + "step": 7555 + }, + { + "epoch": 0.22, + "grad_norm": 1.6223630047815831, + "learning_rate": 9.090799902503092e-06, + "loss": 0.3662, + "step": 7556 + }, + { + "epoch": 0.22, + "grad_norm": 1.4037207756089711, + "learning_rate": 9.09052980694572e-06, + "loss": 0.3801, + "step": 7557 + }, + { + "epoch": 0.22, + "grad_norm": 1.451474244670654, + "learning_rate": 9.090259675289378e-06, + "loss": 0.3911, + "step": 7558 + }, + { + "epoch": 0.22, + "grad_norm": 1.4132051604141804, + "learning_rate": 9.089989507536451e-06, + "loss": 0.3674, + "step": 7559 + }, + { + "epoch": 0.22, + "grad_norm": 1.4336945946869126, + "learning_rate": 9.089719303689319e-06, + "loss": 0.3857, + "step": 7560 + }, + { + "epoch": 0.22, + "grad_norm": 1.4411134412673552, + "learning_rate": 9.08944906375037e-06, + "loss": 0.3583, + "step": 7561 + }, + { + "epoch": 0.22, + "grad_norm": 1.3692014435512085, + "learning_rate": 9.089178787721987e-06, + "loss": 0.3671, + "step": 7562 + }, + { + "epoch": 0.22, + "grad_norm": 1.6695616823228892, + "learning_rate": 9.088908475606554e-06, + "loss": 0.3655, + "step": 7563 + }, + { + "epoch": 0.22, + "grad_norm": 1.4126491866116457, + "learning_rate": 9.088638127406461e-06, + "loss": 0.3567, + "step": 7564 + }, + { + "epoch": 0.22, + "grad_norm": 1.5538670914868142, + "learning_rate": 9.08836774312409e-06, + "loss": 0.3795, + "step": 7565 + }, + { + "epoch": 0.22, + "grad_norm": 1.3969741118560055, + "learning_rate": 9.08809732276183e-06, + "loss": 0.3728, + "step": 7566 + }, + { + "epoch": 0.22, + "grad_norm": 2.8641810371056717, + "learning_rate": 9.087826866322065e-06, + "loss": 0.3726, + "step": 7567 + }, + { + "epoch": 0.22, + "grad_norm": 1.5471421254537825, + "learning_rate": 9.08755637380718e-06, + "loss": 0.3676, + "step": 7568 + }, + { + "epoch": 0.22, + "grad_norm": 1.8676883564245066, + "learning_rate": 9.087285845219567e-06, + "loss": 0.3452, + "step": 7569 + }, + { + "epoch": 0.22, + "grad_norm": 1.6862886364288912, + "learning_rate": 9.08701528056161e-06, + "loss": 0.3569, + "step": 7570 + }, + { + "epoch": 0.22, + "grad_norm": 1.4021972398865323, + "learning_rate": 9.0867446798357e-06, + "loss": 0.3717, + "step": 7571 + }, + { + "epoch": 0.22, + "grad_norm": 1.5156998041811558, + "learning_rate": 9.086474043044221e-06, + "loss": 0.3839, + "step": 7572 + }, + { + "epoch": 0.22, + "grad_norm": 1.3503644552911904, + "learning_rate": 9.086203370189564e-06, + "loss": 0.3625, + "step": 7573 + }, + { + "epoch": 0.22, + "grad_norm": 1.4554350035561645, + "learning_rate": 9.085932661274117e-06, + "loss": 0.3512, + "step": 7574 + }, + { + "epoch": 0.22, + "grad_norm": 1.695342234197587, + "learning_rate": 9.085661916300268e-06, + "loss": 0.3641, + "step": 7575 + }, + { + "epoch": 0.22, + "grad_norm": 1.4906535331176782, + "learning_rate": 9.085391135270408e-06, + "loss": 0.3774, + "step": 7576 + }, + { + "epoch": 0.22, + "grad_norm": 1.6096283136276421, + "learning_rate": 9.085120318186926e-06, + "loss": 0.3734, + "step": 7577 + }, + { + "epoch": 0.22, + "grad_norm": 1.9014272195423099, + "learning_rate": 9.08484946505221e-06, + "loss": 0.3872, + "step": 7578 + }, + { + "epoch": 0.22, + "grad_norm": 2.5509951431911824, + "learning_rate": 9.084578575868655e-06, + "loss": 0.4149, + "step": 7579 + }, + { + "epoch": 0.22, + "grad_norm": 1.7980710323341385, + "learning_rate": 9.084307650638646e-06, + "loss": 0.3687, + "step": 7580 + }, + { + "epoch": 0.22, + "grad_norm": 1.4247222050876562, + "learning_rate": 9.08403668936458e-06, + "loss": 0.3826, + "step": 7581 + }, + { + "epoch": 0.22, + "grad_norm": 1.4425425885852754, + "learning_rate": 9.083765692048842e-06, + "loss": 0.3582, + "step": 7582 + }, + { + "epoch": 0.22, + "grad_norm": 1.572066522697548, + "learning_rate": 9.083494658693827e-06, + "loss": 0.3692, + "step": 7583 + }, + { + "epoch": 0.22, + "grad_norm": 1.4370309488766952, + "learning_rate": 9.083223589301929e-06, + "loss": 0.3683, + "step": 7584 + }, + { + "epoch": 0.22, + "grad_norm": 1.4471077625475475, + "learning_rate": 9.082952483875533e-06, + "loss": 0.3457, + "step": 7585 + }, + { + "epoch": 0.22, + "grad_norm": 2.5895080351748767, + "learning_rate": 9.08268134241704e-06, + "loss": 0.4234, + "step": 7586 + }, + { + "epoch": 0.22, + "grad_norm": 1.53385466508265, + "learning_rate": 9.082410164928837e-06, + "loss": 0.4063, + "step": 7587 + }, + { + "epoch": 0.22, + "grad_norm": 3.0689425294808563, + "learning_rate": 9.08213895141332e-06, + "loss": 0.3693, + "step": 7588 + }, + { + "epoch": 0.22, + "grad_norm": 1.685587095797738, + "learning_rate": 9.08186770187288e-06, + "loss": 0.3872, + "step": 7589 + }, + { + "epoch": 0.22, + "grad_norm": 2.597031150115951, + "learning_rate": 9.081596416309913e-06, + "loss": 0.3729, + "step": 7590 + }, + { + "epoch": 0.22, + "grad_norm": 1.478108219421992, + "learning_rate": 9.081325094726812e-06, + "loss": 0.3852, + "step": 7591 + }, + { + "epoch": 0.22, + "grad_norm": 1.6202349880974498, + "learning_rate": 9.081053737125972e-06, + "loss": 0.3652, + "step": 7592 + }, + { + "epoch": 0.22, + "grad_norm": 1.5219020421885394, + "learning_rate": 9.080782343509787e-06, + "loss": 0.3484, + "step": 7593 + }, + { + "epoch": 0.22, + "grad_norm": 1.3875927788141031, + "learning_rate": 9.080510913880653e-06, + "loss": 0.3656, + "step": 7594 + }, + { + "epoch": 0.22, + "grad_norm": 1.700910845511368, + "learning_rate": 9.080239448240965e-06, + "loss": 0.3634, + "step": 7595 + }, + { + "epoch": 0.22, + "grad_norm": 1.7621421318092663, + "learning_rate": 9.07996794659312e-06, + "loss": 0.3677, + "step": 7596 + }, + { + "epoch": 0.22, + "grad_norm": 1.7565218578385706, + "learning_rate": 9.07969640893951e-06, + "loss": 0.338, + "step": 7597 + }, + { + "epoch": 0.22, + "grad_norm": 1.380560900843436, + "learning_rate": 9.079424835282536e-06, + "loss": 0.3795, + "step": 7598 + }, + { + "epoch": 0.22, + "grad_norm": 1.462870794500457, + "learning_rate": 9.079153225624592e-06, + "loss": 0.3759, + "step": 7599 + }, + { + "epoch": 0.22, + "grad_norm": 1.4914525059566908, + "learning_rate": 9.078881579968075e-06, + "loss": 0.3598, + "step": 7600 + }, + { + "epoch": 0.22, + "grad_norm": 1.0741314683424008, + "learning_rate": 9.078609898315382e-06, + "loss": 0.5846, + "step": 7601 + }, + { + "epoch": 0.22, + "grad_norm": 4.731997285088748, + "learning_rate": 9.078338180668914e-06, + "loss": 0.3681, + "step": 7602 + }, + { + "epoch": 0.22, + "grad_norm": 1.7606662015046703, + "learning_rate": 9.078066427031065e-06, + "loss": 0.3755, + "step": 7603 + }, + { + "epoch": 0.22, + "grad_norm": 1.388599302106608, + "learning_rate": 9.077794637404234e-06, + "loss": 0.3698, + "step": 7604 + }, + { + "epoch": 0.22, + "grad_norm": 1.373761363476609, + "learning_rate": 9.07752281179082e-06, + "loss": 0.3606, + "step": 7605 + }, + { + "epoch": 0.22, + "grad_norm": 1.5491863821308012, + "learning_rate": 9.077250950193222e-06, + "loss": 0.3641, + "step": 7606 + }, + { + "epoch": 0.22, + "grad_norm": 1.9080554319201368, + "learning_rate": 9.07697905261384e-06, + "loss": 0.3561, + "step": 7607 + }, + { + "epoch": 0.22, + "grad_norm": 1.489779839877636, + "learning_rate": 9.076707119055073e-06, + "loss": 0.3726, + "step": 7608 + }, + { + "epoch": 0.22, + "grad_norm": 1.529208345606925, + "learning_rate": 9.076435149519319e-06, + "loss": 0.3933, + "step": 7609 + }, + { + "epoch": 0.22, + "grad_norm": 1.4849710851505176, + "learning_rate": 9.07616314400898e-06, + "loss": 0.367, + "step": 7610 + }, + { + "epoch": 0.22, + "grad_norm": 3.1306434019034213, + "learning_rate": 9.075891102526456e-06, + "loss": 0.3878, + "step": 7611 + }, + { + "epoch": 0.22, + "grad_norm": 1.7134871005668628, + "learning_rate": 9.075619025074149e-06, + "loss": 0.3627, + "step": 7612 + }, + { + "epoch": 0.22, + "grad_norm": 1.7097138608544606, + "learning_rate": 9.075346911654456e-06, + "loss": 0.3654, + "step": 7613 + }, + { + "epoch": 0.22, + "grad_norm": 1.6106124721618487, + "learning_rate": 9.075074762269784e-06, + "loss": 0.3778, + "step": 7614 + }, + { + "epoch": 0.22, + "grad_norm": 1.5959641440461163, + "learning_rate": 9.074802576922531e-06, + "loss": 0.3696, + "step": 7615 + }, + { + "epoch": 0.22, + "grad_norm": 1.4252577972714229, + "learning_rate": 9.0745303556151e-06, + "loss": 0.3876, + "step": 7616 + }, + { + "epoch": 0.22, + "grad_norm": 1.4744110189256636, + "learning_rate": 9.074258098349896e-06, + "loss": 0.3487, + "step": 7617 + }, + { + "epoch": 0.22, + "grad_norm": 1.3844691337176145, + "learning_rate": 9.073985805129315e-06, + "loss": 0.3461, + "step": 7618 + }, + { + "epoch": 0.22, + "grad_norm": 1.4568415411549065, + "learning_rate": 9.073713475955766e-06, + "loss": 0.3979, + "step": 7619 + }, + { + "epoch": 0.22, + "grad_norm": 1.5218796237912142, + "learning_rate": 9.073441110831651e-06, + "loss": 0.353, + "step": 7620 + }, + { + "epoch": 0.22, + "grad_norm": 1.7492667103846733, + "learning_rate": 9.073168709759373e-06, + "loss": 0.362, + "step": 7621 + }, + { + "epoch": 0.22, + "grad_norm": 1.488123559034462, + "learning_rate": 9.072896272741334e-06, + "loss": 0.355, + "step": 7622 + }, + { + "epoch": 0.22, + "grad_norm": 1.4353369968317933, + "learning_rate": 9.072623799779942e-06, + "loss": 0.3685, + "step": 7623 + }, + { + "epoch": 0.22, + "grad_norm": 1.5545953687067118, + "learning_rate": 9.0723512908776e-06, + "loss": 0.3663, + "step": 7624 + }, + { + "epoch": 0.22, + "grad_norm": 1.9632114307294273, + "learning_rate": 9.072078746036712e-06, + "loss": 0.366, + "step": 7625 + }, + { + "epoch": 0.22, + "grad_norm": 2.4856557989478754, + "learning_rate": 9.071806165259682e-06, + "loss": 0.3608, + "step": 7626 + }, + { + "epoch": 0.22, + "grad_norm": 1.8397334519789843, + "learning_rate": 9.07153354854892e-06, + "loss": 0.3498, + "step": 7627 + }, + { + "epoch": 0.22, + "grad_norm": 1.4307660987246684, + "learning_rate": 9.071260895906828e-06, + "loss": 0.3572, + "step": 7628 + }, + { + "epoch": 0.22, + "grad_norm": 1.4489390764347512, + "learning_rate": 9.070988207335814e-06, + "loss": 0.3825, + "step": 7629 + }, + { + "epoch": 0.22, + "grad_norm": 1.630184859511296, + "learning_rate": 9.070715482838284e-06, + "loss": 0.3533, + "step": 7630 + }, + { + "epoch": 0.22, + "grad_norm": 1.3867887034822053, + "learning_rate": 9.070442722416646e-06, + "loss": 0.3577, + "step": 7631 + }, + { + "epoch": 0.22, + "grad_norm": 2.1285089410999043, + "learning_rate": 9.070169926073302e-06, + "loss": 0.3629, + "step": 7632 + }, + { + "epoch": 0.22, + "grad_norm": 1.400699368462933, + "learning_rate": 9.069897093810667e-06, + "loss": 0.3671, + "step": 7633 + }, + { + "epoch": 0.22, + "grad_norm": 1.2958810916529595, + "learning_rate": 9.069624225631143e-06, + "loss": 0.3667, + "step": 7634 + }, + { + "epoch": 0.22, + "grad_norm": 1.630747812041218, + "learning_rate": 9.069351321537142e-06, + "loss": 0.3645, + "step": 7635 + }, + { + "epoch": 0.22, + "grad_norm": 1.2523347183638476, + "learning_rate": 9.069078381531067e-06, + "loss": 0.35, + "step": 7636 + }, + { + "epoch": 0.22, + "grad_norm": 1.3376542789134245, + "learning_rate": 9.068805405615333e-06, + "loss": 0.3791, + "step": 7637 + }, + { + "epoch": 0.22, + "grad_norm": 1.5035327874315647, + "learning_rate": 9.068532393792347e-06, + "loss": 0.3794, + "step": 7638 + }, + { + "epoch": 0.22, + "grad_norm": 1.0806938596472102, + "learning_rate": 9.068259346064515e-06, + "loss": 0.6261, + "step": 7639 + }, + { + "epoch": 0.22, + "grad_norm": 1.8277806326327601, + "learning_rate": 9.06798626243425e-06, + "loss": 0.3629, + "step": 7640 + }, + { + "epoch": 0.22, + "grad_norm": 1.524065105920255, + "learning_rate": 9.06771314290396e-06, + "loss": 0.3606, + "step": 7641 + }, + { + "epoch": 0.22, + "grad_norm": 1.7114242550336534, + "learning_rate": 9.067439987476057e-06, + "loss": 0.4237, + "step": 7642 + }, + { + "epoch": 0.22, + "grad_norm": 2.0414810299203134, + "learning_rate": 9.067166796152953e-06, + "loss": 0.3671, + "step": 7643 + }, + { + "epoch": 0.22, + "grad_norm": 1.4351723368582656, + "learning_rate": 9.066893568937056e-06, + "loss": 0.3591, + "step": 7644 + }, + { + "epoch": 0.22, + "grad_norm": 1.3456908567354053, + "learning_rate": 9.066620305830777e-06, + "loss": 0.3685, + "step": 7645 + }, + { + "epoch": 0.22, + "grad_norm": 1.4748190238823873, + "learning_rate": 9.066347006836528e-06, + "loss": 0.3805, + "step": 7646 + }, + { + "epoch": 0.22, + "grad_norm": 1.62276520783129, + "learning_rate": 9.066073671956723e-06, + "loss": 0.4029, + "step": 7647 + }, + { + "epoch": 0.22, + "grad_norm": 1.6767474543838774, + "learning_rate": 9.065800301193773e-06, + "loss": 0.3654, + "step": 7648 + }, + { + "epoch": 0.22, + "grad_norm": 2.638501284590056, + "learning_rate": 9.065526894550089e-06, + "loss": 0.3831, + "step": 7649 + }, + { + "epoch": 0.22, + "grad_norm": 1.4582863234152792, + "learning_rate": 9.065253452028086e-06, + "loss": 0.3459, + "step": 7650 + }, + { + "epoch": 0.22, + "grad_norm": 1.6615957984631515, + "learning_rate": 9.064979973630176e-06, + "loss": 0.3794, + "step": 7651 + }, + { + "epoch": 0.22, + "grad_norm": 1.762441146212552, + "learning_rate": 9.064706459358774e-06, + "loss": 0.3583, + "step": 7652 + }, + { + "epoch": 0.22, + "grad_norm": 1.6116458660900719, + "learning_rate": 9.06443290921629e-06, + "loss": 0.3774, + "step": 7653 + }, + { + "epoch": 0.22, + "grad_norm": 1.7667870706720807, + "learning_rate": 9.064159323205142e-06, + "loss": 0.3799, + "step": 7654 + }, + { + "epoch": 0.22, + "grad_norm": 3.6542905533822885, + "learning_rate": 9.063885701327743e-06, + "loss": 0.3572, + "step": 7655 + }, + { + "epoch": 0.22, + "grad_norm": 2.672194255593208, + "learning_rate": 9.063612043586506e-06, + "loss": 0.3552, + "step": 7656 + }, + { + "epoch": 0.22, + "grad_norm": 1.4541922584647518, + "learning_rate": 9.06333834998385e-06, + "loss": 0.3792, + "step": 7657 + }, + { + "epoch": 0.22, + "grad_norm": 1.8874652323210672, + "learning_rate": 9.063064620522186e-06, + "loss": 0.3676, + "step": 7658 + }, + { + "epoch": 0.22, + "grad_norm": 2.7036084065584505, + "learning_rate": 9.062790855203932e-06, + "loss": 0.3283, + "step": 7659 + }, + { + "epoch": 0.22, + "grad_norm": 1.3791378177774607, + "learning_rate": 9.062517054031505e-06, + "loss": 0.3592, + "step": 7660 + }, + { + "epoch": 0.22, + "grad_norm": 1.4945550034533155, + "learning_rate": 9.062243217007319e-06, + "loss": 0.3589, + "step": 7661 + }, + { + "epoch": 0.22, + "grad_norm": 1.3989043297046813, + "learning_rate": 9.06196934413379e-06, + "loss": 0.3506, + "step": 7662 + }, + { + "epoch": 0.22, + "grad_norm": 1.6113834968238445, + "learning_rate": 9.061695435413337e-06, + "loss": 0.3742, + "step": 7663 + }, + { + "epoch": 0.22, + "grad_norm": 1.3356592462858297, + "learning_rate": 9.061421490848378e-06, + "loss": 0.3734, + "step": 7664 + }, + { + "epoch": 0.22, + "grad_norm": 1.3890191365615534, + "learning_rate": 9.06114751044133e-06, + "loss": 0.3682, + "step": 7665 + }, + { + "epoch": 0.22, + "grad_norm": 1.728847291198133, + "learning_rate": 9.060873494194608e-06, + "loss": 0.3953, + "step": 7666 + }, + { + "epoch": 0.22, + "grad_norm": 1.55762901426399, + "learning_rate": 9.060599442110634e-06, + "loss": 0.3837, + "step": 7667 + }, + { + "epoch": 0.22, + "grad_norm": 1.471402716696997, + "learning_rate": 9.060325354191826e-06, + "loss": 0.3557, + "step": 7668 + }, + { + "epoch": 0.22, + "grad_norm": 1.4954212402226095, + "learning_rate": 9.060051230440598e-06, + "loss": 0.4022, + "step": 7669 + }, + { + "epoch": 0.22, + "grad_norm": 1.3698561384464, + "learning_rate": 9.059777070859376e-06, + "loss": 0.3805, + "step": 7670 + }, + { + "epoch": 0.22, + "grad_norm": 1.484159330179106, + "learning_rate": 9.059502875450575e-06, + "loss": 0.3684, + "step": 7671 + }, + { + "epoch": 0.22, + "grad_norm": 1.6539944894465, + "learning_rate": 9.059228644216615e-06, + "loss": 0.3763, + "step": 7672 + }, + { + "epoch": 0.22, + "grad_norm": 1.7319977287383783, + "learning_rate": 9.058954377159919e-06, + "loss": 0.3573, + "step": 7673 + }, + { + "epoch": 0.22, + "grad_norm": 1.9293764835065845, + "learning_rate": 9.058680074282906e-06, + "loss": 0.4284, + "step": 7674 + }, + { + "epoch": 0.22, + "grad_norm": 1.6040970422772938, + "learning_rate": 9.058405735587996e-06, + "loss": 0.4234, + "step": 7675 + }, + { + "epoch": 0.22, + "grad_norm": 1.4663831288500238, + "learning_rate": 9.05813136107761e-06, + "loss": 0.3574, + "step": 7676 + }, + { + "epoch": 0.22, + "grad_norm": 7.321792337731323, + "learning_rate": 9.05785695075417e-06, + "loss": 0.352, + "step": 7677 + }, + { + "epoch": 0.22, + "grad_norm": 1.023982046743319, + "learning_rate": 9.057582504620095e-06, + "loss": 0.5498, + "step": 7678 + }, + { + "epoch": 0.22, + "grad_norm": 1.5295695800228446, + "learning_rate": 9.05730802267781e-06, + "loss": 0.3325, + "step": 7679 + }, + { + "epoch": 0.22, + "grad_norm": 1.6448543869188073, + "learning_rate": 9.05703350492974e-06, + "loss": 0.388, + "step": 7680 + }, + { + "epoch": 0.22, + "grad_norm": 2.009413225518423, + "learning_rate": 9.056758951378302e-06, + "loss": 0.3831, + "step": 7681 + }, + { + "epoch": 0.22, + "grad_norm": 1.4146621706538185, + "learning_rate": 9.056484362025922e-06, + "loss": 0.3614, + "step": 7682 + }, + { + "epoch": 0.22, + "grad_norm": 1.6395846647825623, + "learning_rate": 9.056209736875021e-06, + "loss": 0.3699, + "step": 7683 + }, + { + "epoch": 0.22, + "grad_norm": 2.0323197180536985, + "learning_rate": 9.055935075928025e-06, + "loss": 0.3621, + "step": 7684 + }, + { + "epoch": 0.22, + "grad_norm": 1.5749002261100025, + "learning_rate": 9.055660379187358e-06, + "loss": 0.3497, + "step": 7685 + }, + { + "epoch": 0.22, + "grad_norm": 1.3639744831386473, + "learning_rate": 9.055385646655442e-06, + "loss": 0.3404, + "step": 7686 + }, + { + "epoch": 0.22, + "grad_norm": 2.4149697432406274, + "learning_rate": 9.055110878334702e-06, + "loss": 0.3733, + "step": 7687 + }, + { + "epoch": 0.22, + "grad_norm": 1.4735960953861102, + "learning_rate": 9.054836074227564e-06, + "loss": 0.3897, + "step": 7688 + }, + { + "epoch": 0.22, + "grad_norm": 1.6163891811675815, + "learning_rate": 9.054561234336454e-06, + "loss": 0.361, + "step": 7689 + }, + { + "epoch": 0.22, + "grad_norm": 1.7071583355615512, + "learning_rate": 9.054286358663795e-06, + "loss": 0.4034, + "step": 7690 + }, + { + "epoch": 0.22, + "grad_norm": 1.8873369408592506, + "learning_rate": 9.054011447212014e-06, + "loss": 0.3771, + "step": 7691 + }, + { + "epoch": 0.22, + "grad_norm": 1.395449631413268, + "learning_rate": 9.053736499983538e-06, + "loss": 0.3699, + "step": 7692 + }, + { + "epoch": 0.22, + "grad_norm": 1.6787995526877486, + "learning_rate": 9.05346151698079e-06, + "loss": 0.3752, + "step": 7693 + }, + { + "epoch": 0.22, + "grad_norm": 1.6304660993391293, + "learning_rate": 9.0531864982062e-06, + "loss": 0.3609, + "step": 7694 + }, + { + "epoch": 0.22, + "grad_norm": 1.4859758396452365, + "learning_rate": 9.052911443662196e-06, + "loss": 0.3741, + "step": 7695 + }, + { + "epoch": 0.22, + "grad_norm": 1.5038650368997517, + "learning_rate": 9.052636353351202e-06, + "loss": 0.3534, + "step": 7696 + }, + { + "epoch": 0.22, + "grad_norm": 1.5222890333961854, + "learning_rate": 9.052361227275648e-06, + "loss": 0.3528, + "step": 7697 + }, + { + "epoch": 0.22, + "grad_norm": 1.559418254095233, + "learning_rate": 9.052086065437962e-06, + "loss": 0.3416, + "step": 7698 + }, + { + "epoch": 0.22, + "grad_norm": 1.4603892084614245, + "learning_rate": 9.051810867840568e-06, + "loss": 0.3584, + "step": 7699 + }, + { + "epoch": 0.22, + "grad_norm": 1.8115012357984417, + "learning_rate": 9.051535634485901e-06, + "loss": 0.3678, + "step": 7700 + }, + { + "epoch": 0.22, + "grad_norm": 1.5175269317806317, + "learning_rate": 9.051260365376388e-06, + "loss": 0.3604, + "step": 7701 + }, + { + "epoch": 0.22, + "grad_norm": 1.6725204857773754, + "learning_rate": 9.050985060514455e-06, + "loss": 0.3837, + "step": 7702 + }, + { + "epoch": 0.22, + "grad_norm": 1.4645749302264566, + "learning_rate": 9.050709719902535e-06, + "loss": 0.3743, + "step": 7703 + }, + { + "epoch": 0.22, + "grad_norm": 1.3790541934711138, + "learning_rate": 9.050434343543057e-06, + "loss": 0.3764, + "step": 7704 + }, + { + "epoch": 0.22, + "grad_norm": 1.4381220440150275, + "learning_rate": 9.050158931438451e-06, + "loss": 0.3503, + "step": 7705 + }, + { + "epoch": 0.22, + "grad_norm": 1.4767344289583801, + "learning_rate": 9.049883483591147e-06, + "loss": 0.369, + "step": 7706 + }, + { + "epoch": 0.22, + "grad_norm": 1.325090441203457, + "learning_rate": 9.049608000003575e-06, + "loss": 0.3795, + "step": 7707 + }, + { + "epoch": 0.22, + "grad_norm": 1.6750657146881216, + "learning_rate": 9.049332480678171e-06, + "loss": 0.3881, + "step": 7708 + }, + { + "epoch": 0.22, + "grad_norm": 1.3709244172267296, + "learning_rate": 9.04905692561736e-06, + "loss": 0.3933, + "step": 7709 + }, + { + "epoch": 0.22, + "grad_norm": 1.6103405644664797, + "learning_rate": 9.048781334823578e-06, + "loss": 0.3715, + "step": 7710 + }, + { + "epoch": 0.22, + "grad_norm": 1.429735505740015, + "learning_rate": 9.048505708299256e-06, + "loss": 0.349, + "step": 7711 + }, + { + "epoch": 0.22, + "grad_norm": 1.4516685282714366, + "learning_rate": 9.048230046046825e-06, + "loss": 0.3683, + "step": 7712 + }, + { + "epoch": 0.22, + "grad_norm": 1.2948481473281996, + "learning_rate": 9.04795434806872e-06, + "loss": 0.3843, + "step": 7713 + }, + { + "epoch": 0.22, + "grad_norm": 1.3799244068088432, + "learning_rate": 9.047678614367372e-06, + "loss": 0.3776, + "step": 7714 + }, + { + "epoch": 0.22, + "grad_norm": 1.426333376714693, + "learning_rate": 9.047402844945214e-06, + "loss": 0.3829, + "step": 7715 + }, + { + "epoch": 0.22, + "grad_norm": 1.3767027105850629, + "learning_rate": 9.047127039804684e-06, + "loss": 0.3685, + "step": 7716 + }, + { + "epoch": 0.22, + "grad_norm": 1.3489210307725787, + "learning_rate": 9.046851198948213e-06, + "loss": 0.3807, + "step": 7717 + }, + { + "epoch": 0.22, + "grad_norm": 2.167226150577728, + "learning_rate": 9.046575322378233e-06, + "loss": 0.3695, + "step": 7718 + }, + { + "epoch": 0.22, + "grad_norm": 1.402296688176798, + "learning_rate": 9.046299410097182e-06, + "loss": 0.3728, + "step": 7719 + }, + { + "epoch": 0.22, + "grad_norm": 1.9035453727276184, + "learning_rate": 9.046023462107494e-06, + "loss": 0.3698, + "step": 7720 + }, + { + "epoch": 0.22, + "grad_norm": 1.0281840649625216, + "learning_rate": 9.045747478411604e-06, + "loss": 0.6531, + "step": 7721 + }, + { + "epoch": 0.22, + "grad_norm": 1.7301662846563304, + "learning_rate": 9.045471459011946e-06, + "loss": 0.4276, + "step": 7722 + }, + { + "epoch": 0.22, + "grad_norm": 1.4041802766440117, + "learning_rate": 9.04519540391096e-06, + "loss": 0.3685, + "step": 7723 + }, + { + "epoch": 0.22, + "grad_norm": 1.5476654173337678, + "learning_rate": 9.044919313111078e-06, + "loss": 0.4262, + "step": 7724 + }, + { + "epoch": 0.22, + "grad_norm": 1.354514901161735, + "learning_rate": 9.044643186614739e-06, + "loss": 0.3801, + "step": 7725 + }, + { + "epoch": 0.22, + "grad_norm": 1.6882419074250425, + "learning_rate": 9.044367024424379e-06, + "loss": 0.366, + "step": 7726 + }, + { + "epoch": 0.22, + "grad_norm": 1.5048121653481155, + "learning_rate": 9.044090826542436e-06, + "loss": 0.4008, + "step": 7727 + }, + { + "epoch": 0.22, + "grad_norm": 1.4748993340822052, + "learning_rate": 9.043814592971345e-06, + "loss": 0.3462, + "step": 7728 + }, + { + "epoch": 0.22, + "grad_norm": 1.5648271861678225, + "learning_rate": 9.043538323713546e-06, + "loss": 0.3796, + "step": 7729 + }, + { + "epoch": 0.22, + "grad_norm": 1.6714179673460339, + "learning_rate": 9.043262018771476e-06, + "loss": 0.3569, + "step": 7730 + }, + { + "epoch": 0.22, + "grad_norm": 1.6402015390699316, + "learning_rate": 9.042985678147573e-06, + "loss": 0.3465, + "step": 7731 + }, + { + "epoch": 0.22, + "grad_norm": 1.5673198060506193, + "learning_rate": 9.042709301844279e-06, + "loss": 0.3877, + "step": 7732 + }, + { + "epoch": 0.22, + "grad_norm": 1.4432321123072833, + "learning_rate": 9.042432889864028e-06, + "loss": 0.3867, + "step": 7733 + }, + { + "epoch": 0.22, + "grad_norm": 1.9895811345745842, + "learning_rate": 9.042156442209265e-06, + "loss": 0.3664, + "step": 7734 + }, + { + "epoch": 0.22, + "grad_norm": 1.4318584780832317, + "learning_rate": 9.041879958882425e-06, + "loss": 0.3562, + "step": 7735 + }, + { + "epoch": 0.22, + "grad_norm": 1.635875622069634, + "learning_rate": 9.04160343988595e-06, + "loss": 0.3612, + "step": 7736 + }, + { + "epoch": 0.22, + "grad_norm": 1.529402292389665, + "learning_rate": 9.041326885222277e-06, + "loss": 0.3731, + "step": 7737 + }, + { + "epoch": 0.22, + "grad_norm": 1.5649502227004135, + "learning_rate": 9.041050294893852e-06, + "loss": 0.385, + "step": 7738 + }, + { + "epoch": 0.22, + "grad_norm": 1.570505705866913, + "learning_rate": 9.040773668903112e-06, + "loss": 0.3889, + "step": 7739 + }, + { + "epoch": 0.22, + "grad_norm": 1.3730345208460522, + "learning_rate": 9.0404970072525e-06, + "loss": 0.3962, + "step": 7740 + }, + { + "epoch": 0.22, + "grad_norm": 1.446503953364663, + "learning_rate": 9.040220309944458e-06, + "loss": 0.3891, + "step": 7741 + }, + { + "epoch": 0.22, + "grad_norm": 1.410461043272801, + "learning_rate": 9.039943576981428e-06, + "loss": 0.3368, + "step": 7742 + }, + { + "epoch": 0.22, + "grad_norm": 1.3608563674719552, + "learning_rate": 9.039666808365849e-06, + "loss": 0.3536, + "step": 7743 + }, + { + "epoch": 0.22, + "grad_norm": 1.747621164657109, + "learning_rate": 9.039390004100166e-06, + "loss": 0.3576, + "step": 7744 + }, + { + "epoch": 0.22, + "grad_norm": 1.4269048317650264, + "learning_rate": 9.03911316418682e-06, + "loss": 0.3748, + "step": 7745 + }, + { + "epoch": 0.22, + "grad_norm": 2.540731244315133, + "learning_rate": 9.038836288628258e-06, + "loss": 0.3766, + "step": 7746 + }, + { + "epoch": 0.22, + "grad_norm": 1.678742600975677, + "learning_rate": 9.038559377426919e-06, + "loss": 0.3806, + "step": 7747 + }, + { + "epoch": 0.22, + "grad_norm": 1.919046413752609, + "learning_rate": 9.03828243058525e-06, + "loss": 0.3564, + "step": 7748 + }, + { + "epoch": 0.22, + "grad_norm": 1.3729687526144316, + "learning_rate": 9.038005448105694e-06, + "loss": 0.343, + "step": 7749 + }, + { + "epoch": 0.22, + "grad_norm": 1.8409548607999526, + "learning_rate": 9.037728429990694e-06, + "loss": 0.3719, + "step": 7750 + }, + { + "epoch": 0.22, + "grad_norm": 1.332786599443624, + "learning_rate": 9.037451376242696e-06, + "loss": 0.3829, + "step": 7751 + }, + { + "epoch": 0.22, + "grad_norm": 1.5959934765896313, + "learning_rate": 9.037174286864146e-06, + "loss": 0.3739, + "step": 7752 + }, + { + "epoch": 0.22, + "grad_norm": 1.764111906958292, + "learning_rate": 9.036897161857488e-06, + "loss": 0.3934, + "step": 7753 + }, + { + "epoch": 0.22, + "grad_norm": 1.4298027564659082, + "learning_rate": 9.036620001225166e-06, + "loss": 0.3516, + "step": 7754 + }, + { + "epoch": 0.22, + "grad_norm": 1.5592770442670025, + "learning_rate": 9.036342804969628e-06, + "loss": 0.3717, + "step": 7755 + }, + { + "epoch": 0.22, + "grad_norm": 12.467140464270466, + "learning_rate": 9.036065573093323e-06, + "loss": 0.37, + "step": 7756 + }, + { + "epoch": 0.22, + "grad_norm": 1.9821677949639838, + "learning_rate": 9.035788305598692e-06, + "loss": 0.357, + "step": 7757 + }, + { + "epoch": 0.23, + "grad_norm": 1.5819408228351588, + "learning_rate": 9.035511002488185e-06, + "loss": 0.3983, + "step": 7758 + }, + { + "epoch": 0.23, + "grad_norm": 1.6927874682648147, + "learning_rate": 9.035233663764247e-06, + "loss": 0.4153, + "step": 7759 + }, + { + "epoch": 0.23, + "grad_norm": 1.8208681671446485, + "learning_rate": 9.034956289429329e-06, + "loss": 0.3592, + "step": 7760 + }, + { + "epoch": 0.23, + "grad_norm": 1.5443018067391294, + "learning_rate": 9.034678879485876e-06, + "loss": 0.3952, + "step": 7761 + }, + { + "epoch": 0.23, + "grad_norm": 1.3751012274255863, + "learning_rate": 9.034401433936338e-06, + "loss": 0.3724, + "step": 7762 + }, + { + "epoch": 0.23, + "grad_norm": 1.5324202846228103, + "learning_rate": 9.03412395278316e-06, + "loss": 0.3629, + "step": 7763 + }, + { + "epoch": 0.23, + "grad_norm": 1.4206899459462565, + "learning_rate": 9.033846436028798e-06, + "loss": 0.371, + "step": 7764 + }, + { + "epoch": 0.23, + "grad_norm": 1.7201829514747724, + "learning_rate": 9.033568883675694e-06, + "loss": 0.3821, + "step": 7765 + }, + { + "epoch": 0.23, + "grad_norm": 1.9893575804262014, + "learning_rate": 9.0332912957263e-06, + "loss": 0.3725, + "step": 7766 + }, + { + "epoch": 0.23, + "grad_norm": 2.2000696274082667, + "learning_rate": 9.033013672183064e-06, + "loss": 0.3375, + "step": 7767 + }, + { + "epoch": 0.23, + "grad_norm": 1.5087295352749048, + "learning_rate": 9.03273601304844e-06, + "loss": 0.3634, + "step": 7768 + }, + { + "epoch": 0.23, + "grad_norm": 1.5593058336635015, + "learning_rate": 9.032458318324875e-06, + "loss": 0.3553, + "step": 7769 + }, + { + "epoch": 0.23, + "grad_norm": 1.4023383968600387, + "learning_rate": 9.03218058801482e-06, + "loss": 0.3384, + "step": 7770 + }, + { + "epoch": 0.23, + "grad_norm": 1.7793990198200584, + "learning_rate": 9.031902822120729e-06, + "loss": 0.3832, + "step": 7771 + }, + { + "epoch": 0.23, + "grad_norm": 1.3540323274413686, + "learning_rate": 9.031625020645048e-06, + "loss": 0.3896, + "step": 7772 + }, + { + "epoch": 0.23, + "grad_norm": 1.6332486662955414, + "learning_rate": 9.031347183590232e-06, + "loss": 0.3503, + "step": 7773 + }, + { + "epoch": 0.23, + "grad_norm": 2.082923672952715, + "learning_rate": 9.031069310958733e-06, + "loss": 0.378, + "step": 7774 + }, + { + "epoch": 0.23, + "grad_norm": 1.3366692700586456, + "learning_rate": 9.030791402753003e-06, + "loss": 0.3615, + "step": 7775 + }, + { + "epoch": 0.23, + "grad_norm": 1.5531422956819565, + "learning_rate": 9.030513458975494e-06, + "loss": 0.3856, + "step": 7776 + }, + { + "epoch": 0.23, + "grad_norm": 1.32718646093461, + "learning_rate": 9.030235479628657e-06, + "loss": 0.3523, + "step": 7777 + }, + { + "epoch": 0.23, + "grad_norm": 1.5678990363484926, + "learning_rate": 9.029957464714949e-06, + "loss": 0.3555, + "step": 7778 + }, + { + "epoch": 0.23, + "grad_norm": 1.6999954641646653, + "learning_rate": 9.029679414236823e-06, + "loss": 0.3756, + "step": 7779 + }, + { + "epoch": 0.23, + "grad_norm": 1.396599463030509, + "learning_rate": 9.02940132819673e-06, + "loss": 0.3769, + "step": 7780 + }, + { + "epoch": 0.23, + "grad_norm": 2.045602122272989, + "learning_rate": 9.029123206597125e-06, + "loss": 0.3567, + "step": 7781 + }, + { + "epoch": 0.23, + "grad_norm": 1.597802992392362, + "learning_rate": 9.028845049440463e-06, + "loss": 0.4055, + "step": 7782 + }, + { + "epoch": 0.23, + "grad_norm": 1.3938746827726047, + "learning_rate": 9.0285668567292e-06, + "loss": 0.3814, + "step": 7783 + }, + { + "epoch": 0.23, + "grad_norm": 1.6237391009100495, + "learning_rate": 9.028288628465789e-06, + "loss": 0.3618, + "step": 7784 + }, + { + "epoch": 0.23, + "grad_norm": 1.6110402655181857, + "learning_rate": 9.028010364652686e-06, + "loss": 0.3714, + "step": 7785 + }, + { + "epoch": 0.23, + "grad_norm": 1.9510552738320297, + "learning_rate": 9.027732065292348e-06, + "loss": 0.3904, + "step": 7786 + }, + { + "epoch": 0.23, + "grad_norm": 1.4008830974055055, + "learning_rate": 9.027453730387228e-06, + "loss": 0.3531, + "step": 7787 + }, + { + "epoch": 0.23, + "grad_norm": 1.340828295788189, + "learning_rate": 9.027175359939785e-06, + "loss": 0.3589, + "step": 7788 + }, + { + "epoch": 0.23, + "grad_norm": 1.4099877587230238, + "learning_rate": 9.026896953952475e-06, + "loss": 0.3804, + "step": 7789 + }, + { + "epoch": 0.23, + "grad_norm": 1.3128361079564415, + "learning_rate": 9.026618512427755e-06, + "loss": 0.3794, + "step": 7790 + }, + { + "epoch": 0.23, + "grad_norm": 1.2938372764917336, + "learning_rate": 9.026340035368082e-06, + "loss": 0.3588, + "step": 7791 + }, + { + "epoch": 0.23, + "grad_norm": 1.635873307840605, + "learning_rate": 9.026061522775913e-06, + "loss": 0.3688, + "step": 7792 + }, + { + "epoch": 0.23, + "grad_norm": 1.398667644419892, + "learning_rate": 9.025782974653706e-06, + "loss": 0.4137, + "step": 7793 + }, + { + "epoch": 0.23, + "grad_norm": 1.4447740943378058, + "learning_rate": 9.025504391003922e-06, + "loss": 0.3998, + "step": 7794 + }, + { + "epoch": 0.23, + "grad_norm": 1.6903809801609, + "learning_rate": 9.025225771829013e-06, + "loss": 0.359, + "step": 7795 + }, + { + "epoch": 0.23, + "grad_norm": 1.476403523844771, + "learning_rate": 9.024947117131444e-06, + "loss": 0.3683, + "step": 7796 + }, + { + "epoch": 0.23, + "grad_norm": 0.9995799847440142, + "learning_rate": 9.024668426913671e-06, + "loss": 0.5984, + "step": 7797 + }, + { + "epoch": 0.23, + "grad_norm": 1.297291196230407, + "learning_rate": 9.024389701178157e-06, + "loss": 0.3556, + "step": 7798 + }, + { + "epoch": 0.23, + "grad_norm": 1.5423381126236606, + "learning_rate": 9.024110939927357e-06, + "loss": 0.3381, + "step": 7799 + }, + { + "epoch": 0.23, + "grad_norm": 1.5260485725405117, + "learning_rate": 9.023832143163736e-06, + "loss": 0.3627, + "step": 7800 + }, + { + "epoch": 0.23, + "grad_norm": 1.583960543566616, + "learning_rate": 9.02355331088975e-06, + "loss": 0.3937, + "step": 7801 + }, + { + "epoch": 0.23, + "grad_norm": 1.3749501729980496, + "learning_rate": 9.02327444310786e-06, + "loss": 0.3563, + "step": 7802 + }, + { + "epoch": 0.23, + "grad_norm": 1.4142048219780168, + "learning_rate": 9.022995539820529e-06, + "loss": 0.3483, + "step": 7803 + }, + { + "epoch": 0.23, + "grad_norm": 2.265775095344292, + "learning_rate": 9.02271660103022e-06, + "loss": 0.411, + "step": 7804 + }, + { + "epoch": 0.23, + "grad_norm": 1.3844282126792378, + "learning_rate": 9.022437626739391e-06, + "loss": 0.337, + "step": 7805 + }, + { + "epoch": 0.23, + "grad_norm": 1.443932356127573, + "learning_rate": 9.022158616950504e-06, + "loss": 0.3604, + "step": 7806 + }, + { + "epoch": 0.23, + "grad_norm": 1.3455674088041216, + "learning_rate": 9.021879571666024e-06, + "loss": 0.3435, + "step": 7807 + }, + { + "epoch": 0.23, + "grad_norm": 1.5755885499680542, + "learning_rate": 9.021600490888412e-06, + "loss": 0.4084, + "step": 7808 + }, + { + "epoch": 0.23, + "grad_norm": 1.7491625139732896, + "learning_rate": 9.021321374620132e-06, + "loss": 0.3625, + "step": 7809 + }, + { + "epoch": 0.23, + "grad_norm": 1.6395876144688788, + "learning_rate": 9.021042222863644e-06, + "loss": 0.3664, + "step": 7810 + }, + { + "epoch": 0.23, + "grad_norm": 1.304814289180791, + "learning_rate": 9.020763035621416e-06, + "loss": 0.3529, + "step": 7811 + }, + { + "epoch": 0.23, + "grad_norm": 1.3702209795709177, + "learning_rate": 9.020483812895908e-06, + "loss": 0.3729, + "step": 7812 + }, + { + "epoch": 0.23, + "grad_norm": 2.2158683715554632, + "learning_rate": 9.020204554689586e-06, + "loss": 0.3929, + "step": 7813 + }, + { + "epoch": 0.23, + "grad_norm": 1.394811071305969, + "learning_rate": 9.019925261004914e-06, + "loss": 0.3516, + "step": 7814 + }, + { + "epoch": 0.23, + "grad_norm": 1.579713770761186, + "learning_rate": 9.019645931844358e-06, + "loss": 0.3788, + "step": 7815 + }, + { + "epoch": 0.23, + "grad_norm": 1.6189139915241235, + "learning_rate": 9.019366567210381e-06, + "loss": 0.3415, + "step": 7816 + }, + { + "epoch": 0.23, + "grad_norm": 1.38513564923076, + "learning_rate": 9.01908716710545e-06, + "loss": 0.3677, + "step": 7817 + }, + { + "epoch": 0.23, + "grad_norm": 1.4416807324455863, + "learning_rate": 9.018807731532031e-06, + "loss": 0.3619, + "step": 7818 + }, + { + "epoch": 0.23, + "grad_norm": 1.4821259436802023, + "learning_rate": 9.018528260492588e-06, + "loss": 0.3629, + "step": 7819 + }, + { + "epoch": 0.23, + "grad_norm": 1.2883489850696235, + "learning_rate": 9.018248753989589e-06, + "loss": 0.3541, + "step": 7820 + }, + { + "epoch": 0.23, + "grad_norm": 1.7575444623174479, + "learning_rate": 9.017969212025499e-06, + "loss": 0.3932, + "step": 7821 + }, + { + "epoch": 0.23, + "grad_norm": 1.5334831353706493, + "learning_rate": 9.017689634602788e-06, + "loss": 0.3577, + "step": 7822 + }, + { + "epoch": 0.23, + "grad_norm": 1.5754380334080391, + "learning_rate": 9.01741002172392e-06, + "loss": 0.3638, + "step": 7823 + }, + { + "epoch": 0.23, + "grad_norm": 1.498611494505331, + "learning_rate": 9.017130373391365e-06, + "loss": 0.3774, + "step": 7824 + }, + { + "epoch": 0.23, + "grad_norm": 1.3022201803085398, + "learning_rate": 9.01685068960759e-06, + "loss": 0.3587, + "step": 7825 + }, + { + "epoch": 0.23, + "grad_norm": 1.5033225840693902, + "learning_rate": 9.016570970375063e-06, + "loss": 0.3618, + "step": 7826 + }, + { + "epoch": 0.23, + "grad_norm": 1.710619751834241, + "learning_rate": 9.016291215696251e-06, + "loss": 0.3683, + "step": 7827 + }, + { + "epoch": 0.23, + "grad_norm": 1.3209267099751625, + "learning_rate": 9.016011425573627e-06, + "loss": 0.3657, + "step": 7828 + }, + { + "epoch": 0.23, + "grad_norm": 1.321052999622903, + "learning_rate": 9.015731600009655e-06, + "loss": 0.3687, + "step": 7829 + }, + { + "epoch": 0.23, + "grad_norm": 1.3424206472255131, + "learning_rate": 9.015451739006808e-06, + "loss": 0.3475, + "step": 7830 + }, + { + "epoch": 0.23, + "grad_norm": 0.9976306077260177, + "learning_rate": 9.015171842567556e-06, + "loss": 0.6151, + "step": 7831 + }, + { + "epoch": 0.23, + "grad_norm": 1.4189211352165751, + "learning_rate": 9.014891910694367e-06, + "loss": 0.362, + "step": 7832 + }, + { + "epoch": 0.23, + "grad_norm": 1.690887235614118, + "learning_rate": 9.014611943389713e-06, + "loss": 0.3536, + "step": 7833 + }, + { + "epoch": 0.23, + "grad_norm": 1.3367414999834097, + "learning_rate": 9.014331940656062e-06, + "loss": 0.3844, + "step": 7834 + }, + { + "epoch": 0.23, + "grad_norm": 0.9343618056946876, + "learning_rate": 9.01405190249589e-06, + "loss": 0.5991, + "step": 7835 + }, + { + "epoch": 0.23, + "grad_norm": 1.7011965456776583, + "learning_rate": 9.013771828911664e-06, + "loss": 0.3855, + "step": 7836 + }, + { + "epoch": 0.23, + "grad_norm": 1.2574923239267988, + "learning_rate": 9.013491719905856e-06, + "loss": 0.3462, + "step": 7837 + }, + { + "epoch": 0.23, + "grad_norm": 1.3815744675928505, + "learning_rate": 9.01321157548094e-06, + "loss": 0.3742, + "step": 7838 + }, + { + "epoch": 0.23, + "grad_norm": 1.4680064062539417, + "learning_rate": 9.012931395639389e-06, + "loss": 0.3594, + "step": 7839 + }, + { + "epoch": 0.23, + "grad_norm": 1.429193175674858, + "learning_rate": 9.012651180383673e-06, + "loss": 0.4112, + "step": 7840 + }, + { + "epoch": 0.23, + "grad_norm": 1.4112931418818875, + "learning_rate": 9.012370929716265e-06, + "loss": 0.3727, + "step": 7841 + }, + { + "epoch": 0.23, + "grad_norm": 1.3913026091201017, + "learning_rate": 9.01209064363964e-06, + "loss": 0.3788, + "step": 7842 + }, + { + "epoch": 0.23, + "grad_norm": 1.3691365521368133, + "learning_rate": 9.011810322156269e-06, + "loss": 0.3671, + "step": 7843 + }, + { + "epoch": 0.23, + "grad_norm": 1.3505945161405413, + "learning_rate": 9.011529965268629e-06, + "loss": 0.3465, + "step": 7844 + }, + { + "epoch": 0.23, + "grad_norm": 1.3303336667005359, + "learning_rate": 9.01124957297919e-06, + "loss": 0.3543, + "step": 7845 + }, + { + "epoch": 0.23, + "grad_norm": 1.5377080515996366, + "learning_rate": 9.01096914529043e-06, + "loss": 0.3537, + "step": 7846 + }, + { + "epoch": 0.23, + "grad_norm": 1.4165440147908468, + "learning_rate": 9.010688682204825e-06, + "loss": 0.3748, + "step": 7847 + }, + { + "epoch": 0.23, + "grad_norm": 1.4311591462214692, + "learning_rate": 9.010408183724847e-06, + "loss": 0.3714, + "step": 7848 + }, + { + "epoch": 0.23, + "grad_norm": 2.0419896754482556, + "learning_rate": 9.010127649852972e-06, + "loss": 0.3752, + "step": 7849 + }, + { + "epoch": 0.23, + "grad_norm": 1.3666418979690969, + "learning_rate": 9.009847080591676e-06, + "loss": 0.3661, + "step": 7850 + }, + { + "epoch": 0.23, + "grad_norm": 1.5340438278488087, + "learning_rate": 9.009566475943433e-06, + "loss": 0.3753, + "step": 7851 + }, + { + "epoch": 0.23, + "grad_norm": 1.4226587368714203, + "learning_rate": 9.009285835910725e-06, + "loss": 0.3612, + "step": 7852 + }, + { + "epoch": 0.23, + "grad_norm": 1.499035033786587, + "learning_rate": 9.009005160496023e-06, + "loss": 0.3579, + "step": 7853 + }, + { + "epoch": 0.23, + "grad_norm": 1.3339788553644802, + "learning_rate": 9.008724449701807e-06, + "loss": 0.3709, + "step": 7854 + }, + { + "epoch": 0.23, + "grad_norm": 1.4563994338801038, + "learning_rate": 9.008443703530552e-06, + "loss": 0.3867, + "step": 7855 + }, + { + "epoch": 0.23, + "grad_norm": 1.6681388321861585, + "learning_rate": 9.008162921984739e-06, + "loss": 0.3701, + "step": 7856 + }, + { + "epoch": 0.23, + "grad_norm": 1.356483837727074, + "learning_rate": 9.00788210506684e-06, + "loss": 0.3947, + "step": 7857 + }, + { + "epoch": 0.23, + "grad_norm": 2.931466555095126, + "learning_rate": 9.007601252779339e-06, + "loss": 0.3841, + "step": 7858 + }, + { + "epoch": 0.23, + "grad_norm": 1.3973287183446828, + "learning_rate": 9.007320365124711e-06, + "loss": 0.3812, + "step": 7859 + }, + { + "epoch": 0.23, + "grad_norm": 1.4655306109590016, + "learning_rate": 9.00703944210544e-06, + "loss": 0.3663, + "step": 7860 + }, + { + "epoch": 0.23, + "grad_norm": 1.7858921011090703, + "learning_rate": 9.006758483723999e-06, + "loss": 0.3747, + "step": 7861 + }, + { + "epoch": 0.23, + "grad_norm": 1.358848610140252, + "learning_rate": 9.006477489982869e-06, + "loss": 0.3743, + "step": 7862 + }, + { + "epoch": 0.23, + "grad_norm": 1.5155498251051573, + "learning_rate": 9.006196460884533e-06, + "loss": 0.3619, + "step": 7863 + }, + { + "epoch": 0.23, + "grad_norm": 1.3559691027429535, + "learning_rate": 9.005915396431468e-06, + "loss": 0.3638, + "step": 7864 + }, + { + "epoch": 0.23, + "grad_norm": 1.471946308109221, + "learning_rate": 9.005634296626156e-06, + "loss": 0.3653, + "step": 7865 + }, + { + "epoch": 0.23, + "grad_norm": 1.5977668473640774, + "learning_rate": 9.005353161471075e-06, + "loss": 0.3656, + "step": 7866 + }, + { + "epoch": 0.23, + "grad_norm": 1.5360170768257237, + "learning_rate": 9.005071990968709e-06, + "loss": 0.3395, + "step": 7867 + }, + { + "epoch": 0.23, + "grad_norm": 1.3066543838619424, + "learning_rate": 9.004790785121537e-06, + "loss": 0.3617, + "step": 7868 + }, + { + "epoch": 0.23, + "grad_norm": 1.2772463237288407, + "learning_rate": 9.004509543932044e-06, + "loss": 0.3623, + "step": 7869 + }, + { + "epoch": 0.23, + "grad_norm": 1.6645077341947194, + "learning_rate": 9.004228267402709e-06, + "loss": 0.383, + "step": 7870 + }, + { + "epoch": 0.23, + "grad_norm": 1.4445145726411328, + "learning_rate": 9.003946955536016e-06, + "loss": 0.355, + "step": 7871 + }, + { + "epoch": 0.23, + "grad_norm": 1.4673780555607474, + "learning_rate": 9.003665608334444e-06, + "loss": 0.3384, + "step": 7872 + }, + { + "epoch": 0.23, + "grad_norm": 1.480613503406553, + "learning_rate": 9.00338422580048e-06, + "loss": 0.3786, + "step": 7873 + }, + { + "epoch": 0.23, + "grad_norm": 1.296994020020555, + "learning_rate": 9.003102807936606e-06, + "loss": 0.3697, + "step": 7874 + }, + { + "epoch": 0.23, + "grad_norm": 1.4319319259028918, + "learning_rate": 9.002821354745307e-06, + "loss": 0.3737, + "step": 7875 + }, + { + "epoch": 0.23, + "grad_norm": 3.7444949596730397, + "learning_rate": 9.002539866229062e-06, + "loss": 0.3996, + "step": 7876 + }, + { + "epoch": 0.23, + "grad_norm": 1.48219284114514, + "learning_rate": 9.00225834239036e-06, + "loss": 0.3732, + "step": 7877 + }, + { + "epoch": 0.23, + "grad_norm": 1.3910348839089597, + "learning_rate": 9.001976783231684e-06, + "loss": 0.3747, + "step": 7878 + }, + { + "epoch": 0.23, + "grad_norm": 2.0173361310027014, + "learning_rate": 9.001695188755517e-06, + "loss": 0.3721, + "step": 7879 + }, + { + "epoch": 0.23, + "grad_norm": 1.3927754994570618, + "learning_rate": 9.001413558964347e-06, + "loss": 0.3823, + "step": 7880 + }, + { + "epoch": 0.23, + "grad_norm": 1.6417148256109462, + "learning_rate": 9.001131893860658e-06, + "loss": 0.3892, + "step": 7881 + }, + { + "epoch": 0.23, + "grad_norm": 1.2660576714178202, + "learning_rate": 9.000850193446933e-06, + "loss": 0.3583, + "step": 7882 + }, + { + "epoch": 0.23, + "grad_norm": 1.4223725489439585, + "learning_rate": 9.000568457725665e-06, + "loss": 0.3527, + "step": 7883 + }, + { + "epoch": 0.23, + "grad_norm": 1.5097986268933996, + "learning_rate": 9.000286686699334e-06, + "loss": 0.3643, + "step": 7884 + }, + { + "epoch": 0.23, + "grad_norm": 1.3670724544148916, + "learning_rate": 9.000004880370427e-06, + "loss": 0.3475, + "step": 7885 + }, + { + "epoch": 0.23, + "grad_norm": 3.4985394878154383, + "learning_rate": 8.999723038741433e-06, + "loss": 0.3564, + "step": 7886 + }, + { + "epoch": 0.23, + "grad_norm": 1.662473281896381, + "learning_rate": 8.999441161814841e-06, + "loss": 0.3785, + "step": 7887 + }, + { + "epoch": 0.23, + "grad_norm": 1.5603907206964367, + "learning_rate": 8.999159249593135e-06, + "loss": 0.3708, + "step": 7888 + }, + { + "epoch": 0.23, + "grad_norm": 1.3937832051916736, + "learning_rate": 8.998877302078803e-06, + "loss": 0.377, + "step": 7889 + }, + { + "epoch": 0.23, + "grad_norm": 0.9813713771842749, + "learning_rate": 8.998595319274336e-06, + "loss": 0.5635, + "step": 7890 + }, + { + "epoch": 0.23, + "grad_norm": 1.4510899657686984, + "learning_rate": 8.998313301182223e-06, + "loss": 0.3833, + "step": 7891 + }, + { + "epoch": 0.23, + "grad_norm": 1.4560298814277277, + "learning_rate": 8.998031247804948e-06, + "loss": 0.3642, + "step": 7892 + }, + { + "epoch": 0.23, + "grad_norm": 1.9769651605617327, + "learning_rate": 8.997749159145001e-06, + "loss": 0.383, + "step": 7893 + }, + { + "epoch": 0.23, + "grad_norm": 1.4778811862427934, + "learning_rate": 8.997467035204878e-06, + "loss": 0.4053, + "step": 7894 + }, + { + "epoch": 0.23, + "grad_norm": 1.5679785352030602, + "learning_rate": 8.997184875987061e-06, + "loss": 0.3785, + "step": 7895 + }, + { + "epoch": 0.23, + "grad_norm": 1.3648765662538496, + "learning_rate": 8.996902681494045e-06, + "loss": 0.3487, + "step": 7896 + }, + { + "epoch": 0.23, + "grad_norm": 1.7465866049454448, + "learning_rate": 8.996620451728318e-06, + "loss": 0.3679, + "step": 7897 + }, + { + "epoch": 0.23, + "grad_norm": 1.4259043024561608, + "learning_rate": 8.996338186692368e-06, + "loss": 0.3437, + "step": 7898 + }, + { + "epoch": 0.23, + "grad_norm": 1.3314732898297477, + "learning_rate": 8.996055886388692e-06, + "loss": 0.3654, + "step": 7899 + }, + { + "epoch": 0.23, + "grad_norm": 1.3929792947971136, + "learning_rate": 8.99577355081978e-06, + "loss": 0.3653, + "step": 7900 + }, + { + "epoch": 0.23, + "grad_norm": 1.4735255857726635, + "learning_rate": 8.995491179988119e-06, + "loss": 0.3833, + "step": 7901 + }, + { + "epoch": 0.23, + "grad_norm": 1.2920568031795336, + "learning_rate": 8.995208773896206e-06, + "loss": 0.3566, + "step": 7902 + }, + { + "epoch": 0.23, + "grad_norm": 1.4042101961935758, + "learning_rate": 8.994926332546531e-06, + "loss": 0.3482, + "step": 7903 + }, + { + "epoch": 0.23, + "grad_norm": 1.4543157711324335, + "learning_rate": 8.994643855941585e-06, + "loss": 0.3759, + "step": 7904 + }, + { + "epoch": 0.23, + "grad_norm": 3.26241390287869, + "learning_rate": 8.994361344083863e-06, + "loss": 0.3475, + "step": 7905 + }, + { + "epoch": 0.23, + "grad_norm": 1.467644851994501, + "learning_rate": 8.99407879697586e-06, + "loss": 0.3661, + "step": 7906 + }, + { + "epoch": 0.23, + "grad_norm": 1.6120798127453797, + "learning_rate": 8.993796214620067e-06, + "loss": 0.4082, + "step": 7907 + }, + { + "epoch": 0.23, + "grad_norm": 1.6990810117449557, + "learning_rate": 8.993513597018977e-06, + "loss": 0.3811, + "step": 7908 + }, + { + "epoch": 0.23, + "grad_norm": 1.5211743015991268, + "learning_rate": 8.993230944175086e-06, + "loss": 0.4245, + "step": 7909 + }, + { + "epoch": 0.23, + "grad_norm": 1.4384847276269201, + "learning_rate": 8.992948256090887e-06, + "loss": 0.3795, + "step": 7910 + }, + { + "epoch": 0.23, + "grad_norm": 1.7257367569896638, + "learning_rate": 8.992665532768876e-06, + "loss": 0.3528, + "step": 7911 + }, + { + "epoch": 0.23, + "grad_norm": 1.019428161174006, + "learning_rate": 8.992382774211546e-06, + "loss": 0.6224, + "step": 7912 + }, + { + "epoch": 0.23, + "grad_norm": 1.3596501532241758, + "learning_rate": 8.992099980421396e-06, + "loss": 0.3877, + "step": 7913 + }, + { + "epoch": 0.23, + "grad_norm": 1.519613963478496, + "learning_rate": 8.991817151400916e-06, + "loss": 0.3498, + "step": 7914 + }, + { + "epoch": 0.23, + "grad_norm": 1.507931928512092, + "learning_rate": 8.991534287152607e-06, + "loss": 0.4067, + "step": 7915 + }, + { + "epoch": 0.23, + "grad_norm": 1.4905648242226253, + "learning_rate": 8.991251387678966e-06, + "loss": 0.3827, + "step": 7916 + }, + { + "epoch": 0.23, + "grad_norm": 1.490987945240077, + "learning_rate": 8.990968452982485e-06, + "loss": 0.3492, + "step": 7917 + }, + { + "epoch": 0.23, + "grad_norm": 1.4130339112212529, + "learning_rate": 8.990685483065662e-06, + "loss": 0.3671, + "step": 7918 + }, + { + "epoch": 0.23, + "grad_norm": 1.3265281763081813, + "learning_rate": 8.990402477930997e-06, + "loss": 0.3523, + "step": 7919 + }, + { + "epoch": 0.23, + "grad_norm": 1.4222548152317394, + "learning_rate": 8.990119437580986e-06, + "loss": 0.3831, + "step": 7920 + }, + { + "epoch": 0.23, + "grad_norm": 1.5267005958410467, + "learning_rate": 8.989836362018124e-06, + "loss": 0.3615, + "step": 7921 + }, + { + "epoch": 0.23, + "grad_norm": 1.3901769175589977, + "learning_rate": 8.989553251244916e-06, + "loss": 0.3498, + "step": 7922 + }, + { + "epoch": 0.23, + "grad_norm": 1.4003396418376355, + "learning_rate": 8.989270105263854e-06, + "loss": 0.3867, + "step": 7923 + }, + { + "epoch": 0.23, + "grad_norm": 1.4696512668930148, + "learning_rate": 8.98898692407744e-06, + "loss": 0.3467, + "step": 7924 + }, + { + "epoch": 0.23, + "grad_norm": 1.5597837380625945, + "learning_rate": 8.988703707688171e-06, + "loss": 0.3397, + "step": 7925 + }, + { + "epoch": 0.23, + "grad_norm": 1.2718602421097946, + "learning_rate": 8.988420456098548e-06, + "loss": 0.3527, + "step": 7926 + }, + { + "epoch": 0.23, + "grad_norm": 1.4784306025769702, + "learning_rate": 8.98813716931107e-06, + "loss": 0.3968, + "step": 7927 + }, + { + "epoch": 0.23, + "grad_norm": 1.4342219347728307, + "learning_rate": 8.987853847328239e-06, + "loss": 0.3875, + "step": 7928 + }, + { + "epoch": 0.23, + "grad_norm": 1.7608467653976678, + "learning_rate": 8.98757049015255e-06, + "loss": 0.3849, + "step": 7929 + }, + { + "epoch": 0.23, + "grad_norm": 1.6054897313373957, + "learning_rate": 8.98728709778651e-06, + "loss": 0.4099, + "step": 7930 + }, + { + "epoch": 0.23, + "grad_norm": 1.3959209085407507, + "learning_rate": 8.987003670232616e-06, + "loss": 0.3392, + "step": 7931 + }, + { + "epoch": 0.23, + "grad_norm": 1.7596502892733767, + "learning_rate": 8.986720207493371e-06, + "loss": 0.3587, + "step": 7932 + }, + { + "epoch": 0.23, + "grad_norm": 1.5608495238047309, + "learning_rate": 8.986436709571276e-06, + "loss": 0.3649, + "step": 7933 + }, + { + "epoch": 0.23, + "grad_norm": 1.3906565452700077, + "learning_rate": 8.986153176468832e-06, + "loss": 0.4134, + "step": 7934 + }, + { + "epoch": 0.23, + "grad_norm": 1.5583052035343217, + "learning_rate": 8.985869608188545e-06, + "loss": 0.3887, + "step": 7935 + }, + { + "epoch": 0.23, + "grad_norm": 3.7442982063463806, + "learning_rate": 8.985586004732911e-06, + "loss": 0.3661, + "step": 7936 + }, + { + "epoch": 0.23, + "grad_norm": 1.7565790642172743, + "learning_rate": 8.985302366104437e-06, + "loss": 0.3559, + "step": 7937 + }, + { + "epoch": 0.23, + "grad_norm": 1.6322518388585503, + "learning_rate": 8.985018692305626e-06, + "loss": 0.3768, + "step": 7938 + }, + { + "epoch": 0.23, + "grad_norm": 1.3722853965677928, + "learning_rate": 8.98473498333898e-06, + "loss": 0.3827, + "step": 7939 + }, + { + "epoch": 0.23, + "grad_norm": 1.3690152639856388, + "learning_rate": 8.984451239207003e-06, + "loss": 0.3456, + "step": 7940 + }, + { + "epoch": 0.23, + "grad_norm": 1.482866518802983, + "learning_rate": 8.984167459912201e-06, + "loss": 0.3714, + "step": 7941 + }, + { + "epoch": 0.23, + "grad_norm": 1.4557424240658052, + "learning_rate": 8.983883645457079e-06, + "loss": 0.3707, + "step": 7942 + }, + { + "epoch": 0.23, + "grad_norm": 1.444141423243988, + "learning_rate": 8.983599795844136e-06, + "loss": 0.3715, + "step": 7943 + }, + { + "epoch": 0.23, + "grad_norm": 1.3576850846503032, + "learning_rate": 8.983315911075882e-06, + "loss": 0.3693, + "step": 7944 + }, + { + "epoch": 0.23, + "grad_norm": 1.8326877624757363, + "learning_rate": 8.98303199115482e-06, + "loss": 0.3761, + "step": 7945 + }, + { + "epoch": 0.23, + "grad_norm": 1.5081049036912142, + "learning_rate": 8.982748036083459e-06, + "loss": 0.3729, + "step": 7946 + }, + { + "epoch": 0.23, + "grad_norm": 1.4666564532477746, + "learning_rate": 8.9824640458643e-06, + "loss": 0.3693, + "step": 7947 + }, + { + "epoch": 0.23, + "grad_norm": 1.8598165934279642, + "learning_rate": 8.982180020499853e-06, + "loss": 0.4102, + "step": 7948 + }, + { + "epoch": 0.23, + "grad_norm": 2.153477392268059, + "learning_rate": 8.981895959992623e-06, + "loss": 0.3671, + "step": 7949 + }, + { + "epoch": 0.23, + "grad_norm": 1.778505586125173, + "learning_rate": 8.981611864345115e-06, + "loss": 0.3773, + "step": 7950 + }, + { + "epoch": 0.23, + "grad_norm": 1.4065604065349144, + "learning_rate": 8.98132773355984e-06, + "loss": 0.4043, + "step": 7951 + }, + { + "epoch": 0.23, + "grad_norm": 1.408812767459446, + "learning_rate": 8.981043567639304e-06, + "loss": 0.3643, + "step": 7952 + }, + { + "epoch": 0.23, + "grad_norm": 1.562214143307604, + "learning_rate": 8.980759366586014e-06, + "loss": 0.4173, + "step": 7953 + }, + { + "epoch": 0.23, + "grad_norm": 1.3344768299425598, + "learning_rate": 8.980475130402477e-06, + "loss": 0.3609, + "step": 7954 + }, + { + "epoch": 0.23, + "grad_norm": 1.330475896733943, + "learning_rate": 8.980190859091205e-06, + "loss": 0.373, + "step": 7955 + }, + { + "epoch": 0.23, + "grad_norm": 1.6303571537689547, + "learning_rate": 8.979906552654702e-06, + "loss": 0.3745, + "step": 7956 + }, + { + "epoch": 0.23, + "grad_norm": 1.7354268622895908, + "learning_rate": 8.97962221109548e-06, + "loss": 0.3626, + "step": 7957 + }, + { + "epoch": 0.23, + "grad_norm": 1.474168312687979, + "learning_rate": 8.97933783441605e-06, + "loss": 0.3743, + "step": 7958 + }, + { + "epoch": 0.23, + "grad_norm": 2.0880838821024237, + "learning_rate": 8.979053422618917e-06, + "loss": 0.3863, + "step": 7959 + }, + { + "epoch": 0.23, + "grad_norm": 1.5658962426554017, + "learning_rate": 8.978768975706595e-06, + "loss": 0.3773, + "step": 7960 + }, + { + "epoch": 0.23, + "grad_norm": 1.492159166270387, + "learning_rate": 8.978484493681593e-06, + "loss": 0.3788, + "step": 7961 + }, + { + "epoch": 0.23, + "grad_norm": 1.5616254236129228, + "learning_rate": 8.97819997654642e-06, + "loss": 0.3423, + "step": 7962 + }, + { + "epoch": 0.23, + "grad_norm": 4.434260604404199, + "learning_rate": 8.97791542430359e-06, + "loss": 0.3758, + "step": 7963 + }, + { + "epoch": 0.23, + "grad_norm": 1.0592191346062365, + "learning_rate": 8.977630836955609e-06, + "loss": 0.6069, + "step": 7964 + }, + { + "epoch": 0.23, + "grad_norm": 1.5255109808873997, + "learning_rate": 8.977346214504994e-06, + "loss": 0.3641, + "step": 7965 + }, + { + "epoch": 0.23, + "grad_norm": 1.5473635357646545, + "learning_rate": 8.977061556954254e-06, + "loss": 0.3772, + "step": 7966 + }, + { + "epoch": 0.23, + "grad_norm": 1.6713262865392493, + "learning_rate": 8.976776864305904e-06, + "loss": 0.3545, + "step": 7967 + }, + { + "epoch": 0.23, + "grad_norm": 2.1426539350872984, + "learning_rate": 8.976492136562449e-06, + "loss": 0.3659, + "step": 7968 + }, + { + "epoch": 0.23, + "grad_norm": 1.833871847796568, + "learning_rate": 8.976207373726411e-06, + "loss": 0.3674, + "step": 7969 + }, + { + "epoch": 0.23, + "grad_norm": 1.387360093531574, + "learning_rate": 8.975922575800297e-06, + "loss": 0.3384, + "step": 7970 + }, + { + "epoch": 0.23, + "grad_norm": 1.3967454693553052, + "learning_rate": 8.975637742786621e-06, + "loss": 0.3581, + "step": 7971 + }, + { + "epoch": 0.23, + "grad_norm": 1.7370482874917275, + "learning_rate": 8.9753528746879e-06, + "loss": 0.3597, + "step": 7972 + }, + { + "epoch": 0.23, + "grad_norm": 1.7202427113355006, + "learning_rate": 8.975067971506646e-06, + "loss": 0.4122, + "step": 7973 + }, + { + "epoch": 0.23, + "grad_norm": 1.521981771059281, + "learning_rate": 8.974783033245372e-06, + "loss": 0.3544, + "step": 7974 + }, + { + "epoch": 0.23, + "grad_norm": 1.4808461578119556, + "learning_rate": 8.974498059906593e-06, + "loss": 0.363, + "step": 7975 + }, + { + "epoch": 0.23, + "grad_norm": 1.624347194555401, + "learning_rate": 8.974213051492825e-06, + "loss": 0.3587, + "step": 7976 + }, + { + "epoch": 0.23, + "grad_norm": 1.4367645770516073, + "learning_rate": 8.973928008006583e-06, + "loss": 0.3515, + "step": 7977 + }, + { + "epoch": 0.23, + "grad_norm": 1.373909883139435, + "learning_rate": 8.973642929450383e-06, + "loss": 0.369, + "step": 7978 + }, + { + "epoch": 0.23, + "grad_norm": 1.4615336023016714, + "learning_rate": 8.97335781582674e-06, + "loss": 0.3891, + "step": 7979 + }, + { + "epoch": 0.23, + "grad_norm": 1.5562565549587215, + "learning_rate": 8.973072667138168e-06, + "loss": 0.3533, + "step": 7980 + }, + { + "epoch": 0.23, + "grad_norm": 1.3943107911690986, + "learning_rate": 8.97278748338719e-06, + "loss": 0.3551, + "step": 7981 + }, + { + "epoch": 0.23, + "grad_norm": 1.749003841140715, + "learning_rate": 8.972502264576313e-06, + "loss": 0.4046, + "step": 7982 + }, + { + "epoch": 0.23, + "grad_norm": 1.4377360969407906, + "learning_rate": 8.972217010708062e-06, + "loss": 0.358, + "step": 7983 + }, + { + "epoch": 0.23, + "grad_norm": 1.7439892128212757, + "learning_rate": 8.971931721784953e-06, + "loss": 0.3749, + "step": 7984 + }, + { + "epoch": 0.23, + "grad_norm": 1.3808747633043024, + "learning_rate": 8.971646397809501e-06, + "loss": 0.3461, + "step": 7985 + }, + { + "epoch": 0.23, + "grad_norm": 1.3918328798630508, + "learning_rate": 8.971361038784226e-06, + "loss": 0.3716, + "step": 7986 + }, + { + "epoch": 0.23, + "grad_norm": 2.883829575898193, + "learning_rate": 8.971075644711645e-06, + "loss": 0.3685, + "step": 7987 + }, + { + "epoch": 0.23, + "grad_norm": 1.6305666525173932, + "learning_rate": 8.970790215594278e-06, + "loss": 0.4573, + "step": 7988 + }, + { + "epoch": 0.23, + "grad_norm": 1.07578538979529, + "learning_rate": 8.970504751434643e-06, + "loss": 0.5852, + "step": 7989 + }, + { + "epoch": 0.23, + "grad_norm": 2.077931415442755, + "learning_rate": 8.97021925223526e-06, + "loss": 0.3915, + "step": 7990 + }, + { + "epoch": 0.23, + "grad_norm": 2.5877149629710887, + "learning_rate": 8.969933717998647e-06, + "loss": 0.3926, + "step": 7991 + }, + { + "epoch": 0.23, + "grad_norm": 1.509321155538605, + "learning_rate": 8.969648148727326e-06, + "loss": 0.372, + "step": 7992 + }, + { + "epoch": 0.23, + "grad_norm": 1.5824509993547988, + "learning_rate": 8.969362544423817e-06, + "loss": 0.3716, + "step": 7993 + }, + { + "epoch": 0.23, + "grad_norm": 1.5571447554720468, + "learning_rate": 8.969076905090638e-06, + "loss": 0.4039, + "step": 7994 + }, + { + "epoch": 0.23, + "grad_norm": 1.8143808507567634, + "learning_rate": 8.968791230730311e-06, + "loss": 0.3649, + "step": 7995 + }, + { + "epoch": 0.23, + "grad_norm": 1.670993995670931, + "learning_rate": 8.968505521345358e-06, + "loss": 0.3629, + "step": 7996 + }, + { + "epoch": 0.23, + "grad_norm": 1.4656141154670426, + "learning_rate": 8.968219776938301e-06, + "loss": 0.3405, + "step": 7997 + }, + { + "epoch": 0.23, + "grad_norm": 1.4112461808498915, + "learning_rate": 8.967933997511657e-06, + "loss": 0.369, + "step": 7998 + }, + { + "epoch": 0.23, + "grad_norm": 1.2625565822539424, + "learning_rate": 8.967648183067954e-06, + "loss": 0.3537, + "step": 7999 + }, + { + "epoch": 0.23, + "grad_norm": 1.5357513847717577, + "learning_rate": 8.967362333609712e-06, + "loss": 0.3494, + "step": 8000 + }, + { + "epoch": 0.23, + "grad_norm": 2.031978261024263, + "learning_rate": 8.967076449139454e-06, + "loss": 0.368, + "step": 8001 + }, + { + "epoch": 0.23, + "grad_norm": 1.6938062782410697, + "learning_rate": 8.9667905296597e-06, + "loss": 0.3824, + "step": 8002 + }, + { + "epoch": 0.23, + "grad_norm": 1.9096144185640622, + "learning_rate": 8.966504575172977e-06, + "loss": 0.3573, + "step": 8003 + }, + { + "epoch": 0.23, + "grad_norm": 1.447734533683254, + "learning_rate": 8.966218585681807e-06, + "loss": 0.3623, + "step": 8004 + }, + { + "epoch": 0.23, + "grad_norm": 2.1273699768892156, + "learning_rate": 8.965932561188712e-06, + "loss": 0.3458, + "step": 8005 + }, + { + "epoch": 0.23, + "grad_norm": 1.4454911996180864, + "learning_rate": 8.96564650169622e-06, + "loss": 0.3818, + "step": 8006 + }, + { + "epoch": 0.23, + "grad_norm": 1.4723818641231055, + "learning_rate": 8.965360407206854e-06, + "loss": 0.3501, + "step": 8007 + }, + { + "epoch": 0.23, + "grad_norm": 1.766463758946381, + "learning_rate": 8.965074277723139e-06, + "loss": 0.3512, + "step": 8008 + }, + { + "epoch": 0.23, + "grad_norm": 1.45321866152737, + "learning_rate": 8.964788113247598e-06, + "loss": 0.3634, + "step": 8009 + }, + { + "epoch": 0.23, + "grad_norm": 1.7833110903086788, + "learning_rate": 8.964501913782758e-06, + "loss": 0.4007, + "step": 8010 + }, + { + "epoch": 0.23, + "grad_norm": 1.6562779797434533, + "learning_rate": 8.964215679331144e-06, + "loss": 0.4202, + "step": 8011 + }, + { + "epoch": 0.23, + "grad_norm": 1.4305508850045963, + "learning_rate": 8.963929409895284e-06, + "loss": 0.3494, + "step": 8012 + }, + { + "epoch": 0.23, + "grad_norm": 2.76189806681156, + "learning_rate": 8.963643105477704e-06, + "loss": 0.3983, + "step": 8013 + }, + { + "epoch": 0.23, + "grad_norm": 1.3510925329887076, + "learning_rate": 8.963356766080929e-06, + "loss": 0.3864, + "step": 8014 + }, + { + "epoch": 0.23, + "grad_norm": 1.6162503404194528, + "learning_rate": 8.963070391707485e-06, + "loss": 0.3537, + "step": 8015 + }, + { + "epoch": 0.23, + "grad_norm": 1.7160507106988638, + "learning_rate": 8.962783982359901e-06, + "loss": 0.3749, + "step": 8016 + }, + { + "epoch": 0.23, + "grad_norm": 1.553847620867148, + "learning_rate": 8.962497538040705e-06, + "loss": 0.3704, + "step": 8017 + }, + { + "epoch": 0.23, + "grad_norm": 1.7433322919313432, + "learning_rate": 8.962211058752427e-06, + "loss": 0.3694, + "step": 8018 + }, + { + "epoch": 0.23, + "grad_norm": 1.3979397201577275, + "learning_rate": 8.961924544497588e-06, + "loss": 0.3542, + "step": 8019 + }, + { + "epoch": 0.23, + "grad_norm": 1.3620512805360494, + "learning_rate": 8.961637995278723e-06, + "loss": 0.349, + "step": 8020 + }, + { + "epoch": 0.23, + "grad_norm": 1.3832835512828003, + "learning_rate": 8.961351411098359e-06, + "loss": 0.345, + "step": 8021 + }, + { + "epoch": 0.23, + "grad_norm": 2.42446423680915, + "learning_rate": 8.961064791959024e-06, + "loss": 0.3668, + "step": 8022 + }, + { + "epoch": 0.23, + "grad_norm": 2.2559135875289007, + "learning_rate": 8.960778137863249e-06, + "loss": 0.3559, + "step": 8023 + }, + { + "epoch": 0.23, + "grad_norm": 1.354557808179743, + "learning_rate": 8.960491448813562e-06, + "loss": 0.3843, + "step": 8024 + }, + { + "epoch": 0.23, + "grad_norm": 1.4311207184680212, + "learning_rate": 8.960204724812493e-06, + "loss": 0.3816, + "step": 8025 + }, + { + "epoch": 0.23, + "grad_norm": 1.3322212600818928, + "learning_rate": 8.959917965862576e-06, + "loss": 0.351, + "step": 8026 + }, + { + "epoch": 0.23, + "grad_norm": 1.6564903127491626, + "learning_rate": 8.95963117196634e-06, + "loss": 0.3483, + "step": 8027 + }, + { + "epoch": 0.23, + "grad_norm": 1.4108968532996202, + "learning_rate": 8.95934434312631e-06, + "loss": 0.3742, + "step": 8028 + }, + { + "epoch": 0.23, + "grad_norm": 1.3769484529105127, + "learning_rate": 8.959057479345026e-06, + "loss": 0.3542, + "step": 8029 + }, + { + "epoch": 0.23, + "grad_norm": 1.682101900997127, + "learning_rate": 8.958770580625016e-06, + "loss": 0.3725, + "step": 8030 + }, + { + "epoch": 0.23, + "grad_norm": 1.4851334182379619, + "learning_rate": 8.958483646968811e-06, + "loss": 0.3555, + "step": 8031 + }, + { + "epoch": 0.23, + "grad_norm": 8.877660389525447, + "learning_rate": 8.958196678378944e-06, + "loss": 0.3951, + "step": 8032 + }, + { + "epoch": 0.23, + "grad_norm": 2.308727463652637, + "learning_rate": 8.957909674857947e-06, + "loss": 0.3686, + "step": 8033 + }, + { + "epoch": 0.23, + "grad_norm": 1.8245394126699706, + "learning_rate": 8.957622636408351e-06, + "loss": 0.3764, + "step": 8034 + }, + { + "epoch": 0.23, + "grad_norm": 1.5697105505334803, + "learning_rate": 8.957335563032694e-06, + "loss": 0.372, + "step": 8035 + }, + { + "epoch": 0.23, + "grad_norm": 1.5610935917800401, + "learning_rate": 8.957048454733507e-06, + "loss": 0.387, + "step": 8036 + }, + { + "epoch": 0.23, + "grad_norm": 5.355868840752629, + "learning_rate": 8.956761311513323e-06, + "loss": 0.3709, + "step": 8037 + }, + { + "epoch": 0.23, + "grad_norm": 1.5067314675671026, + "learning_rate": 8.956474133374677e-06, + "loss": 0.3546, + "step": 8038 + }, + { + "epoch": 0.23, + "grad_norm": 1.4725491119753409, + "learning_rate": 8.956186920320102e-06, + "loss": 0.3457, + "step": 8039 + }, + { + "epoch": 0.23, + "grad_norm": 1.5328470875819513, + "learning_rate": 8.955899672352133e-06, + "loss": 0.3852, + "step": 8040 + }, + { + "epoch": 0.23, + "grad_norm": 1.4703157236159994, + "learning_rate": 8.955612389473307e-06, + "loss": 0.3918, + "step": 8041 + }, + { + "epoch": 0.23, + "grad_norm": 1.4108754061495303, + "learning_rate": 8.955325071686159e-06, + "loss": 0.3559, + "step": 8042 + }, + { + "epoch": 0.23, + "grad_norm": 1.2141935546823037, + "learning_rate": 8.95503771899322e-06, + "loss": 0.3474, + "step": 8043 + }, + { + "epoch": 0.23, + "grad_norm": 2.5934353135629324, + "learning_rate": 8.95475033139703e-06, + "loss": 0.3604, + "step": 8044 + }, + { + "epoch": 0.23, + "grad_norm": 1.3829279545095496, + "learning_rate": 8.954462908900125e-06, + "loss": 0.3746, + "step": 8045 + }, + { + "epoch": 0.23, + "grad_norm": 1.4839119521287576, + "learning_rate": 8.954175451505042e-06, + "loss": 0.3748, + "step": 8046 + }, + { + "epoch": 0.23, + "grad_norm": 1.4536835468647766, + "learning_rate": 8.953887959214316e-06, + "loss": 0.3532, + "step": 8047 + }, + { + "epoch": 0.23, + "grad_norm": 1.7826515735885318, + "learning_rate": 8.953600432030485e-06, + "loss": 0.3719, + "step": 8048 + }, + { + "epoch": 0.23, + "grad_norm": 1.3375487391717762, + "learning_rate": 8.953312869956085e-06, + "loss": 0.4033, + "step": 8049 + }, + { + "epoch": 0.23, + "grad_norm": 1.4072128638488572, + "learning_rate": 8.953025272993658e-06, + "loss": 0.3473, + "step": 8050 + }, + { + "epoch": 0.23, + "grad_norm": 0.9422749317704746, + "learning_rate": 8.952737641145737e-06, + "loss": 0.5891, + "step": 8051 + }, + { + "epoch": 0.23, + "grad_norm": 1.5600118791519686, + "learning_rate": 8.952449974414863e-06, + "loss": 0.3831, + "step": 8052 + }, + { + "epoch": 0.23, + "grad_norm": 1.5614097007575343, + "learning_rate": 8.952162272803572e-06, + "loss": 0.3599, + "step": 8053 + }, + { + "epoch": 0.23, + "grad_norm": 1.3912953769869674, + "learning_rate": 8.951874536314408e-06, + "loss": 0.33, + "step": 8054 + }, + { + "epoch": 0.23, + "grad_norm": 1.4358081016231905, + "learning_rate": 8.951586764949907e-06, + "loss": 0.382, + "step": 8055 + }, + { + "epoch": 0.23, + "grad_norm": 1.2764251570743697, + "learning_rate": 8.951298958712608e-06, + "loss": 0.3431, + "step": 8056 + }, + { + "epoch": 0.23, + "grad_norm": 1.374267006974668, + "learning_rate": 8.951011117605052e-06, + "loss": 0.3694, + "step": 8057 + }, + { + "epoch": 0.23, + "grad_norm": 1.4030577813783205, + "learning_rate": 8.950723241629779e-06, + "loss": 0.3547, + "step": 8058 + }, + { + "epoch": 0.23, + "grad_norm": 1.4183069198250648, + "learning_rate": 8.950435330789331e-06, + "loss": 0.3858, + "step": 8059 + }, + { + "epoch": 0.23, + "grad_norm": 1.4404602289817223, + "learning_rate": 8.950147385086246e-06, + "loss": 0.3565, + "step": 8060 + }, + { + "epoch": 0.23, + "grad_norm": 1.293032159858931, + "learning_rate": 8.949859404523068e-06, + "loss": 0.3613, + "step": 8061 + }, + { + "epoch": 0.23, + "grad_norm": 1.5203062498306692, + "learning_rate": 8.949571389102334e-06, + "loss": 0.3813, + "step": 8062 + }, + { + "epoch": 0.23, + "grad_norm": 1.4208652798926995, + "learning_rate": 8.94928333882659e-06, + "loss": 0.3567, + "step": 8063 + }, + { + "epoch": 0.23, + "grad_norm": 1.4117573223916806, + "learning_rate": 8.948995253698378e-06, + "loss": 0.3664, + "step": 8064 + }, + { + "epoch": 0.23, + "grad_norm": 1.400673311061508, + "learning_rate": 8.948707133720238e-06, + "loss": 0.3533, + "step": 8065 + }, + { + "epoch": 0.23, + "grad_norm": 1.3880481256794706, + "learning_rate": 8.948418978894715e-06, + "loss": 0.383, + "step": 8066 + }, + { + "epoch": 0.23, + "grad_norm": 1.5279272366413421, + "learning_rate": 8.948130789224348e-06, + "loss": 0.3578, + "step": 8067 + }, + { + "epoch": 0.23, + "grad_norm": 1.3771503722427503, + "learning_rate": 8.947842564711685e-06, + "loss": 0.3799, + "step": 8068 + }, + { + "epoch": 0.23, + "grad_norm": 1.8830737653295873, + "learning_rate": 8.947554305359267e-06, + "loss": 0.3527, + "step": 8069 + }, + { + "epoch": 0.23, + "grad_norm": 1.8105189454933808, + "learning_rate": 8.947266011169639e-06, + "loss": 0.3613, + "step": 8070 + }, + { + "epoch": 0.23, + "grad_norm": 1.3285412506830578, + "learning_rate": 8.946977682145344e-06, + "loss": 0.3622, + "step": 8071 + }, + { + "epoch": 0.23, + "grad_norm": 1.3859203803441305, + "learning_rate": 8.946689318288929e-06, + "loss": 0.3787, + "step": 8072 + }, + { + "epoch": 0.23, + "grad_norm": 1.9273213617906528, + "learning_rate": 8.946400919602933e-06, + "loss": 0.3908, + "step": 8073 + }, + { + "epoch": 0.23, + "grad_norm": 1.3820078680376282, + "learning_rate": 8.946112486089908e-06, + "loss": 0.3776, + "step": 8074 + }, + { + "epoch": 0.23, + "grad_norm": 1.5077538998700801, + "learning_rate": 8.945824017752397e-06, + "loss": 0.3738, + "step": 8075 + }, + { + "epoch": 0.23, + "grad_norm": 1.4593095134838112, + "learning_rate": 8.945535514592944e-06, + "loss": 0.386, + "step": 8076 + }, + { + "epoch": 0.23, + "grad_norm": 1.8577029726983008, + "learning_rate": 8.945246976614097e-06, + "loss": 0.3684, + "step": 8077 + }, + { + "epoch": 0.23, + "grad_norm": 1.4225683849135622, + "learning_rate": 8.9449584038184e-06, + "loss": 0.3696, + "step": 8078 + }, + { + "epoch": 0.23, + "grad_norm": 1.6716049247831382, + "learning_rate": 8.944669796208403e-06, + "loss": 0.3545, + "step": 8079 + }, + { + "epoch": 0.23, + "grad_norm": 1.4979147419056709, + "learning_rate": 8.944381153786651e-06, + "loss": 0.3759, + "step": 8080 + }, + { + "epoch": 0.23, + "grad_norm": 1.5442492502402358, + "learning_rate": 8.944092476555692e-06, + "loss": 0.3754, + "step": 8081 + }, + { + "epoch": 0.23, + "grad_norm": 1.3563487017965223, + "learning_rate": 8.943803764518073e-06, + "loss": 0.3581, + "step": 8082 + }, + { + "epoch": 0.23, + "grad_norm": 1.8183745193121157, + "learning_rate": 8.94351501767634e-06, + "loss": 0.3603, + "step": 8083 + }, + { + "epoch": 0.23, + "grad_norm": 1.495218697890073, + "learning_rate": 8.943226236033046e-06, + "loss": 0.3957, + "step": 8084 + }, + { + "epoch": 0.23, + "grad_norm": 1.7580582212087275, + "learning_rate": 8.942937419590737e-06, + "loss": 0.3462, + "step": 8085 + }, + { + "epoch": 0.23, + "grad_norm": 1.348098775938002, + "learning_rate": 8.94264856835196e-06, + "loss": 0.3386, + "step": 8086 + }, + { + "epoch": 0.23, + "grad_norm": 1.2914596002534433, + "learning_rate": 8.942359682319265e-06, + "loss": 0.375, + "step": 8087 + }, + { + "epoch": 0.23, + "grad_norm": 1.0762269852299458, + "learning_rate": 8.942070761495203e-06, + "loss": 0.6153, + "step": 8088 + }, + { + "epoch": 0.23, + "grad_norm": 1.4904590866359189, + "learning_rate": 8.941781805882323e-06, + "loss": 0.3847, + "step": 8089 + }, + { + "epoch": 0.23, + "grad_norm": 5.068200929707786, + "learning_rate": 8.941492815483174e-06, + "loss": 0.3991, + "step": 8090 + }, + { + "epoch": 0.23, + "grad_norm": 1.3745108246865971, + "learning_rate": 8.941203790300308e-06, + "loss": 0.3572, + "step": 8091 + }, + { + "epoch": 0.23, + "grad_norm": 1.5597406241368843, + "learning_rate": 8.940914730336274e-06, + "loss": 0.3968, + "step": 8092 + }, + { + "epoch": 0.23, + "grad_norm": 1.3318327276734794, + "learning_rate": 8.940625635593626e-06, + "loss": 0.3926, + "step": 8093 + }, + { + "epoch": 0.23, + "grad_norm": 1.4461277544228361, + "learning_rate": 8.94033650607491e-06, + "loss": 0.3593, + "step": 8094 + }, + { + "epoch": 0.23, + "grad_norm": 1.4953858255855021, + "learning_rate": 8.940047341782683e-06, + "loss": 0.3673, + "step": 8095 + }, + { + "epoch": 0.23, + "grad_norm": 1.5151514570949725, + "learning_rate": 8.939758142719492e-06, + "loss": 0.3638, + "step": 8096 + }, + { + "epoch": 0.23, + "grad_norm": 0.934708355528173, + "learning_rate": 8.939468908887895e-06, + "loss": 0.6007, + "step": 8097 + }, + { + "epoch": 0.23, + "grad_norm": 1.6517947504739927, + "learning_rate": 8.939179640290438e-06, + "loss": 0.3738, + "step": 8098 + }, + { + "epoch": 0.23, + "grad_norm": 2.4458323973990512, + "learning_rate": 8.938890336929677e-06, + "loss": 0.3841, + "step": 8099 + }, + { + "epoch": 0.23, + "grad_norm": 1.373335571523036, + "learning_rate": 8.938600998808168e-06, + "loss": 0.3645, + "step": 8100 + }, + { + "epoch": 0.23, + "grad_norm": 1.4183097301187264, + "learning_rate": 8.93831162592846e-06, + "loss": 0.3833, + "step": 8101 + }, + { + "epoch": 0.23, + "grad_norm": 1.3502827798982708, + "learning_rate": 8.938022218293108e-06, + "loss": 0.3767, + "step": 8102 + }, + { + "epoch": 0.24, + "grad_norm": 1.5498586680800102, + "learning_rate": 8.937732775904667e-06, + "loss": 0.4083, + "step": 8103 + }, + { + "epoch": 0.24, + "grad_norm": 1.3178360870317145, + "learning_rate": 8.93744329876569e-06, + "loss": 0.3517, + "step": 8104 + }, + { + "epoch": 0.24, + "grad_norm": 1.7140225888573901, + "learning_rate": 8.93715378687873e-06, + "loss": 0.3656, + "step": 8105 + }, + { + "epoch": 0.24, + "grad_norm": 1.691504960452923, + "learning_rate": 8.936864240246347e-06, + "loss": 0.344, + "step": 8106 + }, + { + "epoch": 0.24, + "grad_norm": 1.6074145224502208, + "learning_rate": 8.936574658871094e-06, + "loss": 0.3392, + "step": 8107 + }, + { + "epoch": 0.24, + "grad_norm": 1.483083860584976, + "learning_rate": 8.936285042755523e-06, + "loss": 0.3688, + "step": 8108 + }, + { + "epoch": 0.24, + "grad_norm": 1.8235642831291237, + "learning_rate": 8.935995391902194e-06, + "loss": 0.3522, + "step": 8109 + }, + { + "epoch": 0.24, + "grad_norm": 1.5160723696940421, + "learning_rate": 8.935705706313663e-06, + "loss": 0.3974, + "step": 8110 + }, + { + "epoch": 0.24, + "grad_norm": 1.3182788147500446, + "learning_rate": 8.935415985992485e-06, + "loss": 0.3527, + "step": 8111 + }, + { + "epoch": 0.24, + "grad_norm": 1.310397849412012, + "learning_rate": 8.935126230941219e-06, + "loss": 0.3513, + "step": 8112 + }, + { + "epoch": 0.24, + "grad_norm": 1.2885222355576393, + "learning_rate": 8.934836441162418e-06, + "loss": 0.3429, + "step": 8113 + }, + { + "epoch": 0.24, + "grad_norm": 1.673304668198259, + "learning_rate": 8.934546616658642e-06, + "loss": 0.3659, + "step": 8114 + }, + { + "epoch": 0.24, + "grad_norm": 1.7597709173733898, + "learning_rate": 8.934256757432448e-06, + "loss": 0.3507, + "step": 8115 + }, + { + "epoch": 0.24, + "grad_norm": 2.064691787262311, + "learning_rate": 8.933966863486396e-06, + "loss": 0.3663, + "step": 8116 + }, + { + "epoch": 0.24, + "grad_norm": 1.5388326960492813, + "learning_rate": 8.933676934823042e-06, + "loss": 0.3673, + "step": 8117 + }, + { + "epoch": 0.24, + "grad_norm": 1.4807272120172283, + "learning_rate": 8.933386971444945e-06, + "loss": 0.3536, + "step": 8118 + }, + { + "epoch": 0.24, + "grad_norm": 1.4874949822461194, + "learning_rate": 8.933096973354665e-06, + "loss": 0.3517, + "step": 8119 + }, + { + "epoch": 0.24, + "grad_norm": 2.0622710936409647, + "learning_rate": 8.93280694055476e-06, + "loss": 0.3557, + "step": 8120 + }, + { + "epoch": 0.24, + "grad_norm": 1.3791076383499805, + "learning_rate": 8.932516873047792e-06, + "loss": 0.3755, + "step": 8121 + }, + { + "epoch": 0.24, + "grad_norm": 1.3745581666948143, + "learning_rate": 8.932226770836316e-06, + "loss": 0.3496, + "step": 8122 + }, + { + "epoch": 0.24, + "grad_norm": 1.7826428149127198, + "learning_rate": 8.931936633922897e-06, + "loss": 0.4014, + "step": 8123 + }, + { + "epoch": 0.24, + "grad_norm": 1.3985852901356937, + "learning_rate": 8.931646462310093e-06, + "loss": 0.41, + "step": 8124 + }, + { + "epoch": 0.24, + "grad_norm": 1.474997239493543, + "learning_rate": 8.931356256000465e-06, + "loss": 0.3547, + "step": 8125 + }, + { + "epoch": 0.24, + "grad_norm": 1.4858724178412102, + "learning_rate": 8.931066014996576e-06, + "loss": 0.3589, + "step": 8126 + }, + { + "epoch": 0.24, + "grad_norm": 1.413450771517689, + "learning_rate": 8.930775739300984e-06, + "loss": 0.3756, + "step": 8127 + }, + { + "epoch": 0.24, + "grad_norm": 1.9867433101909426, + "learning_rate": 8.930485428916254e-06, + "loss": 0.3278, + "step": 8128 + }, + { + "epoch": 0.24, + "grad_norm": 1.853279637942278, + "learning_rate": 8.930195083844946e-06, + "loss": 0.3571, + "step": 8129 + }, + { + "epoch": 0.24, + "grad_norm": 1.4797920515155776, + "learning_rate": 8.929904704089622e-06, + "loss": 0.383, + "step": 8130 + }, + { + "epoch": 0.24, + "grad_norm": 1.6053433385583034, + "learning_rate": 8.929614289652845e-06, + "loss": 0.3656, + "step": 8131 + }, + { + "epoch": 0.24, + "grad_norm": 1.5675843733102421, + "learning_rate": 8.929323840537181e-06, + "loss": 0.3933, + "step": 8132 + }, + { + "epoch": 0.24, + "grad_norm": 2.329091258040536, + "learning_rate": 8.929033356745187e-06, + "loss": 0.3239, + "step": 8133 + }, + { + "epoch": 0.24, + "grad_norm": 1.7093680626727896, + "learning_rate": 8.928742838279432e-06, + "loss": 0.4016, + "step": 8134 + }, + { + "epoch": 0.24, + "grad_norm": 1.0249774371650262, + "learning_rate": 8.92845228514248e-06, + "loss": 0.6207, + "step": 8135 + }, + { + "epoch": 0.24, + "grad_norm": 1.4124636549164629, + "learning_rate": 8.92816169733689e-06, + "loss": 0.3696, + "step": 8136 + }, + { + "epoch": 0.24, + "grad_norm": 1.8518609747592734, + "learning_rate": 8.92787107486523e-06, + "loss": 0.3982, + "step": 8137 + }, + { + "epoch": 0.24, + "grad_norm": 1.340063666795821, + "learning_rate": 8.927580417730064e-06, + "loss": 0.3726, + "step": 8138 + }, + { + "epoch": 0.24, + "grad_norm": 1.515313010944373, + "learning_rate": 8.927289725933956e-06, + "loss": 0.3575, + "step": 8139 + }, + { + "epoch": 0.24, + "grad_norm": 1.564226880796304, + "learning_rate": 8.926998999479475e-06, + "loss": 0.3632, + "step": 8140 + }, + { + "epoch": 0.24, + "grad_norm": 1.4951437143199886, + "learning_rate": 8.926708238369184e-06, + "loss": 0.3668, + "step": 8141 + }, + { + "epoch": 0.24, + "grad_norm": 1.565189716299746, + "learning_rate": 8.926417442605648e-06, + "loss": 0.3992, + "step": 8142 + }, + { + "epoch": 0.24, + "grad_norm": 1.3799266698924955, + "learning_rate": 8.926126612191437e-06, + "loss": 0.3855, + "step": 8143 + }, + { + "epoch": 0.24, + "grad_norm": 1.3937475080446309, + "learning_rate": 8.925835747129112e-06, + "loss": 0.3264, + "step": 8144 + }, + { + "epoch": 0.24, + "grad_norm": 1.5033946529794164, + "learning_rate": 8.925544847421244e-06, + "loss": 0.3835, + "step": 8145 + }, + { + "epoch": 0.24, + "grad_norm": 1.4350073094045526, + "learning_rate": 8.9252539130704e-06, + "loss": 0.3979, + "step": 8146 + }, + { + "epoch": 0.24, + "grad_norm": 2.3522392159871743, + "learning_rate": 8.924962944079145e-06, + "loss": 0.3789, + "step": 8147 + }, + { + "epoch": 0.24, + "grad_norm": 1.3513493327770634, + "learning_rate": 8.92467194045005e-06, + "loss": 0.3772, + "step": 8148 + }, + { + "epoch": 0.24, + "grad_norm": 1.338110867107777, + "learning_rate": 8.92438090218568e-06, + "loss": 0.352, + "step": 8149 + }, + { + "epoch": 0.24, + "grad_norm": 1.3483296908703155, + "learning_rate": 8.924089829288606e-06, + "loss": 0.3601, + "step": 8150 + }, + { + "epoch": 0.24, + "grad_norm": 1.7881739536063463, + "learning_rate": 8.923798721761398e-06, + "loss": 0.3889, + "step": 8151 + }, + { + "epoch": 0.24, + "grad_norm": 1.6551055292746233, + "learning_rate": 8.92350757960662e-06, + "loss": 0.3956, + "step": 8152 + }, + { + "epoch": 0.24, + "grad_norm": 1.669522595818288, + "learning_rate": 8.923216402826843e-06, + "loss": 0.3558, + "step": 8153 + }, + { + "epoch": 0.24, + "grad_norm": 1.3777922815443566, + "learning_rate": 8.92292519142464e-06, + "loss": 0.3619, + "step": 8154 + }, + { + "epoch": 0.24, + "grad_norm": 1.5651083040960738, + "learning_rate": 8.922633945402578e-06, + "loss": 0.3736, + "step": 8155 + }, + { + "epoch": 0.24, + "grad_norm": 1.6077265712693367, + "learning_rate": 8.92234266476323e-06, + "loss": 0.3789, + "step": 8156 + }, + { + "epoch": 0.24, + "grad_norm": 1.5145965921742615, + "learning_rate": 8.922051349509162e-06, + "loss": 0.353, + "step": 8157 + }, + { + "epoch": 0.24, + "grad_norm": 1.6602287071142339, + "learning_rate": 8.921759999642947e-06, + "loss": 0.4025, + "step": 8158 + }, + { + "epoch": 0.24, + "grad_norm": 1.6232202327419896, + "learning_rate": 8.921468615167158e-06, + "loss": 0.3714, + "step": 8159 + }, + { + "epoch": 0.24, + "grad_norm": 1.4808819562464146, + "learning_rate": 8.921177196084364e-06, + "loss": 0.3768, + "step": 8160 + }, + { + "epoch": 0.24, + "grad_norm": 1.3923800521241025, + "learning_rate": 8.920885742397138e-06, + "loss": 0.3627, + "step": 8161 + }, + { + "epoch": 0.24, + "grad_norm": 1.3506519492619662, + "learning_rate": 8.920594254108052e-06, + "loss": 0.3953, + "step": 8162 + }, + { + "epoch": 0.24, + "grad_norm": 1.5289107108937345, + "learning_rate": 8.92030273121968e-06, + "loss": 0.3584, + "step": 8163 + }, + { + "epoch": 0.24, + "grad_norm": 2.033552986693478, + "learning_rate": 8.920011173734588e-06, + "loss": 0.3685, + "step": 8164 + }, + { + "epoch": 0.24, + "grad_norm": 1.5144358803016462, + "learning_rate": 8.919719581655357e-06, + "loss": 0.3695, + "step": 8165 + }, + { + "epoch": 0.24, + "grad_norm": 1.3660288188538332, + "learning_rate": 8.919427954984559e-06, + "loss": 0.3644, + "step": 8166 + }, + { + "epoch": 0.24, + "grad_norm": 1.460270044954108, + "learning_rate": 8.919136293724762e-06, + "loss": 0.3567, + "step": 8167 + }, + { + "epoch": 0.24, + "grad_norm": 1.4289383537903386, + "learning_rate": 8.918844597878547e-06, + "loss": 0.3595, + "step": 8168 + }, + { + "epoch": 0.24, + "grad_norm": 2.9257754414214494, + "learning_rate": 8.918552867448483e-06, + "loss": 0.3468, + "step": 8169 + }, + { + "epoch": 0.24, + "grad_norm": 1.8468072883045117, + "learning_rate": 8.918261102437146e-06, + "loss": 0.3476, + "step": 8170 + }, + { + "epoch": 0.24, + "grad_norm": 1.4874839225486682, + "learning_rate": 8.917969302847113e-06, + "loss": 0.3431, + "step": 8171 + }, + { + "epoch": 0.24, + "grad_norm": 1.469459613477567, + "learning_rate": 8.917677468680958e-06, + "loss": 0.3567, + "step": 8172 + }, + { + "epoch": 0.24, + "grad_norm": 1.3829449656947308, + "learning_rate": 8.917385599941256e-06, + "loss": 0.3747, + "step": 8173 + }, + { + "epoch": 0.24, + "grad_norm": 1.4019652025522664, + "learning_rate": 8.91709369663058e-06, + "loss": 0.3485, + "step": 8174 + }, + { + "epoch": 0.24, + "grad_norm": 1.4889927209533906, + "learning_rate": 8.91680175875151e-06, + "loss": 0.3889, + "step": 8175 + }, + { + "epoch": 0.24, + "grad_norm": 1.5935292219984807, + "learning_rate": 8.916509786306619e-06, + "loss": 0.3907, + "step": 8176 + }, + { + "epoch": 0.24, + "grad_norm": 1.428081900573338, + "learning_rate": 8.91621777929849e-06, + "loss": 0.3705, + "step": 8177 + }, + { + "epoch": 0.24, + "grad_norm": 1.3086764950399246, + "learning_rate": 8.915925737729692e-06, + "loss": 0.3446, + "step": 8178 + }, + { + "epoch": 0.24, + "grad_norm": 1.6548678270472421, + "learning_rate": 8.915633661602807e-06, + "loss": 0.38, + "step": 8179 + }, + { + "epoch": 0.24, + "grad_norm": 1.4104350851065977, + "learning_rate": 8.915341550920413e-06, + "loss": 0.3505, + "step": 8180 + }, + { + "epoch": 0.24, + "grad_norm": 1.5010447423622246, + "learning_rate": 8.915049405685084e-06, + "loss": 0.3702, + "step": 8181 + }, + { + "epoch": 0.24, + "grad_norm": 1.4994650905931157, + "learning_rate": 8.914757225899402e-06, + "loss": 0.353, + "step": 8182 + }, + { + "epoch": 0.24, + "grad_norm": 1.7604070445520668, + "learning_rate": 8.914465011565945e-06, + "loss": 0.3886, + "step": 8183 + }, + { + "epoch": 0.24, + "grad_norm": 1.7282157683119974, + "learning_rate": 8.91417276268729e-06, + "loss": 0.3506, + "step": 8184 + }, + { + "epoch": 0.24, + "grad_norm": 1.5168989366936132, + "learning_rate": 8.913880479266015e-06, + "loss": 0.3874, + "step": 8185 + }, + { + "epoch": 0.24, + "grad_norm": 1.3589824778280555, + "learning_rate": 8.913588161304703e-06, + "loss": 0.3602, + "step": 8186 + }, + { + "epoch": 0.24, + "grad_norm": 1.3733041785579259, + "learning_rate": 8.913295808805933e-06, + "loss": 0.3637, + "step": 8187 + }, + { + "epoch": 0.24, + "grad_norm": 1.500518235688835, + "learning_rate": 8.913003421772281e-06, + "loss": 0.3621, + "step": 8188 + }, + { + "epoch": 0.24, + "grad_norm": 1.7320593968725921, + "learning_rate": 8.912711000206332e-06, + "loss": 0.3503, + "step": 8189 + }, + { + "epoch": 0.24, + "grad_norm": 5.31746979130642, + "learning_rate": 8.912418544110667e-06, + "loss": 0.3728, + "step": 8190 + }, + { + "epoch": 0.24, + "grad_norm": 1.3579679375962221, + "learning_rate": 8.912126053487864e-06, + "loss": 0.3582, + "step": 8191 + }, + { + "epoch": 0.24, + "grad_norm": 1.7959411381314316, + "learning_rate": 8.911833528340504e-06, + "loss": 0.352, + "step": 8192 + }, + { + "epoch": 0.24, + "grad_norm": 1.6164942231150243, + "learning_rate": 8.91154096867117e-06, + "loss": 0.3901, + "step": 8193 + }, + { + "epoch": 0.24, + "grad_norm": 1.6773340830253254, + "learning_rate": 8.911248374482444e-06, + "loss": 0.3833, + "step": 8194 + }, + { + "epoch": 0.24, + "grad_norm": 1.3327504838067414, + "learning_rate": 8.910955745776908e-06, + "loss": 0.3359, + "step": 8195 + }, + { + "epoch": 0.24, + "grad_norm": 1.5883935200874404, + "learning_rate": 8.910663082557143e-06, + "loss": 0.4098, + "step": 8196 + }, + { + "epoch": 0.24, + "grad_norm": 1.438089373222481, + "learning_rate": 8.910370384825735e-06, + "loss": 0.3928, + "step": 8197 + }, + { + "epoch": 0.24, + "grad_norm": 1.3624588443782508, + "learning_rate": 8.910077652585264e-06, + "loss": 0.3657, + "step": 8198 + }, + { + "epoch": 0.24, + "grad_norm": 1.5375687349876235, + "learning_rate": 8.909784885838314e-06, + "loss": 0.3632, + "step": 8199 + }, + { + "epoch": 0.24, + "grad_norm": 2.4978721072064527, + "learning_rate": 8.909492084587468e-06, + "loss": 0.3816, + "step": 8200 + }, + { + "epoch": 0.24, + "grad_norm": 1.5383158042577807, + "learning_rate": 8.909199248835312e-06, + "loss": 0.3436, + "step": 8201 + }, + { + "epoch": 0.24, + "grad_norm": 1.2317653509879196, + "learning_rate": 8.90890637858443e-06, + "loss": 0.3466, + "step": 8202 + }, + { + "epoch": 0.24, + "grad_norm": 1.4418084064440941, + "learning_rate": 8.908613473837404e-06, + "loss": 0.3474, + "step": 8203 + }, + { + "epoch": 0.24, + "grad_norm": 1.5040900219565392, + "learning_rate": 8.908320534596822e-06, + "loss": 0.3776, + "step": 8204 + }, + { + "epoch": 0.24, + "grad_norm": 1.407634674827319, + "learning_rate": 8.908027560865269e-06, + "loss": 0.3705, + "step": 8205 + }, + { + "epoch": 0.24, + "grad_norm": 0.9854778022318121, + "learning_rate": 8.907734552645328e-06, + "loss": 0.5617, + "step": 8206 + }, + { + "epoch": 0.24, + "grad_norm": 1.423222831829299, + "learning_rate": 8.907441509939585e-06, + "loss": 0.3599, + "step": 8207 + }, + { + "epoch": 0.24, + "grad_norm": 1.606966073902039, + "learning_rate": 8.90714843275063e-06, + "loss": 0.4018, + "step": 8208 + }, + { + "epoch": 0.24, + "grad_norm": 1.8094231752989893, + "learning_rate": 8.906855321081045e-06, + "loss": 0.3483, + "step": 8209 + }, + { + "epoch": 0.24, + "grad_norm": 1.5682472282770192, + "learning_rate": 8.906562174933418e-06, + "loss": 0.3565, + "step": 8210 + }, + { + "epoch": 0.24, + "grad_norm": 1.365145329191799, + "learning_rate": 8.906268994310339e-06, + "loss": 0.3699, + "step": 8211 + }, + { + "epoch": 0.24, + "grad_norm": 1.4140098443159366, + "learning_rate": 8.905975779214391e-06, + "loss": 0.3765, + "step": 8212 + }, + { + "epoch": 0.24, + "grad_norm": 1.272214337621739, + "learning_rate": 8.905682529648163e-06, + "loss": 0.3641, + "step": 8213 + }, + { + "epoch": 0.24, + "grad_norm": 1.2900912342640616, + "learning_rate": 8.905389245614245e-06, + "loss": 0.3631, + "step": 8214 + }, + { + "epoch": 0.24, + "grad_norm": 1.2727746523733987, + "learning_rate": 8.905095927115222e-06, + "loss": 0.38, + "step": 8215 + }, + { + "epoch": 0.24, + "grad_norm": 1.3400922440240595, + "learning_rate": 8.904802574153684e-06, + "loss": 0.3929, + "step": 8216 + }, + { + "epoch": 0.24, + "grad_norm": 1.8578104997422071, + "learning_rate": 8.90450918673222e-06, + "loss": 0.4033, + "step": 8217 + }, + { + "epoch": 0.24, + "grad_norm": 1.409277554012361, + "learning_rate": 8.90421576485342e-06, + "loss": 0.3563, + "step": 8218 + }, + { + "epoch": 0.24, + "grad_norm": 1.485005583936081, + "learning_rate": 8.903922308519872e-06, + "loss": 0.3851, + "step": 8219 + }, + { + "epoch": 0.24, + "grad_norm": 1.428619999337725, + "learning_rate": 8.903628817734166e-06, + "loss": 0.3605, + "step": 8220 + }, + { + "epoch": 0.24, + "grad_norm": 1.4886445895476519, + "learning_rate": 8.903335292498894e-06, + "loss": 0.3725, + "step": 8221 + }, + { + "epoch": 0.24, + "grad_norm": 1.8901190627191542, + "learning_rate": 8.903041732816644e-06, + "loss": 0.3545, + "step": 8222 + }, + { + "epoch": 0.24, + "grad_norm": 1.4411066525076186, + "learning_rate": 8.902748138690007e-06, + "loss": 0.3537, + "step": 8223 + }, + { + "epoch": 0.24, + "grad_norm": 1.3461484008796696, + "learning_rate": 8.902454510121575e-06, + "loss": 0.337, + "step": 8224 + }, + { + "epoch": 0.24, + "grad_norm": 1.3697521629716283, + "learning_rate": 8.902160847113937e-06, + "loss": 0.3705, + "step": 8225 + }, + { + "epoch": 0.24, + "grad_norm": 1.3906790679060905, + "learning_rate": 8.901867149669687e-06, + "loss": 0.3417, + "step": 8226 + }, + { + "epoch": 0.24, + "grad_norm": 1.4976356622333886, + "learning_rate": 8.901573417791418e-06, + "loss": 0.3423, + "step": 8227 + }, + { + "epoch": 0.24, + "grad_norm": 1.4597526183167246, + "learning_rate": 8.901279651481718e-06, + "loss": 0.3799, + "step": 8228 + }, + { + "epoch": 0.24, + "grad_norm": 1.5668915173217826, + "learning_rate": 8.900985850743183e-06, + "loss": 0.356, + "step": 8229 + }, + { + "epoch": 0.24, + "grad_norm": 1.2758951854988176, + "learning_rate": 8.900692015578404e-06, + "loss": 0.3709, + "step": 8230 + }, + { + "epoch": 0.24, + "grad_norm": 2.1121445896817828, + "learning_rate": 8.900398145989974e-06, + "loss": 0.3688, + "step": 8231 + }, + { + "epoch": 0.24, + "grad_norm": 1.3212368722053156, + "learning_rate": 8.900104241980488e-06, + "loss": 0.3709, + "step": 8232 + }, + { + "epoch": 0.24, + "grad_norm": 2.893343257082494, + "learning_rate": 8.89981030355254e-06, + "loss": 0.3605, + "step": 8233 + }, + { + "epoch": 0.24, + "grad_norm": 1.3993332865796542, + "learning_rate": 8.89951633070872e-06, + "loss": 0.3755, + "step": 8234 + }, + { + "epoch": 0.24, + "grad_norm": 1.3601646379601582, + "learning_rate": 8.899222323451628e-06, + "loss": 0.3876, + "step": 8235 + }, + { + "epoch": 0.24, + "grad_norm": 1.736309879623173, + "learning_rate": 8.898928281783854e-06, + "loss": 0.3726, + "step": 8236 + }, + { + "epoch": 0.24, + "grad_norm": 1.565425147776738, + "learning_rate": 8.898634205707995e-06, + "loss": 0.3623, + "step": 8237 + }, + { + "epoch": 0.24, + "grad_norm": 1.514388670609146, + "learning_rate": 8.898340095226646e-06, + "loss": 0.347, + "step": 8238 + }, + { + "epoch": 0.24, + "grad_norm": 1.376370359579825, + "learning_rate": 8.898045950342401e-06, + "loss": 0.3702, + "step": 8239 + }, + { + "epoch": 0.24, + "grad_norm": 1.394508584572808, + "learning_rate": 8.89775177105786e-06, + "loss": 0.3877, + "step": 8240 + }, + { + "epoch": 0.24, + "grad_norm": 1.4012497204873609, + "learning_rate": 8.897457557375614e-06, + "loss": 0.3722, + "step": 8241 + }, + { + "epoch": 0.24, + "grad_norm": 1.4273207710681732, + "learning_rate": 8.897163309298264e-06, + "loss": 0.359, + "step": 8242 + }, + { + "epoch": 0.24, + "grad_norm": 1.7293053201124242, + "learning_rate": 8.896869026828403e-06, + "loss": 0.3836, + "step": 8243 + }, + { + "epoch": 0.24, + "grad_norm": 1.5064479130176303, + "learning_rate": 8.896574709968631e-06, + "loss": 0.3928, + "step": 8244 + }, + { + "epoch": 0.24, + "grad_norm": 1.3663491334351718, + "learning_rate": 8.896280358721543e-06, + "loss": 0.3632, + "step": 8245 + }, + { + "epoch": 0.24, + "grad_norm": 1.5246606900019148, + "learning_rate": 8.895985973089739e-06, + "loss": 0.3585, + "step": 8246 + }, + { + "epoch": 0.24, + "grad_norm": 1.4745787754189728, + "learning_rate": 8.895691553075813e-06, + "loss": 0.3834, + "step": 8247 + }, + { + "epoch": 0.24, + "grad_norm": 1.6772058689594813, + "learning_rate": 8.895397098682367e-06, + "loss": 0.3622, + "step": 8248 + }, + { + "epoch": 0.24, + "grad_norm": 1.5983837504303076, + "learning_rate": 8.895102609911999e-06, + "loss": 0.3701, + "step": 8249 + }, + { + "epoch": 0.24, + "grad_norm": 0.995241664942383, + "learning_rate": 8.894808086767306e-06, + "loss": 0.6226, + "step": 8250 + }, + { + "epoch": 0.24, + "grad_norm": 1.4330967942716644, + "learning_rate": 8.894513529250888e-06, + "loss": 0.3724, + "step": 8251 + }, + { + "epoch": 0.24, + "grad_norm": 2.003328705684465, + "learning_rate": 8.894218937365346e-06, + "loss": 0.3717, + "step": 8252 + }, + { + "epoch": 0.24, + "grad_norm": 0.9690132585312163, + "learning_rate": 8.893924311113279e-06, + "loss": 0.5981, + "step": 8253 + }, + { + "epoch": 0.24, + "grad_norm": 1.471647429970546, + "learning_rate": 8.893629650497287e-06, + "loss": 0.3676, + "step": 8254 + }, + { + "epoch": 0.24, + "grad_norm": 1.3431897013001692, + "learning_rate": 8.89333495551997e-06, + "loss": 0.3749, + "step": 8255 + }, + { + "epoch": 0.24, + "grad_norm": 1.3603021498547525, + "learning_rate": 8.893040226183928e-06, + "loss": 0.3532, + "step": 8256 + }, + { + "epoch": 0.24, + "grad_norm": 1.489979056732371, + "learning_rate": 8.892745462491763e-06, + "loss": 0.349, + "step": 8257 + }, + { + "epoch": 0.24, + "grad_norm": 1.8002582347327358, + "learning_rate": 8.892450664446076e-06, + "loss": 0.3616, + "step": 8258 + }, + { + "epoch": 0.24, + "grad_norm": 1.4204679053407714, + "learning_rate": 8.89215583204947e-06, + "loss": 0.392, + "step": 8259 + }, + { + "epoch": 0.24, + "grad_norm": 1.4965367103582567, + "learning_rate": 8.891860965304543e-06, + "loss": 0.3613, + "step": 8260 + }, + { + "epoch": 0.24, + "grad_norm": 1.379569949853908, + "learning_rate": 8.891566064213902e-06, + "loss": 0.3857, + "step": 8261 + }, + { + "epoch": 0.24, + "grad_norm": 1.4958381567011714, + "learning_rate": 8.891271128780146e-06, + "loss": 0.373, + "step": 8262 + }, + { + "epoch": 0.24, + "grad_norm": 1.5939324898638927, + "learning_rate": 8.890976159005881e-06, + "loss": 0.3831, + "step": 8263 + }, + { + "epoch": 0.24, + "grad_norm": 1.444425868048595, + "learning_rate": 8.890681154893707e-06, + "loss": 0.3821, + "step": 8264 + }, + { + "epoch": 0.24, + "grad_norm": 1.5402767718935135, + "learning_rate": 8.890386116446228e-06, + "loss": 0.3948, + "step": 8265 + }, + { + "epoch": 0.24, + "grad_norm": 1.690362543589016, + "learning_rate": 8.89009104366605e-06, + "loss": 0.377, + "step": 8266 + }, + { + "epoch": 0.24, + "grad_norm": 1.378447893911719, + "learning_rate": 8.889795936555774e-06, + "loss": 0.3563, + "step": 8267 + }, + { + "epoch": 0.24, + "grad_norm": 1.3588754416545137, + "learning_rate": 8.889500795118007e-06, + "loss": 0.3889, + "step": 8268 + }, + { + "epoch": 0.24, + "grad_norm": 1.326858176596197, + "learning_rate": 8.889205619355351e-06, + "loss": 0.3465, + "step": 8269 + }, + { + "epoch": 0.24, + "grad_norm": 1.5848858663257406, + "learning_rate": 8.888910409270412e-06, + "loss": 0.3446, + "step": 8270 + }, + { + "epoch": 0.24, + "grad_norm": 1.45873908828001, + "learning_rate": 8.888615164865798e-06, + "loss": 0.3595, + "step": 8271 + }, + { + "epoch": 0.24, + "grad_norm": 1.3973524474816414, + "learning_rate": 8.888319886144109e-06, + "loss": 0.3656, + "step": 8272 + }, + { + "epoch": 0.24, + "grad_norm": 2.6061416281807235, + "learning_rate": 8.888024573107956e-06, + "loss": 0.3821, + "step": 8273 + }, + { + "epoch": 0.24, + "grad_norm": 1.28700099874986, + "learning_rate": 8.887729225759941e-06, + "loss": 0.3487, + "step": 8274 + }, + { + "epoch": 0.24, + "grad_norm": 1.4943717613616756, + "learning_rate": 8.887433844102675e-06, + "loss": 0.3797, + "step": 8275 + }, + { + "epoch": 0.24, + "grad_norm": 1.5176931978333879, + "learning_rate": 8.88713842813876e-06, + "loss": 0.3618, + "step": 8276 + }, + { + "epoch": 0.24, + "grad_norm": 1.5951492169937633, + "learning_rate": 8.886842977870807e-06, + "loss": 0.3602, + "step": 8277 + }, + { + "epoch": 0.24, + "grad_norm": 1.349051163228469, + "learning_rate": 8.88654749330142e-06, + "loss": 0.3723, + "step": 8278 + }, + { + "epoch": 0.24, + "grad_norm": 1.4104228846567002, + "learning_rate": 8.886251974433209e-06, + "loss": 0.3663, + "step": 8279 + }, + { + "epoch": 0.24, + "grad_norm": 1.397129233685884, + "learning_rate": 8.88595642126878e-06, + "loss": 0.3675, + "step": 8280 + }, + { + "epoch": 0.24, + "grad_norm": 1.4495534679088797, + "learning_rate": 8.885660833810745e-06, + "loss": 0.3559, + "step": 8281 + }, + { + "epoch": 0.24, + "grad_norm": 1.2235538133058004, + "learning_rate": 8.885365212061709e-06, + "loss": 0.3529, + "step": 8282 + }, + { + "epoch": 0.24, + "grad_norm": 2.6852492804626533, + "learning_rate": 8.885069556024281e-06, + "loss": 0.3767, + "step": 8283 + }, + { + "epoch": 0.24, + "grad_norm": 1.5538310374635609, + "learning_rate": 8.884773865701072e-06, + "loss": 0.3475, + "step": 8284 + }, + { + "epoch": 0.24, + "grad_norm": 1.6763200816572932, + "learning_rate": 8.88447814109469e-06, + "loss": 0.3644, + "step": 8285 + }, + { + "epoch": 0.24, + "grad_norm": 1.4137612690260388, + "learning_rate": 8.884182382207745e-06, + "loss": 0.3939, + "step": 8286 + }, + { + "epoch": 0.24, + "grad_norm": 1.8240015643769911, + "learning_rate": 8.883886589042849e-06, + "loss": 0.3816, + "step": 8287 + }, + { + "epoch": 0.24, + "grad_norm": 1.3852605510350442, + "learning_rate": 8.883590761602609e-06, + "loss": 0.3674, + "step": 8288 + }, + { + "epoch": 0.24, + "grad_norm": 1.4505497100689269, + "learning_rate": 8.883294899889638e-06, + "loss": 0.3625, + "step": 8289 + }, + { + "epoch": 0.24, + "grad_norm": 1.6024400045388583, + "learning_rate": 8.882999003906547e-06, + "loss": 0.3955, + "step": 8290 + }, + { + "epoch": 0.24, + "grad_norm": 1.2849621571333516, + "learning_rate": 8.882703073655944e-06, + "loss": 0.3656, + "step": 8291 + }, + { + "epoch": 0.24, + "grad_norm": 1.3367014170192886, + "learning_rate": 8.882407109140447e-06, + "loss": 0.3733, + "step": 8292 + }, + { + "epoch": 0.24, + "grad_norm": 1.3879222614087876, + "learning_rate": 8.882111110362663e-06, + "loss": 0.3813, + "step": 8293 + }, + { + "epoch": 0.24, + "grad_norm": 1.3755993413485685, + "learning_rate": 8.881815077325205e-06, + "loss": 0.3525, + "step": 8294 + }, + { + "epoch": 0.24, + "grad_norm": 1.66639213652087, + "learning_rate": 8.881519010030686e-06, + "loss": 0.3733, + "step": 8295 + }, + { + "epoch": 0.24, + "grad_norm": 1.2912232929239857, + "learning_rate": 8.881222908481719e-06, + "loss": 0.3566, + "step": 8296 + }, + { + "epoch": 0.24, + "grad_norm": 1.4255509722785482, + "learning_rate": 8.880926772680916e-06, + "loss": 0.3756, + "step": 8297 + }, + { + "epoch": 0.24, + "grad_norm": 1.2923151644098347, + "learning_rate": 8.880630602630892e-06, + "loss": 0.3904, + "step": 8298 + }, + { + "epoch": 0.24, + "grad_norm": 1.4662263119085475, + "learning_rate": 8.88033439833426e-06, + "loss": 0.3586, + "step": 8299 + }, + { + "epoch": 0.24, + "grad_norm": 1.4204361347925825, + "learning_rate": 8.880038159793633e-06, + "loss": 0.3596, + "step": 8300 + }, + { + "epoch": 0.24, + "grad_norm": 1.4416499861013554, + "learning_rate": 8.879741887011626e-06, + "loss": 0.3478, + "step": 8301 + }, + { + "epoch": 0.24, + "grad_norm": 1.5502197820414125, + "learning_rate": 8.879445579990855e-06, + "loss": 0.3477, + "step": 8302 + }, + { + "epoch": 0.24, + "grad_norm": 1.344748510901559, + "learning_rate": 8.879149238733932e-06, + "loss": 0.3616, + "step": 8303 + }, + { + "epoch": 0.24, + "grad_norm": 1.582385960988046, + "learning_rate": 8.878852863243477e-06, + "loss": 0.3621, + "step": 8304 + }, + { + "epoch": 0.24, + "grad_norm": 1.559295896713261, + "learning_rate": 8.8785564535221e-06, + "loss": 0.3445, + "step": 8305 + }, + { + "epoch": 0.24, + "grad_norm": 1.3539890631085953, + "learning_rate": 8.87826000957242e-06, + "loss": 0.3746, + "step": 8306 + }, + { + "epoch": 0.24, + "grad_norm": 1.4206717909210704, + "learning_rate": 8.877963531397052e-06, + "loss": 0.3783, + "step": 8307 + }, + { + "epoch": 0.24, + "grad_norm": 3.135398853630182, + "learning_rate": 8.877667018998613e-06, + "loss": 0.4177, + "step": 8308 + }, + { + "epoch": 0.24, + "grad_norm": 1.62120390799164, + "learning_rate": 8.87737047237972e-06, + "loss": 0.3639, + "step": 8309 + }, + { + "epoch": 0.24, + "grad_norm": 1.5000577459343556, + "learning_rate": 8.877073891542988e-06, + "loss": 0.3532, + "step": 8310 + }, + { + "epoch": 0.24, + "grad_norm": 1.3568738640071225, + "learning_rate": 8.876777276491037e-06, + "loss": 0.3799, + "step": 8311 + }, + { + "epoch": 0.24, + "grad_norm": 1.4104243976490185, + "learning_rate": 8.876480627226485e-06, + "loss": 0.4277, + "step": 8312 + }, + { + "epoch": 0.24, + "grad_norm": 1.4150416228822038, + "learning_rate": 8.876183943751946e-06, + "loss": 0.3833, + "step": 8313 + }, + { + "epoch": 0.24, + "grad_norm": 1.2772137169018487, + "learning_rate": 8.875887226070041e-06, + "loss": 0.3723, + "step": 8314 + }, + { + "epoch": 0.24, + "grad_norm": 1.3280523668988913, + "learning_rate": 8.87559047418339e-06, + "loss": 0.3826, + "step": 8315 + }, + { + "epoch": 0.24, + "grad_norm": 1.3635788134274072, + "learning_rate": 8.875293688094608e-06, + "loss": 0.3714, + "step": 8316 + }, + { + "epoch": 0.24, + "grad_norm": 1.4631255089044402, + "learning_rate": 8.874996867806318e-06, + "loss": 0.3572, + "step": 8317 + }, + { + "epoch": 0.24, + "grad_norm": 1.5445838709307662, + "learning_rate": 8.874700013321136e-06, + "loss": 0.3846, + "step": 8318 + }, + { + "epoch": 0.24, + "grad_norm": 1.464334385871424, + "learning_rate": 8.874403124641685e-06, + "loss": 0.3781, + "step": 8319 + }, + { + "epoch": 0.24, + "grad_norm": 1.3908307888172515, + "learning_rate": 8.874106201770583e-06, + "loss": 0.4074, + "step": 8320 + }, + { + "epoch": 0.24, + "grad_norm": 1.9657902303324641, + "learning_rate": 8.87380924471045e-06, + "loss": 0.3669, + "step": 8321 + }, + { + "epoch": 0.24, + "grad_norm": 1.4186803567694388, + "learning_rate": 8.873512253463908e-06, + "loss": 0.388, + "step": 8322 + }, + { + "epoch": 0.24, + "grad_norm": 1.5280046548697963, + "learning_rate": 8.873215228033578e-06, + "loss": 0.396, + "step": 8323 + }, + { + "epoch": 0.24, + "grad_norm": 1.3472109239444656, + "learning_rate": 8.87291816842208e-06, + "loss": 0.3473, + "step": 8324 + }, + { + "epoch": 0.24, + "grad_norm": 1.267197113467158, + "learning_rate": 8.872621074632035e-06, + "loss": 0.3621, + "step": 8325 + }, + { + "epoch": 0.24, + "grad_norm": 1.47187171025412, + "learning_rate": 8.872323946666068e-06, + "loss": 0.3558, + "step": 8326 + }, + { + "epoch": 0.24, + "grad_norm": 1.6704606805917033, + "learning_rate": 8.872026784526798e-06, + "loss": 0.3659, + "step": 8327 + }, + { + "epoch": 0.24, + "grad_norm": 1.350313551765129, + "learning_rate": 8.87172958821685e-06, + "loss": 0.3618, + "step": 8328 + }, + { + "epoch": 0.24, + "grad_norm": 1.3410785703348351, + "learning_rate": 8.871432357738845e-06, + "loss": 0.3827, + "step": 8329 + }, + { + "epoch": 0.24, + "grad_norm": 1.6053301134538016, + "learning_rate": 8.871135093095405e-06, + "loss": 0.3512, + "step": 8330 + }, + { + "epoch": 0.24, + "grad_norm": 1.6900671118474675, + "learning_rate": 8.870837794289155e-06, + "loss": 0.3691, + "step": 8331 + }, + { + "epoch": 0.24, + "grad_norm": 1.5037291238196453, + "learning_rate": 8.87054046132272e-06, + "loss": 0.356, + "step": 8332 + }, + { + "epoch": 0.24, + "grad_norm": 1.5349656166302772, + "learning_rate": 8.870243094198723e-06, + "loss": 0.4111, + "step": 8333 + }, + { + "epoch": 0.24, + "grad_norm": 1.5055134310363127, + "learning_rate": 8.869945692919786e-06, + "loss": 0.3463, + "step": 8334 + }, + { + "epoch": 0.24, + "grad_norm": 1.278960261731191, + "learning_rate": 8.869648257488536e-06, + "loss": 0.381, + "step": 8335 + }, + { + "epoch": 0.24, + "grad_norm": 1.3468975588271932, + "learning_rate": 8.869350787907598e-06, + "loss": 0.3882, + "step": 8336 + }, + { + "epoch": 0.24, + "grad_norm": 1.5806731902840865, + "learning_rate": 8.869053284179595e-06, + "loss": 0.411, + "step": 8337 + }, + { + "epoch": 0.24, + "grad_norm": 1.501787374746414, + "learning_rate": 8.868755746307154e-06, + "loss": 0.4054, + "step": 8338 + }, + { + "epoch": 0.24, + "grad_norm": 1.289854222327474, + "learning_rate": 8.868458174292902e-06, + "loss": 0.3539, + "step": 8339 + }, + { + "epoch": 0.24, + "grad_norm": 1.3532037156705354, + "learning_rate": 8.868160568139462e-06, + "loss": 0.3983, + "step": 8340 + }, + { + "epoch": 0.24, + "grad_norm": 1.5155212567548584, + "learning_rate": 8.867862927849463e-06, + "loss": 0.3665, + "step": 8341 + }, + { + "epoch": 0.24, + "grad_norm": 1.369721909750606, + "learning_rate": 8.867565253425531e-06, + "loss": 0.3992, + "step": 8342 + }, + { + "epoch": 0.24, + "grad_norm": 1.5260999294894473, + "learning_rate": 8.867267544870293e-06, + "loss": 0.4087, + "step": 8343 + }, + { + "epoch": 0.24, + "grad_norm": 1.727538100665941, + "learning_rate": 8.866969802186373e-06, + "loss": 0.3843, + "step": 8344 + }, + { + "epoch": 0.24, + "grad_norm": 1.39875056152155, + "learning_rate": 8.866672025376405e-06, + "loss": 0.3827, + "step": 8345 + }, + { + "epoch": 0.24, + "grad_norm": 1.7666881448729592, + "learning_rate": 8.866374214443012e-06, + "loss": 0.3662, + "step": 8346 + }, + { + "epoch": 0.24, + "grad_norm": 1.4278232500867911, + "learning_rate": 8.866076369388824e-06, + "loss": 0.3613, + "step": 8347 + }, + { + "epoch": 0.24, + "grad_norm": 1.3852739276221007, + "learning_rate": 8.86577849021647e-06, + "loss": 0.3722, + "step": 8348 + }, + { + "epoch": 0.24, + "grad_norm": 1.61021244294432, + "learning_rate": 8.865480576928578e-06, + "loss": 0.3656, + "step": 8349 + }, + { + "epoch": 0.24, + "grad_norm": 1.5278069734300281, + "learning_rate": 8.865182629527775e-06, + "loss": 0.3379, + "step": 8350 + }, + { + "epoch": 0.24, + "grad_norm": 1.33748859298474, + "learning_rate": 8.864884648016693e-06, + "loss": 0.3527, + "step": 8351 + }, + { + "epoch": 0.24, + "grad_norm": 1.333331555513463, + "learning_rate": 8.864586632397963e-06, + "loss": 0.339, + "step": 8352 + }, + { + "epoch": 0.24, + "grad_norm": 1.332463443741237, + "learning_rate": 8.864288582674211e-06, + "loss": 0.3799, + "step": 8353 + }, + { + "epoch": 0.24, + "grad_norm": 1.5793699501647256, + "learning_rate": 8.86399049884807e-06, + "loss": 0.3983, + "step": 8354 + }, + { + "epoch": 0.24, + "grad_norm": 1.4740967373130098, + "learning_rate": 8.863692380922171e-06, + "loss": 0.3964, + "step": 8355 + }, + { + "epoch": 0.24, + "grad_norm": 0.9767941869089374, + "learning_rate": 8.863394228899142e-06, + "loss": 0.6208, + "step": 8356 + }, + { + "epoch": 0.24, + "grad_norm": 1.6792768049091267, + "learning_rate": 8.863096042781619e-06, + "loss": 0.3754, + "step": 8357 + }, + { + "epoch": 0.24, + "grad_norm": 1.6457671808194663, + "learning_rate": 8.862797822572228e-06, + "loss": 0.3848, + "step": 8358 + }, + { + "epoch": 0.24, + "grad_norm": 2.062567891755242, + "learning_rate": 8.862499568273604e-06, + "loss": 0.3548, + "step": 8359 + }, + { + "epoch": 0.24, + "grad_norm": 1.6279784735262022, + "learning_rate": 8.862201279888379e-06, + "loss": 0.4018, + "step": 8360 + }, + { + "epoch": 0.24, + "grad_norm": 1.8023561588725543, + "learning_rate": 8.861902957419183e-06, + "loss": 0.407, + "step": 8361 + }, + { + "epoch": 0.24, + "grad_norm": 1.3006768616396407, + "learning_rate": 8.861604600868653e-06, + "loss": 0.3683, + "step": 8362 + }, + { + "epoch": 0.24, + "grad_norm": 1.427476811920562, + "learning_rate": 8.861306210239418e-06, + "loss": 0.3318, + "step": 8363 + }, + { + "epoch": 0.24, + "grad_norm": 1.9692113138333618, + "learning_rate": 8.861007785534113e-06, + "loss": 0.3798, + "step": 8364 + }, + { + "epoch": 0.24, + "grad_norm": 1.4547239639090144, + "learning_rate": 8.860709326755373e-06, + "loss": 0.367, + "step": 8365 + }, + { + "epoch": 0.24, + "grad_norm": 1.410500692541919, + "learning_rate": 8.860410833905829e-06, + "loss": 0.3477, + "step": 8366 + }, + { + "epoch": 0.24, + "grad_norm": 1.419915196123675, + "learning_rate": 8.860112306988116e-06, + "loss": 0.3539, + "step": 8367 + }, + { + "epoch": 0.24, + "grad_norm": 1.4693422999531423, + "learning_rate": 8.859813746004871e-06, + "loss": 0.3494, + "step": 8368 + }, + { + "epoch": 0.24, + "grad_norm": 1.501031035445198, + "learning_rate": 8.859515150958725e-06, + "loss": 0.3903, + "step": 8369 + }, + { + "epoch": 0.24, + "grad_norm": 1.335176202886131, + "learning_rate": 8.859216521852317e-06, + "loss": 0.3431, + "step": 8370 + }, + { + "epoch": 0.24, + "grad_norm": 0.9644527022124318, + "learning_rate": 8.858917858688279e-06, + "loss": 0.6342, + "step": 8371 + }, + { + "epoch": 0.24, + "grad_norm": 1.523645046807766, + "learning_rate": 8.858619161469246e-06, + "loss": 0.3656, + "step": 8372 + }, + { + "epoch": 0.24, + "grad_norm": 1.4900875970314096, + "learning_rate": 8.858320430197859e-06, + "loss": 0.36, + "step": 8373 + }, + { + "epoch": 0.24, + "grad_norm": 1.4710770814673997, + "learning_rate": 8.85802166487675e-06, + "loss": 0.3595, + "step": 8374 + }, + { + "epoch": 0.24, + "grad_norm": 1.886340378726819, + "learning_rate": 8.857722865508557e-06, + "loss": 0.3419, + "step": 8375 + }, + { + "epoch": 0.24, + "grad_norm": 1.4493629233044338, + "learning_rate": 8.85742403209592e-06, + "loss": 0.3504, + "step": 8376 + }, + { + "epoch": 0.24, + "grad_norm": 1.5710914837560772, + "learning_rate": 8.857125164641469e-06, + "loss": 0.394, + "step": 8377 + }, + { + "epoch": 0.24, + "grad_norm": 1.5463672523103293, + "learning_rate": 8.856826263147848e-06, + "loss": 0.3842, + "step": 8378 + }, + { + "epoch": 0.24, + "grad_norm": 1.4377066349462762, + "learning_rate": 8.856527327617692e-06, + "loss": 0.3655, + "step": 8379 + }, + { + "epoch": 0.24, + "grad_norm": 1.5067674814753103, + "learning_rate": 8.85622835805364e-06, + "loss": 0.357, + "step": 8380 + }, + { + "epoch": 0.24, + "grad_norm": 2.0514371782357252, + "learning_rate": 8.855929354458328e-06, + "loss": 0.3698, + "step": 8381 + }, + { + "epoch": 0.24, + "grad_norm": 1.3157778252675736, + "learning_rate": 8.855630316834398e-06, + "loss": 0.357, + "step": 8382 + }, + { + "epoch": 0.24, + "grad_norm": 1.5215333748376512, + "learning_rate": 8.85533124518449e-06, + "loss": 0.3494, + "step": 8383 + }, + { + "epoch": 0.24, + "grad_norm": 1.4910360383597914, + "learning_rate": 8.855032139511237e-06, + "loss": 0.3521, + "step": 8384 + }, + { + "epoch": 0.24, + "grad_norm": 3.3053065013796323, + "learning_rate": 8.854732999817284e-06, + "loss": 0.3592, + "step": 8385 + }, + { + "epoch": 0.24, + "grad_norm": 1.3443940796057956, + "learning_rate": 8.85443382610527e-06, + "loss": 0.3652, + "step": 8386 + }, + { + "epoch": 0.24, + "grad_norm": 1.409075475578228, + "learning_rate": 8.854134618377835e-06, + "loss": 0.401, + "step": 8387 + }, + { + "epoch": 0.24, + "grad_norm": 1.7177401448135496, + "learning_rate": 8.853835376637619e-06, + "loss": 0.3589, + "step": 8388 + }, + { + "epoch": 0.24, + "grad_norm": 1.4895160460039474, + "learning_rate": 8.853536100887261e-06, + "loss": 0.3726, + "step": 8389 + }, + { + "epoch": 0.24, + "grad_norm": 1.4938168233966251, + "learning_rate": 8.853236791129407e-06, + "loss": 0.3491, + "step": 8390 + }, + { + "epoch": 0.24, + "grad_norm": 1.8354091487916824, + "learning_rate": 8.852937447366695e-06, + "loss": 0.3505, + "step": 8391 + }, + { + "epoch": 0.24, + "grad_norm": 1.3014468920220468, + "learning_rate": 8.852638069601767e-06, + "loss": 0.3729, + "step": 8392 + }, + { + "epoch": 0.24, + "grad_norm": 2.208237722604561, + "learning_rate": 8.852338657837263e-06, + "loss": 0.3365, + "step": 8393 + }, + { + "epoch": 0.24, + "grad_norm": 1.5089412088253347, + "learning_rate": 8.852039212075831e-06, + "loss": 0.3578, + "step": 8394 + }, + { + "epoch": 0.24, + "grad_norm": 1.4947850781266976, + "learning_rate": 8.851739732320109e-06, + "loss": 0.3715, + "step": 8395 + }, + { + "epoch": 0.24, + "grad_norm": 1.7785452310124563, + "learning_rate": 8.85144021857274e-06, + "loss": 0.3539, + "step": 8396 + }, + { + "epoch": 0.24, + "grad_norm": 1.4880953876676153, + "learning_rate": 8.85114067083637e-06, + "loss": 0.361, + "step": 8397 + }, + { + "epoch": 0.24, + "grad_norm": 1.3613182611551204, + "learning_rate": 8.85084108911364e-06, + "loss": 0.3402, + "step": 8398 + }, + { + "epoch": 0.24, + "grad_norm": 1.5169428428928085, + "learning_rate": 8.850541473407195e-06, + "loss": 0.3715, + "step": 8399 + }, + { + "epoch": 0.24, + "grad_norm": 2.195229107612802, + "learning_rate": 8.85024182371968e-06, + "loss": 0.3604, + "step": 8400 + }, + { + "epoch": 0.24, + "grad_norm": 1.9144421944736414, + "learning_rate": 8.849942140053736e-06, + "loss": 0.3697, + "step": 8401 + }, + { + "epoch": 0.24, + "grad_norm": 1.8914602461300545, + "learning_rate": 8.849642422412011e-06, + "loss": 0.3531, + "step": 8402 + }, + { + "epoch": 0.24, + "grad_norm": 1.6005115581179503, + "learning_rate": 8.849342670797148e-06, + "loss": 0.3557, + "step": 8403 + }, + { + "epoch": 0.24, + "grad_norm": 2.088617961315231, + "learning_rate": 8.849042885211795e-06, + "loss": 0.3645, + "step": 8404 + }, + { + "epoch": 0.24, + "grad_norm": 2.0577717932200765, + "learning_rate": 8.848743065658594e-06, + "loss": 0.4049, + "step": 8405 + }, + { + "epoch": 0.24, + "grad_norm": 1.470268613328547, + "learning_rate": 8.848443212140194e-06, + "loss": 0.363, + "step": 8406 + }, + { + "epoch": 0.24, + "grad_norm": 1.3516421603849582, + "learning_rate": 8.848143324659238e-06, + "loss": 0.3665, + "step": 8407 + }, + { + "epoch": 0.24, + "grad_norm": 1.498466092791609, + "learning_rate": 8.847843403218377e-06, + "loss": 0.3446, + "step": 8408 + }, + { + "epoch": 0.24, + "grad_norm": 1.4149293060842134, + "learning_rate": 8.847543447820255e-06, + "loss": 0.3868, + "step": 8409 + }, + { + "epoch": 0.24, + "grad_norm": 1.455587156324831, + "learning_rate": 8.847243458467518e-06, + "loss": 0.3669, + "step": 8410 + }, + { + "epoch": 0.24, + "grad_norm": 1.3499549532003543, + "learning_rate": 8.846943435162815e-06, + "loss": 0.3621, + "step": 8411 + }, + { + "epoch": 0.24, + "grad_norm": 1.5034794429241771, + "learning_rate": 8.846643377908795e-06, + "loss": 0.351, + "step": 8412 + }, + { + "epoch": 0.24, + "grad_norm": 1.63511478287418, + "learning_rate": 8.846343286708103e-06, + "loss": 0.3527, + "step": 8413 + }, + { + "epoch": 0.24, + "grad_norm": 2.461124597918882, + "learning_rate": 8.846043161563389e-06, + "loss": 0.3617, + "step": 8414 + }, + { + "epoch": 0.24, + "grad_norm": 2.315838650867871, + "learning_rate": 8.845743002477302e-06, + "loss": 0.3596, + "step": 8415 + }, + { + "epoch": 0.24, + "grad_norm": 1.6791491499469189, + "learning_rate": 8.845442809452489e-06, + "loss": 0.3496, + "step": 8416 + }, + { + "epoch": 0.24, + "grad_norm": 1.3888085950430686, + "learning_rate": 8.845142582491603e-06, + "loss": 0.3571, + "step": 8417 + }, + { + "epoch": 0.24, + "grad_norm": 1.353998740823964, + "learning_rate": 8.844842321597289e-06, + "loss": 0.3647, + "step": 8418 + }, + { + "epoch": 0.24, + "grad_norm": 1.465010005357479, + "learning_rate": 8.844542026772199e-06, + "loss": 0.3611, + "step": 8419 + }, + { + "epoch": 0.24, + "grad_norm": 1.3725298774028605, + "learning_rate": 8.844241698018984e-06, + "loss": 0.3703, + "step": 8420 + }, + { + "epoch": 0.24, + "grad_norm": 1.7132242792588601, + "learning_rate": 8.843941335340292e-06, + "loss": 0.3472, + "step": 8421 + }, + { + "epoch": 0.24, + "grad_norm": 1.0597027524415203, + "learning_rate": 8.843640938738777e-06, + "loss": 0.6625, + "step": 8422 + }, + { + "epoch": 0.24, + "grad_norm": 1.6152288994056365, + "learning_rate": 8.843340508217087e-06, + "loss": 0.3693, + "step": 8423 + }, + { + "epoch": 0.24, + "grad_norm": 1.7353280383990775, + "learning_rate": 8.843040043777874e-06, + "loss": 0.4013, + "step": 8424 + }, + { + "epoch": 0.24, + "grad_norm": 1.3437156856836168, + "learning_rate": 8.84273954542379e-06, + "loss": 0.3636, + "step": 8425 + }, + { + "epoch": 0.24, + "grad_norm": 1.4127264713890728, + "learning_rate": 8.842439013157488e-06, + "loss": 0.364, + "step": 8426 + }, + { + "epoch": 0.24, + "grad_norm": 1.386845726490401, + "learning_rate": 8.84213844698162e-06, + "loss": 0.3431, + "step": 8427 + }, + { + "epoch": 0.24, + "grad_norm": 1.9848529989094208, + "learning_rate": 8.841837846898835e-06, + "loss": 0.4009, + "step": 8428 + }, + { + "epoch": 0.24, + "grad_norm": 1.3690041846934746, + "learning_rate": 8.841537212911788e-06, + "loss": 0.408, + "step": 8429 + }, + { + "epoch": 0.24, + "grad_norm": 1.6025734253097659, + "learning_rate": 8.841236545023134e-06, + "loss": 0.3411, + "step": 8430 + }, + { + "epoch": 0.24, + "grad_norm": 1.359096618585377, + "learning_rate": 8.840935843235525e-06, + "loss": 0.3538, + "step": 8431 + }, + { + "epoch": 0.24, + "grad_norm": 1.330702840980025, + "learning_rate": 8.840635107551614e-06, + "loss": 0.3427, + "step": 8432 + }, + { + "epoch": 0.24, + "grad_norm": 1.4462857759414103, + "learning_rate": 8.840334337974056e-06, + "loss": 0.3496, + "step": 8433 + }, + { + "epoch": 0.24, + "grad_norm": 1.2519129117469696, + "learning_rate": 8.840033534505505e-06, + "loss": 0.3623, + "step": 8434 + }, + { + "epoch": 0.24, + "grad_norm": 1.4595544332971362, + "learning_rate": 8.839732697148614e-06, + "loss": 0.3595, + "step": 8435 + }, + { + "epoch": 0.24, + "grad_norm": 1.3163504533576735, + "learning_rate": 8.83943182590604e-06, + "loss": 0.3434, + "step": 8436 + }, + { + "epoch": 0.24, + "grad_norm": 1.6489484620221324, + "learning_rate": 8.839130920780439e-06, + "loss": 0.338, + "step": 8437 + }, + { + "epoch": 0.24, + "grad_norm": 1.3729295648734003, + "learning_rate": 8.838829981774464e-06, + "loss": 0.3566, + "step": 8438 + }, + { + "epoch": 0.24, + "grad_norm": 2.9323349655471582, + "learning_rate": 8.838529008890772e-06, + "loss": 0.3535, + "step": 8439 + }, + { + "epoch": 0.24, + "grad_norm": 1.8885704020956222, + "learning_rate": 8.838228002132019e-06, + "loss": 0.3658, + "step": 8440 + }, + { + "epoch": 0.24, + "grad_norm": 1.6144418476037945, + "learning_rate": 8.83792696150086e-06, + "loss": 0.3565, + "step": 8441 + }, + { + "epoch": 0.24, + "grad_norm": 1.2974661717310008, + "learning_rate": 8.837625886999955e-06, + "loss": 0.3501, + "step": 8442 + }, + { + "epoch": 0.24, + "grad_norm": 1.4948710733203927, + "learning_rate": 8.837324778631957e-06, + "loss": 0.3613, + "step": 8443 + }, + { + "epoch": 0.24, + "grad_norm": 1.4160859195842033, + "learning_rate": 8.837023636399526e-06, + "loss": 0.4014, + "step": 8444 + }, + { + "epoch": 0.24, + "grad_norm": 1.457652592438578, + "learning_rate": 8.83672246030532e-06, + "loss": 0.3559, + "step": 8445 + }, + { + "epoch": 0.24, + "grad_norm": 1.6534618242866366, + "learning_rate": 8.836421250351995e-06, + "loss": 0.3453, + "step": 8446 + }, + { + "epoch": 0.25, + "grad_norm": 1.4359349089856182, + "learning_rate": 8.836120006542211e-06, + "loss": 0.3547, + "step": 8447 + }, + { + "epoch": 0.25, + "grad_norm": 1.8707885239292978, + "learning_rate": 8.835818728878623e-06, + "loss": 0.3656, + "step": 8448 + }, + { + "epoch": 0.25, + "grad_norm": 1.5105040008210746, + "learning_rate": 8.835517417363894e-06, + "loss": 0.3455, + "step": 8449 + }, + { + "epoch": 0.25, + "grad_norm": 1.6908306975300265, + "learning_rate": 8.835216072000679e-06, + "loss": 0.3577, + "step": 8450 + }, + { + "epoch": 0.25, + "grad_norm": 1.2853219599019508, + "learning_rate": 8.834914692791643e-06, + "loss": 0.3715, + "step": 8451 + }, + { + "epoch": 0.25, + "grad_norm": 1.3267942799260652, + "learning_rate": 8.834613279739438e-06, + "loss": 0.3388, + "step": 8452 + }, + { + "epoch": 0.25, + "grad_norm": 1.2771377011536011, + "learning_rate": 8.834311832846732e-06, + "loss": 0.3417, + "step": 8453 + }, + { + "epoch": 0.25, + "grad_norm": 1.3690796115330102, + "learning_rate": 8.83401035211618e-06, + "loss": 0.3651, + "step": 8454 + }, + { + "epoch": 0.25, + "grad_norm": 1.5540178832838403, + "learning_rate": 8.833708837550446e-06, + "loss": 0.3618, + "step": 8455 + }, + { + "epoch": 0.25, + "grad_norm": 1.2995182478873402, + "learning_rate": 8.833407289152188e-06, + "loss": 0.3589, + "step": 8456 + }, + { + "epoch": 0.25, + "grad_norm": 1.6126963842974782, + "learning_rate": 8.833105706924066e-06, + "loss": 0.3886, + "step": 8457 + }, + { + "epoch": 0.25, + "grad_norm": 1.3988333606304462, + "learning_rate": 8.832804090868745e-06, + "loss": 0.4018, + "step": 8458 + }, + { + "epoch": 0.25, + "grad_norm": 1.3475060035135729, + "learning_rate": 8.832502440988886e-06, + "loss": 0.4075, + "step": 8459 + }, + { + "epoch": 0.25, + "grad_norm": 1.5287789711265736, + "learning_rate": 8.83220075728715e-06, + "loss": 0.3982, + "step": 8460 + }, + { + "epoch": 0.25, + "grad_norm": 1.0158957571080622, + "learning_rate": 8.8318990397662e-06, + "loss": 0.5679, + "step": 8461 + }, + { + "epoch": 0.25, + "grad_norm": 1.6766199511004483, + "learning_rate": 8.831597288428697e-06, + "loss": 0.3717, + "step": 8462 + }, + { + "epoch": 0.25, + "grad_norm": 2.001812395994784, + "learning_rate": 8.831295503277307e-06, + "loss": 0.4013, + "step": 8463 + }, + { + "epoch": 0.25, + "grad_norm": 1.5414068601412303, + "learning_rate": 8.83099368431469e-06, + "loss": 0.3498, + "step": 8464 + }, + { + "epoch": 0.25, + "grad_norm": 1.4199182218207307, + "learning_rate": 8.830691831543514e-06, + "loss": 0.371, + "step": 8465 + }, + { + "epoch": 0.25, + "grad_norm": 1.7239164881059932, + "learning_rate": 8.83038994496644e-06, + "loss": 0.3697, + "step": 8466 + }, + { + "epoch": 0.25, + "grad_norm": 1.297524062471762, + "learning_rate": 8.83008802458613e-06, + "loss": 0.3737, + "step": 8467 + }, + { + "epoch": 0.25, + "grad_norm": 1.8449532003083482, + "learning_rate": 8.829786070405251e-06, + "loss": 0.3864, + "step": 8468 + }, + { + "epoch": 0.25, + "grad_norm": 1.8033975437511223, + "learning_rate": 8.829484082426469e-06, + "loss": 0.3712, + "step": 8469 + }, + { + "epoch": 0.25, + "grad_norm": 1.2701628066150215, + "learning_rate": 8.829182060652446e-06, + "loss": 0.3327, + "step": 8470 + }, + { + "epoch": 0.25, + "grad_norm": 1.6215896395419314, + "learning_rate": 8.82888000508585e-06, + "loss": 0.3685, + "step": 8471 + }, + { + "epoch": 0.25, + "grad_norm": 1.599097987905945, + "learning_rate": 8.828577915729345e-06, + "loss": 0.3441, + "step": 8472 + }, + { + "epoch": 0.25, + "grad_norm": 1.6744263848084275, + "learning_rate": 8.828275792585596e-06, + "loss": 0.3597, + "step": 8473 + }, + { + "epoch": 0.25, + "grad_norm": 2.4896720878743883, + "learning_rate": 8.827973635657272e-06, + "loss": 0.3834, + "step": 8474 + }, + { + "epoch": 0.25, + "grad_norm": 1.4862890273042164, + "learning_rate": 8.827671444947038e-06, + "loss": 0.3847, + "step": 8475 + }, + { + "epoch": 0.25, + "grad_norm": 1.7352122149453901, + "learning_rate": 8.827369220457563e-06, + "loss": 0.3756, + "step": 8476 + }, + { + "epoch": 0.25, + "grad_norm": 2.136146327363347, + "learning_rate": 8.827066962191509e-06, + "loss": 0.3737, + "step": 8477 + }, + { + "epoch": 0.25, + "grad_norm": 1.5692588429025192, + "learning_rate": 8.82676467015155e-06, + "loss": 0.3699, + "step": 8478 + }, + { + "epoch": 0.25, + "grad_norm": 1.6713387547274343, + "learning_rate": 8.826462344340347e-06, + "loss": 0.3441, + "step": 8479 + }, + { + "epoch": 0.25, + "grad_norm": 1.3943215384432721, + "learning_rate": 8.826159984760572e-06, + "loss": 0.3386, + "step": 8480 + }, + { + "epoch": 0.25, + "grad_norm": 1.4291741188286566, + "learning_rate": 8.825857591414894e-06, + "loss": 0.354, + "step": 8481 + }, + { + "epoch": 0.25, + "grad_norm": 1.766448500979484, + "learning_rate": 8.825555164305978e-06, + "loss": 0.3704, + "step": 8482 + }, + { + "epoch": 0.25, + "grad_norm": 0.9672531047732379, + "learning_rate": 8.825252703436497e-06, + "loss": 0.5806, + "step": 8483 + }, + { + "epoch": 0.25, + "grad_norm": 1.4142160993836432, + "learning_rate": 8.82495020880912e-06, + "loss": 0.3668, + "step": 8484 + }, + { + "epoch": 0.25, + "grad_norm": 1.8271423358289425, + "learning_rate": 8.824647680426512e-06, + "loss": 0.3705, + "step": 8485 + }, + { + "epoch": 0.25, + "grad_norm": 1.5195439004190536, + "learning_rate": 8.824345118291349e-06, + "loss": 0.3769, + "step": 8486 + }, + { + "epoch": 0.25, + "grad_norm": 1.3612271669378542, + "learning_rate": 8.824042522406295e-06, + "loss": 0.3619, + "step": 8487 + }, + { + "epoch": 0.25, + "grad_norm": 2.204257271921491, + "learning_rate": 8.823739892774024e-06, + "loss": 0.3527, + "step": 8488 + }, + { + "epoch": 0.25, + "grad_norm": 1.2894296426373753, + "learning_rate": 8.823437229397207e-06, + "loss": 0.3424, + "step": 8489 + }, + { + "epoch": 0.25, + "grad_norm": 1.5258122761085153, + "learning_rate": 8.823134532278513e-06, + "loss": 0.3468, + "step": 8490 + }, + { + "epoch": 0.25, + "grad_norm": 1.4166829152417515, + "learning_rate": 8.822831801420615e-06, + "loss": 0.365, + "step": 8491 + }, + { + "epoch": 0.25, + "grad_norm": 1.8247321390653455, + "learning_rate": 8.822529036826184e-06, + "loss": 0.3669, + "step": 8492 + }, + { + "epoch": 0.25, + "grad_norm": 1.3048338056478292, + "learning_rate": 8.82222623849789e-06, + "loss": 0.3633, + "step": 8493 + }, + { + "epoch": 0.25, + "grad_norm": 1.4878052772940136, + "learning_rate": 8.82192340643841e-06, + "loss": 0.3422, + "step": 8494 + }, + { + "epoch": 0.25, + "grad_norm": 1.4008097491694556, + "learning_rate": 8.821620540650412e-06, + "loss": 0.3692, + "step": 8495 + }, + { + "epoch": 0.25, + "grad_norm": 1.4269556670561725, + "learning_rate": 8.82131764113657e-06, + "loss": 0.3751, + "step": 8496 + }, + { + "epoch": 0.25, + "grad_norm": 1.3567978815741737, + "learning_rate": 8.821014707899558e-06, + "loss": 0.3585, + "step": 8497 + }, + { + "epoch": 0.25, + "grad_norm": 1.6629761481307301, + "learning_rate": 8.820711740942049e-06, + "loss": 0.3544, + "step": 8498 + }, + { + "epoch": 0.25, + "grad_norm": 1.3617019347795747, + "learning_rate": 8.820408740266716e-06, + "loss": 0.3656, + "step": 8499 + }, + { + "epoch": 0.25, + "grad_norm": 2.2229933507837814, + "learning_rate": 8.820105705876234e-06, + "loss": 0.3292, + "step": 8500 + }, + { + "epoch": 0.25, + "grad_norm": 1.506038355461444, + "learning_rate": 8.819802637773277e-06, + "loss": 0.375, + "step": 8501 + }, + { + "epoch": 0.25, + "grad_norm": 1.4027899606369294, + "learning_rate": 8.819499535960519e-06, + "loss": 0.3571, + "step": 8502 + }, + { + "epoch": 0.25, + "grad_norm": 1.456365483903117, + "learning_rate": 8.819196400440634e-06, + "loss": 0.3529, + "step": 8503 + }, + { + "epoch": 0.25, + "grad_norm": 1.3796272692553813, + "learning_rate": 8.818893231216298e-06, + "loss": 0.3674, + "step": 8504 + }, + { + "epoch": 0.25, + "grad_norm": 1.4720427904190907, + "learning_rate": 8.818590028290189e-06, + "loss": 0.3752, + "step": 8505 + }, + { + "epoch": 0.25, + "grad_norm": 2.361651628364559, + "learning_rate": 8.81828679166498e-06, + "loss": 0.3492, + "step": 8506 + }, + { + "epoch": 0.25, + "grad_norm": 1.4988284926738267, + "learning_rate": 8.817983521343349e-06, + "loss": 0.4048, + "step": 8507 + }, + { + "epoch": 0.25, + "grad_norm": 1.3157268166055474, + "learning_rate": 8.817680217327969e-06, + "loss": 0.3527, + "step": 8508 + }, + { + "epoch": 0.25, + "grad_norm": 1.5676748865702426, + "learning_rate": 8.817376879621519e-06, + "loss": 0.3442, + "step": 8509 + }, + { + "epoch": 0.25, + "grad_norm": 1.3311530368143505, + "learning_rate": 8.817073508226677e-06, + "loss": 0.3736, + "step": 8510 + }, + { + "epoch": 0.25, + "grad_norm": 1.3449479936467446, + "learning_rate": 8.816770103146117e-06, + "loss": 0.3358, + "step": 8511 + }, + { + "epoch": 0.25, + "grad_norm": 1.5161926286527652, + "learning_rate": 8.81646666438252e-06, + "loss": 0.3884, + "step": 8512 + }, + { + "epoch": 0.25, + "grad_norm": 1.4706208166439438, + "learning_rate": 8.816163191938563e-06, + "loss": 0.3809, + "step": 8513 + }, + { + "epoch": 0.25, + "grad_norm": 1.4327369216635946, + "learning_rate": 8.815859685816922e-06, + "loss": 0.3559, + "step": 8514 + }, + { + "epoch": 0.25, + "grad_norm": 1.3361517089679744, + "learning_rate": 8.815556146020276e-06, + "loss": 0.3559, + "step": 8515 + }, + { + "epoch": 0.25, + "grad_norm": 1.511336031285261, + "learning_rate": 8.815252572551308e-06, + "loss": 0.3471, + "step": 8516 + }, + { + "epoch": 0.25, + "grad_norm": 2.216284691135592, + "learning_rate": 8.81494896541269e-06, + "loss": 0.3397, + "step": 8517 + }, + { + "epoch": 0.25, + "grad_norm": 1.7068148973188635, + "learning_rate": 8.814645324607107e-06, + "loss": 0.3394, + "step": 8518 + }, + { + "epoch": 0.25, + "grad_norm": 1.3356901442125813, + "learning_rate": 8.814341650137237e-06, + "loss": 0.3777, + "step": 8519 + }, + { + "epoch": 0.25, + "grad_norm": 2.5384470091616147, + "learning_rate": 8.81403794200576e-06, + "loss": 0.3588, + "step": 8520 + }, + { + "epoch": 0.25, + "grad_norm": 1.5969889411663556, + "learning_rate": 8.813734200215354e-06, + "loss": 0.3579, + "step": 8521 + }, + { + "epoch": 0.25, + "grad_norm": 1.522675763507812, + "learning_rate": 8.813430424768703e-06, + "loss": 0.3706, + "step": 8522 + }, + { + "epoch": 0.25, + "grad_norm": 1.593378469485349, + "learning_rate": 8.813126615668484e-06, + "loss": 0.3631, + "step": 8523 + }, + { + "epoch": 0.25, + "grad_norm": 1.2363688001246482, + "learning_rate": 8.812822772917381e-06, + "loss": 0.3621, + "step": 8524 + }, + { + "epoch": 0.25, + "grad_norm": 1.3487813596516944, + "learning_rate": 8.812518896518077e-06, + "loss": 0.3384, + "step": 8525 + }, + { + "epoch": 0.25, + "grad_norm": 2.9828855070454376, + "learning_rate": 8.81221498647325e-06, + "loss": 0.3805, + "step": 8526 + }, + { + "epoch": 0.25, + "grad_norm": 1.3972827143343087, + "learning_rate": 8.811911042785584e-06, + "loss": 0.3588, + "step": 8527 + }, + { + "epoch": 0.25, + "grad_norm": 1.4791557574580951, + "learning_rate": 8.81160706545776e-06, + "loss": 0.3582, + "step": 8528 + }, + { + "epoch": 0.25, + "grad_norm": 1.4770273626257224, + "learning_rate": 8.811303054492462e-06, + "loss": 0.341, + "step": 8529 + }, + { + "epoch": 0.25, + "grad_norm": 1.5322623784707494, + "learning_rate": 8.810999009892372e-06, + "loss": 0.3781, + "step": 8530 + }, + { + "epoch": 0.25, + "grad_norm": 2.079992601402242, + "learning_rate": 8.810694931660174e-06, + "loss": 0.3525, + "step": 8531 + }, + { + "epoch": 0.25, + "grad_norm": 0.9736853890253758, + "learning_rate": 8.810390819798548e-06, + "loss": 0.6254, + "step": 8532 + }, + { + "epoch": 0.25, + "grad_norm": 1.9522702057734382, + "learning_rate": 8.810086674310184e-06, + "loss": 0.3687, + "step": 8533 + }, + { + "epoch": 0.25, + "grad_norm": 1.3617358511670548, + "learning_rate": 8.809782495197762e-06, + "loss": 0.3594, + "step": 8534 + }, + { + "epoch": 0.25, + "grad_norm": 1.387475580928923, + "learning_rate": 8.809478282463967e-06, + "loss": 0.3716, + "step": 8535 + }, + { + "epoch": 0.25, + "grad_norm": 1.2924600940071715, + "learning_rate": 8.809174036111485e-06, + "loss": 0.336, + "step": 8536 + }, + { + "epoch": 0.25, + "grad_norm": 2.278981651349459, + "learning_rate": 8.808869756142998e-06, + "loss": 0.3643, + "step": 8537 + }, + { + "epoch": 0.25, + "grad_norm": 1.497164317196002, + "learning_rate": 8.808565442561195e-06, + "loss": 0.3494, + "step": 8538 + }, + { + "epoch": 0.25, + "grad_norm": 1.664050165951776, + "learning_rate": 8.808261095368758e-06, + "loss": 0.3453, + "step": 8539 + }, + { + "epoch": 0.25, + "grad_norm": 1.6469813516547562, + "learning_rate": 8.807956714568376e-06, + "loss": 0.39, + "step": 8540 + }, + { + "epoch": 0.25, + "grad_norm": 1.3260908047835631, + "learning_rate": 8.807652300162733e-06, + "loss": 0.3727, + "step": 8541 + }, + { + "epoch": 0.25, + "grad_norm": 1.5794870553770963, + "learning_rate": 8.807347852154515e-06, + "loss": 0.3836, + "step": 8542 + }, + { + "epoch": 0.25, + "grad_norm": 1.3572679263871106, + "learning_rate": 8.807043370546413e-06, + "loss": 0.3449, + "step": 8543 + }, + { + "epoch": 0.25, + "grad_norm": 1.318357180734505, + "learning_rate": 8.806738855341108e-06, + "loss": 0.3631, + "step": 8544 + }, + { + "epoch": 0.25, + "grad_norm": 1.351802823397996, + "learning_rate": 8.806434306541292e-06, + "loss": 0.3766, + "step": 8545 + }, + { + "epoch": 0.25, + "grad_norm": 1.5624419430297216, + "learning_rate": 8.80612972414965e-06, + "loss": 0.3646, + "step": 8546 + }, + { + "epoch": 0.25, + "grad_norm": 1.5064875522660859, + "learning_rate": 8.805825108168873e-06, + "loss": 0.3818, + "step": 8547 + }, + { + "epoch": 0.25, + "grad_norm": 1.9237314577199232, + "learning_rate": 8.805520458601644e-06, + "loss": 0.3684, + "step": 8548 + }, + { + "epoch": 0.25, + "grad_norm": 1.4741146368900524, + "learning_rate": 8.805215775450658e-06, + "loss": 0.3511, + "step": 8549 + }, + { + "epoch": 0.25, + "grad_norm": 1.5773170888022807, + "learning_rate": 8.804911058718598e-06, + "loss": 0.3809, + "step": 8550 + }, + { + "epoch": 0.25, + "grad_norm": 1.3940464134350221, + "learning_rate": 8.804606308408157e-06, + "loss": 0.375, + "step": 8551 + }, + { + "epoch": 0.25, + "grad_norm": 1.4859854349207293, + "learning_rate": 8.804301524522023e-06, + "loss": 0.3411, + "step": 8552 + }, + { + "epoch": 0.25, + "grad_norm": 2.177397173708878, + "learning_rate": 8.803996707062887e-06, + "loss": 0.3515, + "step": 8553 + }, + { + "epoch": 0.25, + "grad_norm": 3.0592806854951524, + "learning_rate": 8.803691856033437e-06, + "loss": 0.3484, + "step": 8554 + }, + { + "epoch": 0.25, + "grad_norm": 1.904666952870549, + "learning_rate": 8.803386971436362e-06, + "loss": 0.3766, + "step": 8555 + }, + { + "epoch": 0.25, + "grad_norm": 1.4837380773833269, + "learning_rate": 8.803082053274357e-06, + "loss": 0.3715, + "step": 8556 + }, + { + "epoch": 0.25, + "grad_norm": 2.03549244832811, + "learning_rate": 8.802777101550111e-06, + "loss": 0.36, + "step": 8557 + }, + { + "epoch": 0.25, + "grad_norm": 1.6749188952761185, + "learning_rate": 8.802472116266315e-06, + "loss": 0.3651, + "step": 8558 + }, + { + "epoch": 0.25, + "grad_norm": 1.5937845551602605, + "learning_rate": 8.802167097425658e-06, + "loss": 0.3717, + "step": 8559 + }, + { + "epoch": 0.25, + "grad_norm": 1.9569376274411376, + "learning_rate": 8.801862045030835e-06, + "loss": 0.335, + "step": 8560 + }, + { + "epoch": 0.25, + "grad_norm": 1.6577073546247645, + "learning_rate": 8.801556959084539e-06, + "loss": 0.3727, + "step": 8561 + }, + { + "epoch": 0.25, + "grad_norm": 1.4152438569988837, + "learning_rate": 8.80125183958946e-06, + "loss": 0.3593, + "step": 8562 + }, + { + "epoch": 0.25, + "grad_norm": 1.4851742445337928, + "learning_rate": 8.800946686548291e-06, + "loss": 0.3904, + "step": 8563 + }, + { + "epoch": 0.25, + "grad_norm": 1.4649456378273178, + "learning_rate": 8.800641499963725e-06, + "loss": 0.3482, + "step": 8564 + }, + { + "epoch": 0.25, + "grad_norm": 2.050389474035133, + "learning_rate": 8.800336279838457e-06, + "loss": 0.3503, + "step": 8565 + }, + { + "epoch": 0.25, + "grad_norm": 0.9875530678239529, + "learning_rate": 8.800031026175178e-06, + "loss": 0.5978, + "step": 8566 + }, + { + "epoch": 0.25, + "grad_norm": 1.9006791612417955, + "learning_rate": 8.799725738976583e-06, + "loss": 0.3501, + "step": 8567 + }, + { + "epoch": 0.25, + "grad_norm": 1.8830821903327366, + "learning_rate": 8.799420418245366e-06, + "loss": 0.3789, + "step": 8568 + }, + { + "epoch": 0.25, + "grad_norm": 1.4238789970866381, + "learning_rate": 8.799115063984223e-06, + "loss": 0.3464, + "step": 8569 + }, + { + "epoch": 0.25, + "grad_norm": 1.5735818547264713, + "learning_rate": 8.798809676195845e-06, + "loss": 0.3837, + "step": 8570 + }, + { + "epoch": 0.25, + "grad_norm": 1.6718111001144234, + "learning_rate": 8.798504254882932e-06, + "loss": 0.3527, + "step": 8571 + }, + { + "epoch": 0.25, + "grad_norm": 1.8027911563314305, + "learning_rate": 8.798198800048175e-06, + "loss": 0.3746, + "step": 8572 + }, + { + "epoch": 0.25, + "grad_norm": 1.4220555250311193, + "learning_rate": 8.797893311694272e-06, + "loss": 0.3659, + "step": 8573 + }, + { + "epoch": 0.25, + "grad_norm": 1.526623964839009, + "learning_rate": 8.79758778982392e-06, + "loss": 0.3694, + "step": 8574 + }, + { + "epoch": 0.25, + "grad_norm": 1.4108732283234742, + "learning_rate": 8.797282234439811e-06, + "loss": 0.3537, + "step": 8575 + }, + { + "epoch": 0.25, + "grad_norm": 1.4917963953582005, + "learning_rate": 8.796976645544646e-06, + "loss": 0.3522, + "step": 8576 + }, + { + "epoch": 0.25, + "grad_norm": 1.395289582901949, + "learning_rate": 8.796671023141119e-06, + "loss": 0.345, + "step": 8577 + }, + { + "epoch": 0.25, + "grad_norm": 1.3596727949537537, + "learning_rate": 8.796365367231929e-06, + "loss": 0.3368, + "step": 8578 + }, + { + "epoch": 0.25, + "grad_norm": 1.3039665296169118, + "learning_rate": 8.796059677819773e-06, + "loss": 0.338, + "step": 8579 + }, + { + "epoch": 0.25, + "grad_norm": 1.7612623455054082, + "learning_rate": 8.795753954907346e-06, + "loss": 0.3814, + "step": 8580 + }, + { + "epoch": 0.25, + "grad_norm": 1.9310061673242054, + "learning_rate": 8.79544819849735e-06, + "loss": 0.382, + "step": 8581 + }, + { + "epoch": 0.25, + "grad_norm": 1.5722013011761171, + "learning_rate": 8.795142408592481e-06, + "loss": 0.3734, + "step": 8582 + }, + { + "epoch": 0.25, + "grad_norm": 1.3767093352513637, + "learning_rate": 8.794836585195437e-06, + "loss": 0.3731, + "step": 8583 + }, + { + "epoch": 0.25, + "grad_norm": 1.483233132896017, + "learning_rate": 8.79453072830892e-06, + "loss": 0.3505, + "step": 8584 + }, + { + "epoch": 0.25, + "grad_norm": 1.8241321678066549, + "learning_rate": 8.794224837935626e-06, + "loss": 0.3464, + "step": 8585 + }, + { + "epoch": 0.25, + "grad_norm": 1.6381039214508497, + "learning_rate": 8.793918914078256e-06, + "loss": 0.3597, + "step": 8586 + }, + { + "epoch": 0.25, + "grad_norm": 1.5834882908871522, + "learning_rate": 8.79361295673951e-06, + "loss": 0.3606, + "step": 8587 + }, + { + "epoch": 0.25, + "grad_norm": 2.0824480269520187, + "learning_rate": 8.793306965922088e-06, + "loss": 0.3376, + "step": 8588 + }, + { + "epoch": 0.25, + "grad_norm": 1.4712059948078469, + "learning_rate": 8.79300094162869e-06, + "loss": 0.3905, + "step": 8589 + }, + { + "epoch": 0.25, + "grad_norm": 1.6987434026659443, + "learning_rate": 8.792694883862014e-06, + "loss": 0.3816, + "step": 8590 + }, + { + "epoch": 0.25, + "grad_norm": 1.4535735876093525, + "learning_rate": 8.792388792624765e-06, + "loss": 0.3529, + "step": 8591 + }, + { + "epoch": 0.25, + "grad_norm": 1.3934612523561054, + "learning_rate": 8.792082667919644e-06, + "loss": 0.3791, + "step": 8592 + }, + { + "epoch": 0.25, + "grad_norm": 1.712630163522426, + "learning_rate": 8.79177650974935e-06, + "loss": 0.3832, + "step": 8593 + }, + { + "epoch": 0.25, + "grad_norm": 1.5277795859307377, + "learning_rate": 8.791470318116586e-06, + "loss": 0.3404, + "step": 8594 + }, + { + "epoch": 0.25, + "grad_norm": 1.4288937456735942, + "learning_rate": 8.791164093024056e-06, + "loss": 0.3931, + "step": 8595 + }, + { + "epoch": 0.25, + "grad_norm": 1.5143616517132827, + "learning_rate": 8.79085783447446e-06, + "loss": 0.3818, + "step": 8596 + }, + { + "epoch": 0.25, + "grad_norm": 1.6406936099031855, + "learning_rate": 8.790551542470502e-06, + "loss": 0.3856, + "step": 8597 + }, + { + "epoch": 0.25, + "grad_norm": 1.4673660673210838, + "learning_rate": 8.790245217014883e-06, + "loss": 0.3434, + "step": 8598 + }, + { + "epoch": 0.25, + "grad_norm": 1.547224637565903, + "learning_rate": 8.78993885811031e-06, + "loss": 0.3504, + "step": 8599 + }, + { + "epoch": 0.25, + "grad_norm": 1.4257504596804773, + "learning_rate": 8.789632465759483e-06, + "loss": 0.3448, + "step": 8600 + }, + { + "epoch": 0.25, + "grad_norm": 1.5041138426932663, + "learning_rate": 8.789326039965108e-06, + "loss": 0.3713, + "step": 8601 + }, + { + "epoch": 0.25, + "grad_norm": 1.6329962891865148, + "learning_rate": 8.789019580729889e-06, + "loss": 0.3546, + "step": 8602 + }, + { + "epoch": 0.25, + "grad_norm": 1.6066748277961354, + "learning_rate": 8.78871308805653e-06, + "loss": 0.3533, + "step": 8603 + }, + { + "epoch": 0.25, + "grad_norm": 1.3870280474044496, + "learning_rate": 8.788406561947736e-06, + "loss": 0.3405, + "step": 8604 + }, + { + "epoch": 0.25, + "grad_norm": 1.5901419738490674, + "learning_rate": 8.788100002406212e-06, + "loss": 0.3805, + "step": 8605 + }, + { + "epoch": 0.25, + "grad_norm": 1.6271873869327227, + "learning_rate": 8.787793409434663e-06, + "loss": 0.3509, + "step": 8606 + }, + { + "epoch": 0.25, + "grad_norm": 1.5761279671344746, + "learning_rate": 8.787486783035795e-06, + "loss": 0.368, + "step": 8607 + }, + { + "epoch": 0.25, + "grad_norm": 3.43937299529145, + "learning_rate": 8.787180123212316e-06, + "loss": 0.3576, + "step": 8608 + }, + { + "epoch": 0.25, + "grad_norm": 1.7768712255310635, + "learning_rate": 8.78687342996693e-06, + "loss": 0.3587, + "step": 8609 + }, + { + "epoch": 0.25, + "grad_norm": 1.3704795110591494, + "learning_rate": 8.786566703302342e-06, + "loss": 0.3549, + "step": 8610 + }, + { + "epoch": 0.25, + "grad_norm": 1.8322374003547848, + "learning_rate": 8.78625994322126e-06, + "loss": 0.3444, + "step": 8611 + }, + { + "epoch": 0.25, + "grad_norm": 1.4405229730540663, + "learning_rate": 8.785953149726394e-06, + "loss": 0.3597, + "step": 8612 + }, + { + "epoch": 0.25, + "grad_norm": 1.5447687502374992, + "learning_rate": 8.78564632282045e-06, + "loss": 0.3608, + "step": 8613 + }, + { + "epoch": 0.25, + "grad_norm": 1.59890519748591, + "learning_rate": 8.785339462506136e-06, + "loss": 0.3382, + "step": 8614 + }, + { + "epoch": 0.25, + "grad_norm": 2.4966799149690515, + "learning_rate": 8.785032568786155e-06, + "loss": 0.3616, + "step": 8615 + }, + { + "epoch": 0.25, + "grad_norm": 1.3286626894046423, + "learning_rate": 8.784725641663224e-06, + "loss": 0.3484, + "step": 8616 + }, + { + "epoch": 0.25, + "grad_norm": 1.5179395722648354, + "learning_rate": 8.784418681140046e-06, + "loss": 0.3689, + "step": 8617 + }, + { + "epoch": 0.25, + "grad_norm": 1.7546716303592929, + "learning_rate": 8.78411168721933e-06, + "loss": 0.3495, + "step": 8618 + }, + { + "epoch": 0.25, + "grad_norm": 0.9214378179653192, + "learning_rate": 8.783804659903787e-06, + "loss": 0.5892, + "step": 8619 + }, + { + "epoch": 0.25, + "grad_norm": 1.5216143142830045, + "learning_rate": 8.783497599196126e-06, + "loss": 0.3724, + "step": 8620 + }, + { + "epoch": 0.25, + "grad_norm": 1.5454136578512392, + "learning_rate": 8.783190505099057e-06, + "loss": 0.3572, + "step": 8621 + }, + { + "epoch": 0.25, + "grad_norm": 1.8159883146373836, + "learning_rate": 8.78288337761529e-06, + "loss": 0.4138, + "step": 8622 + }, + { + "epoch": 0.25, + "grad_norm": 4.312538329959897, + "learning_rate": 8.782576216747534e-06, + "loss": 0.3508, + "step": 8623 + }, + { + "epoch": 0.25, + "grad_norm": 1.5952261306648796, + "learning_rate": 8.782269022498502e-06, + "loss": 0.344, + "step": 8624 + }, + { + "epoch": 0.25, + "grad_norm": 2.1518279986631805, + "learning_rate": 8.781961794870903e-06, + "loss": 0.3696, + "step": 8625 + }, + { + "epoch": 0.25, + "grad_norm": 2.1927204516754886, + "learning_rate": 8.78165453386745e-06, + "loss": 0.365, + "step": 8626 + }, + { + "epoch": 0.25, + "grad_norm": 1.4638416854776626, + "learning_rate": 8.781347239490853e-06, + "loss": 0.3536, + "step": 8627 + }, + { + "epoch": 0.25, + "grad_norm": 1.5019915204986714, + "learning_rate": 8.781039911743826e-06, + "loss": 0.3678, + "step": 8628 + }, + { + "epoch": 0.25, + "grad_norm": 1.4211231376631583, + "learning_rate": 8.780732550629078e-06, + "loss": 0.336, + "step": 8629 + }, + { + "epoch": 0.25, + "grad_norm": 1.5378094138742433, + "learning_rate": 8.780425156149326e-06, + "loss": 0.3511, + "step": 8630 + }, + { + "epoch": 0.25, + "grad_norm": 1.7710707770695635, + "learning_rate": 8.780117728307277e-06, + "loss": 0.3781, + "step": 8631 + }, + { + "epoch": 0.25, + "grad_norm": 1.574272425890301, + "learning_rate": 8.779810267105649e-06, + "loss": 0.3757, + "step": 8632 + }, + { + "epoch": 0.25, + "grad_norm": 1.8986610468824878, + "learning_rate": 8.779502772547153e-06, + "loss": 0.3652, + "step": 8633 + }, + { + "epoch": 0.25, + "grad_norm": 1.6242957508728135, + "learning_rate": 8.779195244634505e-06, + "loss": 0.3562, + "step": 8634 + }, + { + "epoch": 0.25, + "grad_norm": 1.5530225455177848, + "learning_rate": 8.778887683370415e-06, + "loss": 0.3612, + "step": 8635 + }, + { + "epoch": 0.25, + "grad_norm": 1.380877991268582, + "learning_rate": 8.778580088757597e-06, + "loss": 0.3429, + "step": 8636 + }, + { + "epoch": 0.25, + "grad_norm": 1.457570494685505, + "learning_rate": 8.778272460798771e-06, + "loss": 0.3496, + "step": 8637 + }, + { + "epoch": 0.25, + "grad_norm": 1.4327859269878307, + "learning_rate": 8.777964799496648e-06, + "loss": 0.38, + "step": 8638 + }, + { + "epoch": 0.25, + "grad_norm": 1.5247612683482543, + "learning_rate": 8.777657104853944e-06, + "loss": 0.3368, + "step": 8639 + }, + { + "epoch": 0.25, + "grad_norm": 1.4250304107829102, + "learning_rate": 8.777349376873375e-06, + "loss": 0.3846, + "step": 8640 + }, + { + "epoch": 0.25, + "grad_norm": 1.8336898046069094, + "learning_rate": 8.777041615557653e-06, + "loss": 0.3496, + "step": 8641 + }, + { + "epoch": 0.25, + "grad_norm": 1.5126977676742688, + "learning_rate": 8.7767338209095e-06, + "loss": 0.3764, + "step": 8642 + }, + { + "epoch": 0.25, + "grad_norm": 1.4025131322418027, + "learning_rate": 8.776425992931626e-06, + "loss": 0.3569, + "step": 8643 + }, + { + "epoch": 0.25, + "grad_norm": 1.4228244547754556, + "learning_rate": 8.776118131626751e-06, + "loss": 0.3517, + "step": 8644 + }, + { + "epoch": 0.25, + "grad_norm": 1.4419397687756472, + "learning_rate": 8.775810236997595e-06, + "loss": 0.3616, + "step": 8645 + }, + { + "epoch": 0.25, + "grad_norm": 1.5621170848452393, + "learning_rate": 8.775502309046869e-06, + "loss": 0.3572, + "step": 8646 + }, + { + "epoch": 0.25, + "grad_norm": 1.6449585053874054, + "learning_rate": 8.775194347777294e-06, + "loss": 0.3793, + "step": 8647 + }, + { + "epoch": 0.25, + "grad_norm": 1.3618825661472376, + "learning_rate": 8.774886353191587e-06, + "loss": 0.3457, + "step": 8648 + }, + { + "epoch": 0.25, + "grad_norm": 1.6248861137295898, + "learning_rate": 8.774578325292467e-06, + "loss": 0.3461, + "step": 8649 + }, + { + "epoch": 0.25, + "grad_norm": 3.4188044512396316, + "learning_rate": 8.77427026408265e-06, + "loss": 0.4005, + "step": 8650 + }, + { + "epoch": 0.25, + "grad_norm": 1.5561209151925313, + "learning_rate": 8.773962169564856e-06, + "loss": 0.3765, + "step": 8651 + }, + { + "epoch": 0.25, + "grad_norm": 1.6488848175864543, + "learning_rate": 8.773654041741804e-06, + "loss": 0.3544, + "step": 8652 + }, + { + "epoch": 0.25, + "grad_norm": 1.5736279541174576, + "learning_rate": 8.773345880616214e-06, + "loss": 0.4187, + "step": 8653 + }, + { + "epoch": 0.25, + "grad_norm": 1.5611816957236868, + "learning_rate": 8.773037686190805e-06, + "loss": 0.3508, + "step": 8654 + }, + { + "epoch": 0.25, + "grad_norm": 1.4539549911645981, + "learning_rate": 8.772729458468295e-06, + "loss": 0.3876, + "step": 8655 + }, + { + "epoch": 0.25, + "grad_norm": 1.4266986374855648, + "learning_rate": 8.772421197451407e-06, + "loss": 0.3384, + "step": 8656 + }, + { + "epoch": 0.25, + "grad_norm": 1.928813501302163, + "learning_rate": 8.772112903142859e-06, + "loss": 0.381, + "step": 8657 + }, + { + "epoch": 0.25, + "grad_norm": 1.5852873096836952, + "learning_rate": 8.771804575545373e-06, + "loss": 0.3585, + "step": 8658 + }, + { + "epoch": 0.25, + "grad_norm": 1.747891195600556, + "learning_rate": 8.77149621466167e-06, + "loss": 0.3622, + "step": 8659 + }, + { + "epoch": 0.25, + "grad_norm": 1.5075043853648482, + "learning_rate": 8.771187820494473e-06, + "loss": 0.3556, + "step": 8660 + }, + { + "epoch": 0.25, + "grad_norm": 1.3450125453613573, + "learning_rate": 8.770879393046498e-06, + "loss": 0.38, + "step": 8661 + }, + { + "epoch": 0.25, + "grad_norm": 1.5571450923599, + "learning_rate": 8.770570932320474e-06, + "loss": 0.3486, + "step": 8662 + }, + { + "epoch": 0.25, + "grad_norm": 1.5154063225218344, + "learning_rate": 8.770262438319117e-06, + "loss": 0.3592, + "step": 8663 + }, + { + "epoch": 0.25, + "grad_norm": 1.757902586427933, + "learning_rate": 8.769953911045155e-06, + "loss": 0.3954, + "step": 8664 + }, + { + "epoch": 0.25, + "grad_norm": 2.306497336949329, + "learning_rate": 8.769645350501307e-06, + "loss": 0.3502, + "step": 8665 + }, + { + "epoch": 0.25, + "grad_norm": 1.0014544482310976, + "learning_rate": 8.769336756690295e-06, + "loss": 0.6132, + "step": 8666 + }, + { + "epoch": 0.25, + "grad_norm": 1.4141088308246708, + "learning_rate": 8.769028129614845e-06, + "loss": 0.3396, + "step": 8667 + }, + { + "epoch": 0.25, + "grad_norm": 1.5757134215765516, + "learning_rate": 8.76871946927768e-06, + "loss": 0.3382, + "step": 8668 + }, + { + "epoch": 0.25, + "grad_norm": 1.370154872860843, + "learning_rate": 8.768410775681526e-06, + "loss": 0.3533, + "step": 8669 + }, + { + "epoch": 0.25, + "grad_norm": 1.4577946443859855, + "learning_rate": 8.768102048829102e-06, + "loss": 0.3561, + "step": 8670 + }, + { + "epoch": 0.25, + "grad_norm": 1.523985844292487, + "learning_rate": 8.767793288723137e-06, + "loss": 0.347, + "step": 8671 + }, + { + "epoch": 0.25, + "grad_norm": 1.401003108995126, + "learning_rate": 8.767484495366353e-06, + "loss": 0.339, + "step": 8672 + }, + { + "epoch": 0.25, + "grad_norm": 0.849307976037672, + "learning_rate": 8.76717566876148e-06, + "loss": 0.5298, + "step": 8673 + }, + { + "epoch": 0.25, + "grad_norm": 1.4425534872365586, + "learning_rate": 8.766866808911238e-06, + "loss": 0.3745, + "step": 8674 + }, + { + "epoch": 0.25, + "grad_norm": 1.5672932672368916, + "learning_rate": 8.766557915818354e-06, + "loss": 0.3601, + "step": 8675 + }, + { + "epoch": 0.25, + "grad_norm": 2.567257981980601, + "learning_rate": 8.766248989485555e-06, + "loss": 0.3606, + "step": 8676 + }, + { + "epoch": 0.25, + "grad_norm": 1.609460386734852, + "learning_rate": 8.765940029915567e-06, + "loss": 0.3612, + "step": 8677 + }, + { + "epoch": 0.25, + "grad_norm": 1.6613099823234525, + "learning_rate": 8.765631037111118e-06, + "loss": 0.3559, + "step": 8678 + }, + { + "epoch": 0.25, + "grad_norm": 1.9533522586463332, + "learning_rate": 8.765322011074931e-06, + "loss": 0.36, + "step": 8679 + }, + { + "epoch": 0.25, + "grad_norm": 1.7318673701082254, + "learning_rate": 8.765012951809736e-06, + "loss": 0.3693, + "step": 8680 + }, + { + "epoch": 0.25, + "grad_norm": 1.4770518881048322, + "learning_rate": 8.76470385931826e-06, + "loss": 0.3647, + "step": 8681 + }, + { + "epoch": 0.25, + "grad_norm": 1.3763229272151087, + "learning_rate": 8.764394733603232e-06, + "loss": 0.3755, + "step": 8682 + }, + { + "epoch": 0.25, + "grad_norm": 1.4991714454754201, + "learning_rate": 8.764085574667377e-06, + "loss": 0.3769, + "step": 8683 + }, + { + "epoch": 0.25, + "grad_norm": 1.5610069865874285, + "learning_rate": 8.763776382513426e-06, + "loss": 0.3692, + "step": 8684 + }, + { + "epoch": 0.25, + "grad_norm": 1.4715425325124347, + "learning_rate": 8.763467157144107e-06, + "loss": 0.3668, + "step": 8685 + }, + { + "epoch": 0.25, + "grad_norm": 1.7082911987738807, + "learning_rate": 8.763157898562148e-06, + "loss": 0.3637, + "step": 8686 + }, + { + "epoch": 0.25, + "grad_norm": 1.4466082052426772, + "learning_rate": 8.762848606770278e-06, + "loss": 0.3367, + "step": 8687 + }, + { + "epoch": 0.25, + "grad_norm": 4.193930188238849, + "learning_rate": 8.762539281771227e-06, + "loss": 0.3889, + "step": 8688 + }, + { + "epoch": 0.25, + "grad_norm": 1.3201706482470246, + "learning_rate": 8.762229923567727e-06, + "loss": 0.3572, + "step": 8689 + }, + { + "epoch": 0.25, + "grad_norm": 1.877418088192612, + "learning_rate": 8.761920532162506e-06, + "loss": 0.3912, + "step": 8690 + }, + { + "epoch": 0.25, + "grad_norm": 1.39167349583751, + "learning_rate": 8.761611107558295e-06, + "loss": 0.3482, + "step": 8691 + }, + { + "epoch": 0.25, + "grad_norm": 1.4264012715431884, + "learning_rate": 8.761301649757822e-06, + "loss": 0.3963, + "step": 8692 + }, + { + "epoch": 0.25, + "grad_norm": 2.246199593603597, + "learning_rate": 8.760992158763821e-06, + "loss": 0.3591, + "step": 8693 + }, + { + "epoch": 0.25, + "grad_norm": 1.5331274496555873, + "learning_rate": 8.760682634579023e-06, + "loss": 0.3407, + "step": 8694 + }, + { + "epoch": 0.25, + "grad_norm": 1.5104441865399911, + "learning_rate": 8.76037307720616e-06, + "loss": 0.3521, + "step": 8695 + }, + { + "epoch": 0.25, + "grad_norm": 1.4659385698365563, + "learning_rate": 8.76006348664796e-06, + "loss": 0.3904, + "step": 8696 + }, + { + "epoch": 0.25, + "grad_norm": 2.2720851205844035, + "learning_rate": 8.759753862907161e-06, + "loss": 0.3701, + "step": 8697 + }, + { + "epoch": 0.25, + "grad_norm": 1.6537259187491242, + "learning_rate": 8.759444205986492e-06, + "loss": 0.3446, + "step": 8698 + }, + { + "epoch": 0.25, + "grad_norm": 2.721580932311322, + "learning_rate": 8.759134515888685e-06, + "loss": 0.3879, + "step": 8699 + }, + { + "epoch": 0.25, + "grad_norm": 3.1429995058287474, + "learning_rate": 8.758824792616473e-06, + "loss": 0.3723, + "step": 8700 + }, + { + "epoch": 0.25, + "grad_norm": 1.8696070947174348, + "learning_rate": 8.758515036172593e-06, + "loss": 0.3616, + "step": 8701 + }, + { + "epoch": 0.25, + "grad_norm": 1.4590584957437567, + "learning_rate": 8.758205246559774e-06, + "loss": 0.3601, + "step": 8702 + }, + { + "epoch": 0.25, + "grad_norm": 1.444936051148754, + "learning_rate": 8.757895423780754e-06, + "loss": 0.3537, + "step": 8703 + }, + { + "epoch": 0.25, + "grad_norm": 1.348868340991508, + "learning_rate": 8.757585567838265e-06, + "loss": 0.366, + "step": 8704 + }, + { + "epoch": 0.25, + "grad_norm": 1.4348485032656784, + "learning_rate": 8.757275678735041e-06, + "loss": 0.3779, + "step": 8705 + }, + { + "epoch": 0.25, + "grad_norm": 1.3502875890336383, + "learning_rate": 8.756965756473818e-06, + "loss": 0.3664, + "step": 8706 + }, + { + "epoch": 0.25, + "grad_norm": 1.6001103975106108, + "learning_rate": 8.75665580105733e-06, + "loss": 0.4054, + "step": 8707 + }, + { + "epoch": 0.25, + "grad_norm": 1.4201927604305233, + "learning_rate": 8.756345812488312e-06, + "loss": 0.3577, + "step": 8708 + }, + { + "epoch": 0.25, + "grad_norm": 1.374697249975299, + "learning_rate": 8.756035790769502e-06, + "loss": 0.3718, + "step": 8709 + }, + { + "epoch": 0.25, + "grad_norm": 1.4855272357966456, + "learning_rate": 8.755725735903634e-06, + "loss": 0.3591, + "step": 8710 + }, + { + "epoch": 0.25, + "grad_norm": 1.4310254352122982, + "learning_rate": 8.755415647893445e-06, + "loss": 0.3447, + "step": 8711 + }, + { + "epoch": 0.25, + "grad_norm": 1.378509715763888, + "learning_rate": 8.755105526741672e-06, + "loss": 0.359, + "step": 8712 + }, + { + "epoch": 0.25, + "grad_norm": 1.5616147131301121, + "learning_rate": 8.754795372451049e-06, + "loss": 0.3503, + "step": 8713 + }, + { + "epoch": 0.25, + "grad_norm": 1.5748374972669432, + "learning_rate": 8.754485185024316e-06, + "loss": 0.369, + "step": 8714 + }, + { + "epoch": 0.25, + "grad_norm": 1.4475541326609132, + "learning_rate": 8.75417496446421e-06, + "loss": 0.3548, + "step": 8715 + }, + { + "epoch": 0.25, + "grad_norm": 1.6271142300950516, + "learning_rate": 8.753864710773468e-06, + "loss": 0.3587, + "step": 8716 + }, + { + "epoch": 0.25, + "grad_norm": 1.749962695475408, + "learning_rate": 8.753554423954828e-06, + "loss": 0.3586, + "step": 8717 + }, + { + "epoch": 0.25, + "grad_norm": 1.567596646564159, + "learning_rate": 8.75324410401103e-06, + "loss": 0.3774, + "step": 8718 + }, + { + "epoch": 0.25, + "grad_norm": 1.8131736936789833, + "learning_rate": 8.75293375094481e-06, + "loss": 0.3456, + "step": 8719 + }, + { + "epoch": 0.25, + "grad_norm": 2.7666431356916688, + "learning_rate": 8.752623364758907e-06, + "loss": 0.3473, + "step": 8720 + }, + { + "epoch": 0.25, + "grad_norm": 1.505289470351088, + "learning_rate": 8.752312945456064e-06, + "loss": 0.3661, + "step": 8721 + }, + { + "epoch": 0.25, + "grad_norm": 1.4612724818646703, + "learning_rate": 8.752002493039014e-06, + "loss": 0.3784, + "step": 8722 + }, + { + "epoch": 0.25, + "grad_norm": 1.3344614017418206, + "learning_rate": 8.751692007510503e-06, + "loss": 0.3406, + "step": 8723 + }, + { + "epoch": 0.25, + "grad_norm": 1.606879004745978, + "learning_rate": 8.751381488873267e-06, + "loss": 0.3565, + "step": 8724 + }, + { + "epoch": 0.25, + "grad_norm": 1.397331566985153, + "learning_rate": 8.751070937130049e-06, + "loss": 0.3796, + "step": 8725 + }, + { + "epoch": 0.25, + "grad_norm": 1.5005160042717627, + "learning_rate": 8.750760352283588e-06, + "loss": 0.3831, + "step": 8726 + }, + { + "epoch": 0.25, + "grad_norm": 1.5262111052657508, + "learning_rate": 8.750449734336626e-06, + "loss": 0.3474, + "step": 8727 + }, + { + "epoch": 0.25, + "grad_norm": 2.478706030821129, + "learning_rate": 8.750139083291902e-06, + "loss": 0.4121, + "step": 8728 + }, + { + "epoch": 0.25, + "grad_norm": 1.4616503727562096, + "learning_rate": 8.74982839915216e-06, + "loss": 0.383, + "step": 8729 + }, + { + "epoch": 0.25, + "grad_norm": 1.3768755391387584, + "learning_rate": 8.749517681920139e-06, + "loss": 0.3763, + "step": 8730 + }, + { + "epoch": 0.25, + "grad_norm": 1.470074039646498, + "learning_rate": 8.749206931598584e-06, + "loss": 0.3373, + "step": 8731 + }, + { + "epoch": 0.25, + "grad_norm": 1.521842533195358, + "learning_rate": 8.748896148190238e-06, + "loss": 0.3557, + "step": 8732 + }, + { + "epoch": 0.25, + "grad_norm": 1.5418487739411957, + "learning_rate": 8.74858533169784e-06, + "loss": 0.3884, + "step": 8733 + }, + { + "epoch": 0.25, + "grad_norm": 1.056864610502713, + "learning_rate": 8.748274482124135e-06, + "loss": 0.6615, + "step": 8734 + }, + { + "epoch": 0.25, + "grad_norm": 1.4475047192593347, + "learning_rate": 8.747963599471867e-06, + "loss": 0.374, + "step": 8735 + }, + { + "epoch": 0.25, + "grad_norm": 1.0203113855787138, + "learning_rate": 8.747652683743777e-06, + "loss": 0.6225, + "step": 8736 + }, + { + "epoch": 0.25, + "grad_norm": 1.3222353727822715, + "learning_rate": 8.747341734942611e-06, + "loss": 0.3747, + "step": 8737 + }, + { + "epoch": 0.25, + "grad_norm": 2.807946393133956, + "learning_rate": 8.747030753071113e-06, + "loss": 0.3586, + "step": 8738 + }, + { + "epoch": 0.25, + "grad_norm": 1.4340825359847817, + "learning_rate": 8.746719738132028e-06, + "loss": 0.3681, + "step": 8739 + }, + { + "epoch": 0.25, + "grad_norm": 1.392685637108941, + "learning_rate": 8.746408690128098e-06, + "loss": 0.35, + "step": 8740 + }, + { + "epoch": 0.25, + "grad_norm": 1.6306104739461218, + "learning_rate": 8.746097609062072e-06, + "loss": 0.3651, + "step": 8741 + }, + { + "epoch": 0.25, + "grad_norm": 1.5772405792126616, + "learning_rate": 8.745786494936691e-06, + "loss": 0.3867, + "step": 8742 + }, + { + "epoch": 0.25, + "grad_norm": 1.266874613337466, + "learning_rate": 8.745475347754704e-06, + "loss": 0.3528, + "step": 8743 + }, + { + "epoch": 0.25, + "grad_norm": 1.952815918758433, + "learning_rate": 8.745164167518853e-06, + "loss": 0.3552, + "step": 8744 + }, + { + "epoch": 0.25, + "grad_norm": 1.8005304884610767, + "learning_rate": 8.744852954231889e-06, + "loss": 0.347, + "step": 8745 + }, + { + "epoch": 0.25, + "grad_norm": 7.0937775256940405, + "learning_rate": 8.744541707896557e-06, + "loss": 0.3808, + "step": 8746 + }, + { + "epoch": 0.25, + "grad_norm": 1.4104563694896832, + "learning_rate": 8.744230428515602e-06, + "loss": 0.3184, + "step": 8747 + }, + { + "epoch": 0.25, + "grad_norm": 1.6624719419973457, + "learning_rate": 8.74391911609177e-06, + "loss": 0.3581, + "step": 8748 + }, + { + "epoch": 0.25, + "grad_norm": 1.3858956923997416, + "learning_rate": 8.743607770627812e-06, + "loss": 0.3456, + "step": 8749 + }, + { + "epoch": 0.25, + "grad_norm": 1.7023441842123308, + "learning_rate": 8.743296392126473e-06, + "loss": 0.3431, + "step": 8750 + }, + { + "epoch": 0.25, + "grad_norm": 1.613400844318302, + "learning_rate": 8.742984980590502e-06, + "loss": 0.3946, + "step": 8751 + }, + { + "epoch": 0.25, + "grad_norm": 1.7572463515497119, + "learning_rate": 8.742673536022647e-06, + "loss": 0.35, + "step": 8752 + }, + { + "epoch": 0.25, + "grad_norm": 1.4139246259626035, + "learning_rate": 8.742362058425658e-06, + "loss": 0.395, + "step": 8753 + }, + { + "epoch": 0.25, + "grad_norm": 1.5991734026327686, + "learning_rate": 8.74205054780228e-06, + "loss": 0.3545, + "step": 8754 + }, + { + "epoch": 0.25, + "grad_norm": 1.5544375407351525, + "learning_rate": 8.741739004155267e-06, + "loss": 0.3454, + "step": 8755 + }, + { + "epoch": 0.25, + "grad_norm": 1.3904874276466925, + "learning_rate": 8.741427427487365e-06, + "loss": 0.355, + "step": 8756 + }, + { + "epoch": 0.25, + "grad_norm": 1.821977426681583, + "learning_rate": 8.741115817801325e-06, + "loss": 0.3427, + "step": 8757 + }, + { + "epoch": 0.25, + "grad_norm": 2.891078035248181, + "learning_rate": 8.740804175099895e-06, + "loss": 0.327, + "step": 8758 + }, + { + "epoch": 0.25, + "grad_norm": 1.5928132474301513, + "learning_rate": 8.740492499385826e-06, + "loss": 0.3304, + "step": 8759 + }, + { + "epoch": 0.25, + "grad_norm": 1.4153900350520716, + "learning_rate": 8.74018079066187e-06, + "loss": 0.3633, + "step": 8760 + }, + { + "epoch": 0.25, + "grad_norm": 1.9398314425155188, + "learning_rate": 8.739869048930779e-06, + "loss": 0.3712, + "step": 8761 + }, + { + "epoch": 0.25, + "grad_norm": 1.5534487804641488, + "learning_rate": 8.739557274195298e-06, + "loss": 0.3629, + "step": 8762 + }, + { + "epoch": 0.25, + "grad_norm": 1.56898896729511, + "learning_rate": 8.739245466458187e-06, + "loss": 0.3953, + "step": 8763 + }, + { + "epoch": 0.25, + "grad_norm": 1.470725202701764, + "learning_rate": 8.73893362572219e-06, + "loss": 0.3635, + "step": 8764 + }, + { + "epoch": 0.25, + "grad_norm": 1.4893413665715003, + "learning_rate": 8.738621751990064e-06, + "loss": 0.3476, + "step": 8765 + }, + { + "epoch": 0.25, + "grad_norm": 1.6921965790439968, + "learning_rate": 8.738309845264558e-06, + "loss": 0.3605, + "step": 8766 + }, + { + "epoch": 0.25, + "grad_norm": 1.3295202753510347, + "learning_rate": 8.737997905548428e-06, + "loss": 0.3639, + "step": 8767 + }, + { + "epoch": 0.25, + "grad_norm": 1.5275547200750128, + "learning_rate": 8.737685932844424e-06, + "loss": 0.3714, + "step": 8768 + }, + { + "epoch": 0.25, + "grad_norm": 1.314147863674502, + "learning_rate": 8.7373739271553e-06, + "loss": 0.3522, + "step": 8769 + }, + { + "epoch": 0.25, + "grad_norm": 1.3306935062066367, + "learning_rate": 8.73706188848381e-06, + "loss": 0.3463, + "step": 8770 + }, + { + "epoch": 0.25, + "grad_norm": 1.5478015041010178, + "learning_rate": 8.736749816832709e-06, + "loss": 0.3795, + "step": 8771 + }, + { + "epoch": 0.25, + "grad_norm": 1.4465640788559526, + "learning_rate": 8.736437712204746e-06, + "loss": 0.378, + "step": 8772 + }, + { + "epoch": 0.25, + "grad_norm": 1.4416582626225538, + "learning_rate": 8.73612557460268e-06, + "loss": 0.3588, + "step": 8773 + }, + { + "epoch": 0.25, + "grad_norm": 1.3861649030785272, + "learning_rate": 8.735813404029267e-06, + "loss": 0.3644, + "step": 8774 + }, + { + "epoch": 0.25, + "grad_norm": 1.9673079215441813, + "learning_rate": 8.735501200487256e-06, + "loss": 0.3925, + "step": 8775 + }, + { + "epoch": 0.25, + "grad_norm": 1.6190278774041866, + "learning_rate": 8.735188963979406e-06, + "loss": 0.3878, + "step": 8776 + }, + { + "epoch": 0.25, + "grad_norm": 1.4872068187741811, + "learning_rate": 8.734876694508474e-06, + "loss": 0.3564, + "step": 8777 + }, + { + "epoch": 0.25, + "grad_norm": 0.9803683247062717, + "learning_rate": 8.734564392077212e-06, + "loss": 0.6436, + "step": 8778 + }, + { + "epoch": 0.25, + "grad_norm": 1.4821642329283502, + "learning_rate": 8.734252056688377e-06, + "loss": 0.344, + "step": 8779 + }, + { + "epoch": 0.25, + "grad_norm": 1.4718998659481572, + "learning_rate": 8.733939688344727e-06, + "loss": 0.3586, + "step": 8780 + }, + { + "epoch": 0.25, + "grad_norm": 1.5884321573120264, + "learning_rate": 8.73362728704902e-06, + "loss": 0.3577, + "step": 8781 + }, + { + "epoch": 0.25, + "grad_norm": 1.471772356482991, + "learning_rate": 8.73331485280401e-06, + "loss": 0.3614, + "step": 8782 + }, + { + "epoch": 0.25, + "grad_norm": 1.3888280241400162, + "learning_rate": 8.733002385612452e-06, + "loss": 0.3343, + "step": 8783 + }, + { + "epoch": 0.25, + "grad_norm": 1.6360544603898626, + "learning_rate": 8.732689885477108e-06, + "loss": 0.3613, + "step": 8784 + }, + { + "epoch": 0.25, + "grad_norm": 3.0994844345684904, + "learning_rate": 8.732377352400736e-06, + "loss": 0.3651, + "step": 8785 + }, + { + "epoch": 0.25, + "grad_norm": 1.4955813739303097, + "learning_rate": 8.73206478638609e-06, + "loss": 0.345, + "step": 8786 + }, + { + "epoch": 0.25, + "grad_norm": 2.058084569856631, + "learning_rate": 8.731752187435933e-06, + "loss": 0.3982, + "step": 8787 + }, + { + "epoch": 0.25, + "grad_norm": 1.5949981240753015, + "learning_rate": 8.73143955555302e-06, + "loss": 0.3913, + "step": 8788 + }, + { + "epoch": 0.25, + "grad_norm": 1.0216181442036012, + "learning_rate": 8.731126890740113e-06, + "loss": 0.5878, + "step": 8789 + }, + { + "epoch": 0.25, + "grad_norm": 2.164477392147943, + "learning_rate": 8.730814192999968e-06, + "loss": 0.3499, + "step": 8790 + }, + { + "epoch": 0.25, + "grad_norm": 1.6243148437231743, + "learning_rate": 8.730501462335348e-06, + "loss": 0.3526, + "step": 8791 + }, + { + "epoch": 0.26, + "grad_norm": 1.3103266138521665, + "learning_rate": 8.730188698749012e-06, + "loss": 0.3672, + "step": 8792 + }, + { + "epoch": 0.26, + "grad_norm": 1.3024476254272301, + "learning_rate": 8.729875902243716e-06, + "loss": 0.3451, + "step": 8793 + }, + { + "epoch": 0.26, + "grad_norm": 1.4029264942260393, + "learning_rate": 8.729563072822227e-06, + "loss": 0.395, + "step": 8794 + }, + { + "epoch": 0.26, + "grad_norm": 1.536790112582566, + "learning_rate": 8.7292502104873e-06, + "loss": 0.3767, + "step": 8795 + }, + { + "epoch": 0.26, + "grad_norm": 1.4578409804715304, + "learning_rate": 8.7289373152417e-06, + "loss": 0.3901, + "step": 8796 + }, + { + "epoch": 0.26, + "grad_norm": 2.0973204177394624, + "learning_rate": 8.728624387088187e-06, + "loss": 0.3427, + "step": 8797 + }, + { + "epoch": 0.26, + "grad_norm": 0.9598520086815299, + "learning_rate": 8.728311426029521e-06, + "loss": 0.5773, + "step": 8798 + }, + { + "epoch": 0.26, + "grad_norm": 1.398949336962567, + "learning_rate": 8.727998432068467e-06, + "loss": 0.3551, + "step": 8799 + }, + { + "epoch": 0.26, + "grad_norm": 1.7000077348127087, + "learning_rate": 8.727685405207784e-06, + "loss": 0.3817, + "step": 8800 + }, + { + "epoch": 0.26, + "grad_norm": 2.8896328052397786, + "learning_rate": 8.727372345450238e-06, + "loss": 0.3501, + "step": 8801 + }, + { + "epoch": 0.26, + "grad_norm": 1.367037562830666, + "learning_rate": 8.727059252798588e-06, + "loss": 0.3786, + "step": 8802 + }, + { + "epoch": 0.26, + "grad_norm": 2.172549537771034, + "learning_rate": 8.7267461272556e-06, + "loss": 0.3655, + "step": 8803 + }, + { + "epoch": 0.26, + "grad_norm": 1.4008110662032058, + "learning_rate": 8.726432968824034e-06, + "loss": 0.3688, + "step": 8804 + }, + { + "epoch": 0.26, + "grad_norm": 1.4362978891389586, + "learning_rate": 8.726119777506658e-06, + "loss": 0.3665, + "step": 8805 + }, + { + "epoch": 0.26, + "grad_norm": 1.6170761670667113, + "learning_rate": 8.725806553306233e-06, + "loss": 0.3759, + "step": 8806 + }, + { + "epoch": 0.26, + "grad_norm": 1.6057353981837157, + "learning_rate": 8.725493296225522e-06, + "loss": 0.3577, + "step": 8807 + }, + { + "epoch": 0.26, + "grad_norm": 1.5264667944864458, + "learning_rate": 8.725180006267293e-06, + "loss": 0.3673, + "step": 8808 + }, + { + "epoch": 0.26, + "grad_norm": 1.525376903156184, + "learning_rate": 8.72486668343431e-06, + "loss": 0.3344, + "step": 8809 + }, + { + "epoch": 0.26, + "grad_norm": 1.6884420074223734, + "learning_rate": 8.724553327729335e-06, + "loss": 0.3624, + "step": 8810 + }, + { + "epoch": 0.26, + "grad_norm": 1.0024283453152274, + "learning_rate": 8.724239939155136e-06, + "loss": 0.5932, + "step": 8811 + }, + { + "epoch": 0.26, + "grad_norm": 1.606789883846878, + "learning_rate": 8.72392651771448e-06, + "loss": 0.375, + "step": 8812 + }, + { + "epoch": 0.26, + "grad_norm": 7.441087197207663, + "learning_rate": 8.72361306341013e-06, + "loss": 0.3465, + "step": 8813 + }, + { + "epoch": 0.26, + "grad_norm": 1.4088603486818307, + "learning_rate": 8.723299576244853e-06, + "loss": 0.3668, + "step": 8814 + }, + { + "epoch": 0.26, + "grad_norm": 1.6266596680045553, + "learning_rate": 8.722986056221417e-06, + "loss": 0.3428, + "step": 8815 + }, + { + "epoch": 0.26, + "grad_norm": 1.5202102362356849, + "learning_rate": 8.722672503342587e-06, + "loss": 0.3481, + "step": 8816 + }, + { + "epoch": 0.26, + "grad_norm": 1.4181009805308682, + "learning_rate": 8.72235891761113e-06, + "loss": 0.3805, + "step": 8817 + }, + { + "epoch": 0.26, + "grad_norm": 1.8022230364803464, + "learning_rate": 8.722045299029817e-06, + "loss": 0.4013, + "step": 8818 + }, + { + "epoch": 0.26, + "grad_norm": 1.7430727892423, + "learning_rate": 8.72173164760141e-06, + "loss": 0.4034, + "step": 8819 + }, + { + "epoch": 0.26, + "grad_norm": 1.6077601584021357, + "learning_rate": 8.721417963328683e-06, + "loss": 0.3406, + "step": 8820 + }, + { + "epoch": 0.26, + "grad_norm": 1.4465014710444195, + "learning_rate": 8.721104246214398e-06, + "loss": 0.3641, + "step": 8821 + }, + { + "epoch": 0.26, + "grad_norm": 2.6925542065362946, + "learning_rate": 8.720790496261329e-06, + "loss": 0.3566, + "step": 8822 + }, + { + "epoch": 0.26, + "grad_norm": 1.4905763190943258, + "learning_rate": 8.720476713472242e-06, + "loss": 0.3903, + "step": 8823 + }, + { + "epoch": 0.26, + "grad_norm": 2.141363280829378, + "learning_rate": 8.720162897849906e-06, + "loss": 0.3474, + "step": 8824 + }, + { + "epoch": 0.26, + "grad_norm": 1.7188560418825767, + "learning_rate": 8.719849049397091e-06, + "loss": 0.3684, + "step": 8825 + }, + { + "epoch": 0.26, + "grad_norm": 1.3873306414409017, + "learning_rate": 8.719535168116568e-06, + "loss": 0.3449, + "step": 8826 + }, + { + "epoch": 0.26, + "grad_norm": 1.7534427149471583, + "learning_rate": 8.719221254011105e-06, + "loss": 0.3747, + "step": 8827 + }, + { + "epoch": 0.26, + "grad_norm": 1.6396023805446096, + "learning_rate": 8.718907307083473e-06, + "loss": 0.3691, + "step": 8828 + }, + { + "epoch": 0.26, + "grad_norm": 1.6341682355271765, + "learning_rate": 8.718593327336444e-06, + "loss": 0.3671, + "step": 8829 + }, + { + "epoch": 0.26, + "grad_norm": 1.3874827408469912, + "learning_rate": 8.718279314772786e-06, + "loss": 0.3631, + "step": 8830 + }, + { + "epoch": 0.26, + "grad_norm": 1.7747868975345509, + "learning_rate": 8.717965269395272e-06, + "loss": 0.3801, + "step": 8831 + }, + { + "epoch": 0.26, + "grad_norm": 1.7163890422236718, + "learning_rate": 8.717651191206675e-06, + "loss": 0.3589, + "step": 8832 + }, + { + "epoch": 0.26, + "grad_norm": 1.5131758224871292, + "learning_rate": 8.717337080209765e-06, + "loss": 0.3418, + "step": 8833 + }, + { + "epoch": 0.26, + "grad_norm": 1.5394392531063497, + "learning_rate": 8.717022936407312e-06, + "loss": 0.3541, + "step": 8834 + }, + { + "epoch": 0.26, + "grad_norm": 1.5167644346253026, + "learning_rate": 8.71670875980209e-06, + "loss": 0.3607, + "step": 8835 + }, + { + "epoch": 0.26, + "grad_norm": 1.3088821129149562, + "learning_rate": 8.716394550396873e-06, + "loss": 0.3487, + "step": 8836 + }, + { + "epoch": 0.26, + "grad_norm": 1.5874241513822254, + "learning_rate": 8.716080308194435e-06, + "loss": 0.3888, + "step": 8837 + }, + { + "epoch": 0.26, + "grad_norm": 1.8287347163011767, + "learning_rate": 8.715766033197545e-06, + "loss": 0.3507, + "step": 8838 + }, + { + "epoch": 0.26, + "grad_norm": 1.3963010889777832, + "learning_rate": 8.715451725408978e-06, + "loss": 0.4254, + "step": 8839 + }, + { + "epoch": 0.26, + "grad_norm": 1.5400350691512623, + "learning_rate": 8.715137384831508e-06, + "loss": 0.3738, + "step": 8840 + }, + { + "epoch": 0.26, + "grad_norm": 2.045862953436329, + "learning_rate": 8.714823011467913e-06, + "loss": 0.3396, + "step": 8841 + }, + { + "epoch": 0.26, + "grad_norm": 1.4013438750605973, + "learning_rate": 8.714508605320959e-06, + "loss": 0.3834, + "step": 8842 + }, + { + "epoch": 0.26, + "grad_norm": 1.7438235945770544, + "learning_rate": 8.714194166393428e-06, + "loss": 0.3417, + "step": 8843 + }, + { + "epoch": 0.26, + "grad_norm": 1.4569362018842675, + "learning_rate": 8.713879694688092e-06, + "loss": 0.375, + "step": 8844 + }, + { + "epoch": 0.26, + "grad_norm": 1.3792605388880925, + "learning_rate": 8.713565190207726e-06, + "loss": 0.3624, + "step": 8845 + }, + { + "epoch": 0.26, + "grad_norm": 1.792344379010317, + "learning_rate": 8.713250652955107e-06, + "loss": 0.3648, + "step": 8846 + }, + { + "epoch": 0.26, + "grad_norm": 1.895143825872831, + "learning_rate": 8.712936082933007e-06, + "loss": 0.3409, + "step": 8847 + }, + { + "epoch": 0.26, + "grad_norm": 1.5483033559689487, + "learning_rate": 8.712621480144208e-06, + "loss": 0.3833, + "step": 8848 + }, + { + "epoch": 0.26, + "grad_norm": 1.5503149523631465, + "learning_rate": 8.712306844591483e-06, + "loss": 0.3465, + "step": 8849 + }, + { + "epoch": 0.26, + "grad_norm": 1.4951333216086975, + "learning_rate": 8.711992176277608e-06, + "loss": 0.3443, + "step": 8850 + }, + { + "epoch": 0.26, + "grad_norm": 1.4853329813728513, + "learning_rate": 8.71167747520536e-06, + "loss": 0.3563, + "step": 8851 + }, + { + "epoch": 0.26, + "grad_norm": 1.6255351776512434, + "learning_rate": 8.711362741377519e-06, + "loss": 0.3653, + "step": 8852 + }, + { + "epoch": 0.26, + "grad_norm": 1.4611893145153025, + "learning_rate": 8.71104797479686e-06, + "loss": 0.3764, + "step": 8853 + }, + { + "epoch": 0.26, + "grad_norm": 1.9794844397604237, + "learning_rate": 8.710733175466161e-06, + "loss": 0.3604, + "step": 8854 + }, + { + "epoch": 0.26, + "grad_norm": 1.6302339529641185, + "learning_rate": 8.7104183433882e-06, + "loss": 0.3554, + "step": 8855 + }, + { + "epoch": 0.26, + "grad_norm": 1.3866182242594218, + "learning_rate": 8.710103478565756e-06, + "loss": 0.3576, + "step": 8856 + }, + { + "epoch": 0.26, + "grad_norm": 1.4802734652597942, + "learning_rate": 8.709788581001608e-06, + "loss": 0.3627, + "step": 8857 + }, + { + "epoch": 0.26, + "grad_norm": 1.3144500017832386, + "learning_rate": 8.709473650698536e-06, + "loss": 0.3278, + "step": 8858 + }, + { + "epoch": 0.26, + "grad_norm": 1.5620618362445566, + "learning_rate": 8.709158687659317e-06, + "loss": 0.3316, + "step": 8859 + }, + { + "epoch": 0.26, + "grad_norm": 1.4435671940748396, + "learning_rate": 8.708843691886731e-06, + "loss": 0.36, + "step": 8860 + }, + { + "epoch": 0.26, + "grad_norm": 1.5998917246671387, + "learning_rate": 8.70852866338356e-06, + "loss": 0.3547, + "step": 8861 + }, + { + "epoch": 0.26, + "grad_norm": 1.3101237385243956, + "learning_rate": 8.70821360215258e-06, + "loss": 0.3358, + "step": 8862 + }, + { + "epoch": 0.26, + "grad_norm": 1.3672910630869202, + "learning_rate": 8.707898508196575e-06, + "loss": 0.3509, + "step": 8863 + }, + { + "epoch": 0.26, + "grad_norm": 1.6175488239376024, + "learning_rate": 8.707583381518323e-06, + "loss": 0.3275, + "step": 8864 + }, + { + "epoch": 0.26, + "grad_norm": 1.410477090498158, + "learning_rate": 8.707268222120609e-06, + "loss": 0.3537, + "step": 8865 + }, + { + "epoch": 0.26, + "grad_norm": 1.5499733655985124, + "learning_rate": 8.70695303000621e-06, + "loss": 0.3827, + "step": 8866 + }, + { + "epoch": 0.26, + "grad_norm": 0.992317676390753, + "learning_rate": 8.70663780517791e-06, + "loss": 0.6352, + "step": 8867 + }, + { + "epoch": 0.26, + "grad_norm": 2.0291709631885415, + "learning_rate": 8.70632254763849e-06, + "loss": 0.3388, + "step": 8868 + }, + { + "epoch": 0.26, + "grad_norm": 1.7739741465614915, + "learning_rate": 8.706007257390733e-06, + "loss": 0.3492, + "step": 8869 + }, + { + "epoch": 0.26, + "grad_norm": 1.4521308312945804, + "learning_rate": 8.70569193443742e-06, + "loss": 0.3491, + "step": 8870 + }, + { + "epoch": 0.26, + "grad_norm": 1.5545591202324283, + "learning_rate": 8.705376578781335e-06, + "loss": 0.3851, + "step": 8871 + }, + { + "epoch": 0.26, + "grad_norm": 1.5978514811719553, + "learning_rate": 8.705061190425262e-06, + "loss": 0.3981, + "step": 8872 + }, + { + "epoch": 0.26, + "grad_norm": 1.4259810248406963, + "learning_rate": 8.70474576937198e-06, + "loss": 0.3435, + "step": 8873 + }, + { + "epoch": 0.26, + "grad_norm": 1.615788460377532, + "learning_rate": 8.704430315624278e-06, + "loss": 0.3349, + "step": 8874 + }, + { + "epoch": 0.26, + "grad_norm": 1.5220547706048677, + "learning_rate": 8.704114829184935e-06, + "loss": 0.3503, + "step": 8875 + }, + { + "epoch": 0.26, + "grad_norm": 1.525167114714541, + "learning_rate": 8.70379931005674e-06, + "loss": 0.4061, + "step": 8876 + }, + { + "epoch": 0.26, + "grad_norm": 1.5169106796418277, + "learning_rate": 8.703483758242474e-06, + "loss": 0.372, + "step": 8877 + }, + { + "epoch": 0.26, + "grad_norm": 1.5873521215262376, + "learning_rate": 8.703168173744922e-06, + "loss": 0.3463, + "step": 8878 + }, + { + "epoch": 0.26, + "grad_norm": 1.3836476102881068, + "learning_rate": 8.70285255656687e-06, + "loss": 0.3652, + "step": 8879 + }, + { + "epoch": 0.26, + "grad_norm": 1.4607495113591284, + "learning_rate": 8.702536906711104e-06, + "loss": 0.3786, + "step": 8880 + }, + { + "epoch": 0.26, + "grad_norm": 1.5794042522856468, + "learning_rate": 8.702221224180409e-06, + "loss": 0.3474, + "step": 8881 + }, + { + "epoch": 0.26, + "grad_norm": 1.5389735264445408, + "learning_rate": 8.701905508977569e-06, + "loss": 0.35, + "step": 8882 + }, + { + "epoch": 0.26, + "grad_norm": 1.3650695240732715, + "learning_rate": 8.701589761105373e-06, + "loss": 0.3533, + "step": 8883 + }, + { + "epoch": 0.26, + "grad_norm": 2.047453495651834, + "learning_rate": 8.701273980566608e-06, + "loss": 0.3449, + "step": 8884 + }, + { + "epoch": 0.26, + "grad_norm": 1.6591155136299092, + "learning_rate": 8.700958167364055e-06, + "loss": 0.3681, + "step": 8885 + }, + { + "epoch": 0.26, + "grad_norm": 1.403430132127772, + "learning_rate": 8.700642321500507e-06, + "loss": 0.3561, + "step": 8886 + }, + { + "epoch": 0.26, + "grad_norm": 1.5434570132361274, + "learning_rate": 8.700326442978751e-06, + "loss": 0.3362, + "step": 8887 + }, + { + "epoch": 0.26, + "grad_norm": 1.9279137896204048, + "learning_rate": 8.700010531801571e-06, + "loss": 0.366, + "step": 8888 + }, + { + "epoch": 0.26, + "grad_norm": 1.4881425397878738, + "learning_rate": 8.699694587971758e-06, + "loss": 0.351, + "step": 8889 + }, + { + "epoch": 0.26, + "grad_norm": 1.5274050542606057, + "learning_rate": 8.6993786114921e-06, + "loss": 0.3569, + "step": 8890 + }, + { + "epoch": 0.26, + "grad_norm": 5.158929774663165, + "learning_rate": 8.699062602365382e-06, + "loss": 0.3298, + "step": 8891 + }, + { + "epoch": 0.26, + "grad_norm": 0.9827359014584041, + "learning_rate": 8.698746560594398e-06, + "loss": 0.5601, + "step": 8892 + }, + { + "epoch": 0.26, + "grad_norm": 1.6260635468794775, + "learning_rate": 8.698430486181933e-06, + "loss": 0.3852, + "step": 8893 + }, + { + "epoch": 0.26, + "grad_norm": 1.49612222390617, + "learning_rate": 8.69811437913078e-06, + "loss": 0.3469, + "step": 8894 + }, + { + "epoch": 0.26, + "grad_norm": 1.3841408059306073, + "learning_rate": 8.697798239443725e-06, + "loss": 0.37, + "step": 8895 + }, + { + "epoch": 0.26, + "grad_norm": 0.9028110196214765, + "learning_rate": 8.69748206712356e-06, + "loss": 0.6156, + "step": 8896 + }, + { + "epoch": 0.26, + "grad_norm": 1.3077462093596992, + "learning_rate": 8.697165862173073e-06, + "loss": 0.3356, + "step": 8897 + }, + { + "epoch": 0.26, + "grad_norm": 1.693652776953721, + "learning_rate": 8.696849624595058e-06, + "loss": 0.3696, + "step": 8898 + }, + { + "epoch": 0.26, + "grad_norm": 1.525020236158971, + "learning_rate": 8.696533354392304e-06, + "loss": 0.3688, + "step": 8899 + }, + { + "epoch": 0.26, + "grad_norm": 1.9891418778982268, + "learning_rate": 8.696217051567603e-06, + "loss": 0.4104, + "step": 8900 + }, + { + "epoch": 0.26, + "grad_norm": 1.2830835005830457, + "learning_rate": 8.695900716123744e-06, + "loss": 0.3526, + "step": 8901 + }, + { + "epoch": 0.26, + "grad_norm": 1.5079354328594992, + "learning_rate": 8.695584348063522e-06, + "loss": 0.3489, + "step": 8902 + }, + { + "epoch": 0.26, + "grad_norm": 1.8684489169456266, + "learning_rate": 8.695267947389725e-06, + "loss": 0.3399, + "step": 8903 + }, + { + "epoch": 0.26, + "grad_norm": 1.5916596980678126, + "learning_rate": 8.694951514105148e-06, + "loss": 0.369, + "step": 8904 + }, + { + "epoch": 0.26, + "grad_norm": 1.8480736432789406, + "learning_rate": 8.694635048212582e-06, + "loss": 0.3737, + "step": 8905 + }, + { + "epoch": 0.26, + "grad_norm": 1.4508159083160241, + "learning_rate": 8.694318549714823e-06, + "loss": 0.3472, + "step": 8906 + }, + { + "epoch": 0.26, + "grad_norm": 1.4551983113708227, + "learning_rate": 8.694002018614661e-06, + "loss": 0.3409, + "step": 8907 + }, + { + "epoch": 0.26, + "grad_norm": 1.2842302202177844, + "learning_rate": 8.693685454914889e-06, + "loss": 0.3563, + "step": 8908 + }, + { + "epoch": 0.26, + "grad_norm": 1.5816911430898315, + "learning_rate": 8.693368858618303e-06, + "loss": 0.3741, + "step": 8909 + }, + { + "epoch": 0.26, + "grad_norm": 2.014281916002831, + "learning_rate": 8.693052229727697e-06, + "loss": 0.3483, + "step": 8910 + }, + { + "epoch": 0.26, + "grad_norm": 1.5519506758177926, + "learning_rate": 8.69273556824586e-06, + "loss": 0.3276, + "step": 8911 + }, + { + "epoch": 0.26, + "grad_norm": 1.621346920986559, + "learning_rate": 8.692418874175594e-06, + "loss": 0.3637, + "step": 8912 + }, + { + "epoch": 0.26, + "grad_norm": 1.3356646748090648, + "learning_rate": 8.692102147519692e-06, + "loss": 0.3286, + "step": 8913 + }, + { + "epoch": 0.26, + "grad_norm": 1.4964192399236491, + "learning_rate": 8.691785388280944e-06, + "loss": 0.3628, + "step": 8914 + }, + { + "epoch": 0.26, + "grad_norm": 1.4952334412512247, + "learning_rate": 8.691468596462152e-06, + "loss": 0.4097, + "step": 8915 + }, + { + "epoch": 0.26, + "grad_norm": 1.3392886191511861, + "learning_rate": 8.691151772066105e-06, + "loss": 0.3532, + "step": 8916 + }, + { + "epoch": 0.26, + "grad_norm": 1.5877440178744304, + "learning_rate": 8.690834915095605e-06, + "loss": 0.3366, + "step": 8917 + }, + { + "epoch": 0.26, + "grad_norm": 1.9825399233755248, + "learning_rate": 8.690518025553446e-06, + "loss": 0.3571, + "step": 8918 + }, + { + "epoch": 0.26, + "grad_norm": 1.6255753460481626, + "learning_rate": 8.690201103442424e-06, + "loss": 0.3711, + "step": 8919 + }, + { + "epoch": 0.26, + "grad_norm": 1.3775051086324377, + "learning_rate": 8.689884148765336e-06, + "loss": 0.3578, + "step": 8920 + }, + { + "epoch": 0.26, + "grad_norm": 4.5386686210673926, + "learning_rate": 8.68956716152498e-06, + "loss": 0.3593, + "step": 8921 + }, + { + "epoch": 0.26, + "grad_norm": 1.3051865790265582, + "learning_rate": 8.689250141724153e-06, + "loss": 0.3378, + "step": 8922 + }, + { + "epoch": 0.26, + "grad_norm": 1.4454387744762023, + "learning_rate": 8.688933089365651e-06, + "loss": 0.3788, + "step": 8923 + }, + { + "epoch": 0.26, + "grad_norm": 1.5530708626858314, + "learning_rate": 8.688616004452277e-06, + "loss": 0.341, + "step": 8924 + }, + { + "epoch": 0.26, + "grad_norm": 1.5253765693577728, + "learning_rate": 8.688298886986823e-06, + "loss": 0.3762, + "step": 8925 + }, + { + "epoch": 0.26, + "grad_norm": 1.211395230971729, + "learning_rate": 8.687981736972092e-06, + "loss": 0.3205, + "step": 8926 + }, + { + "epoch": 0.26, + "grad_norm": 1.5946563063560995, + "learning_rate": 8.68766455441088e-06, + "loss": 0.3473, + "step": 8927 + }, + { + "epoch": 0.26, + "grad_norm": 1.2801712025837046, + "learning_rate": 8.687347339305987e-06, + "loss": 0.3495, + "step": 8928 + }, + { + "epoch": 0.26, + "grad_norm": 1.9741499729454317, + "learning_rate": 8.687030091660214e-06, + "loss": 0.3686, + "step": 8929 + }, + { + "epoch": 0.26, + "grad_norm": 1.2084104295158633, + "learning_rate": 8.686712811476358e-06, + "loss": 0.3189, + "step": 8930 + }, + { + "epoch": 0.26, + "grad_norm": 1.4093174349577817, + "learning_rate": 8.686395498757223e-06, + "loss": 0.3492, + "step": 8931 + }, + { + "epoch": 0.26, + "grad_norm": 1.4429534457931543, + "learning_rate": 8.686078153505605e-06, + "loss": 0.3618, + "step": 8932 + }, + { + "epoch": 0.26, + "grad_norm": 1.5216965122084374, + "learning_rate": 8.685760775724308e-06, + "loss": 0.3381, + "step": 8933 + }, + { + "epoch": 0.26, + "grad_norm": 1.3477219616201581, + "learning_rate": 8.685443365416132e-06, + "loss": 0.3916, + "step": 8934 + }, + { + "epoch": 0.26, + "grad_norm": 1.857421122418383, + "learning_rate": 8.685125922583874e-06, + "loss": 0.3541, + "step": 8935 + }, + { + "epoch": 0.26, + "grad_norm": 1.487379101011675, + "learning_rate": 8.68480844723034e-06, + "loss": 0.3584, + "step": 8936 + }, + { + "epoch": 0.26, + "grad_norm": 1.316859526901685, + "learning_rate": 8.684490939358333e-06, + "loss": 0.3665, + "step": 8937 + }, + { + "epoch": 0.26, + "grad_norm": 1.4703226735913795, + "learning_rate": 8.684173398970648e-06, + "loss": 0.3388, + "step": 8938 + }, + { + "epoch": 0.26, + "grad_norm": 3.32373224796239, + "learning_rate": 8.683855826070095e-06, + "loss": 0.3613, + "step": 8939 + }, + { + "epoch": 0.26, + "grad_norm": 1.325027179130592, + "learning_rate": 8.683538220659474e-06, + "loss": 0.3674, + "step": 8940 + }, + { + "epoch": 0.26, + "grad_norm": 1.7982000609911941, + "learning_rate": 8.683220582741585e-06, + "loss": 0.3407, + "step": 8941 + }, + { + "epoch": 0.26, + "grad_norm": 1.3654720866473127, + "learning_rate": 8.682902912319235e-06, + "loss": 0.3597, + "step": 8942 + }, + { + "epoch": 0.26, + "grad_norm": 1.4455869923418865, + "learning_rate": 8.682585209395226e-06, + "loss": 0.3552, + "step": 8943 + }, + { + "epoch": 0.26, + "grad_norm": 1.472690335598986, + "learning_rate": 8.682267473972362e-06, + "loss": 0.3592, + "step": 8944 + }, + { + "epoch": 0.26, + "grad_norm": 1.4918289836118044, + "learning_rate": 8.681949706053446e-06, + "loss": 0.3305, + "step": 8945 + }, + { + "epoch": 0.26, + "grad_norm": 1.327517568984424, + "learning_rate": 8.681631905641283e-06, + "loss": 0.3426, + "step": 8946 + }, + { + "epoch": 0.26, + "grad_norm": 1.3930382436741675, + "learning_rate": 8.681314072738678e-06, + "loss": 0.3452, + "step": 8947 + }, + { + "epoch": 0.26, + "grad_norm": 1.3663133358506783, + "learning_rate": 8.680996207348435e-06, + "loss": 0.377, + "step": 8948 + }, + { + "epoch": 0.26, + "grad_norm": 1.4376154982070555, + "learning_rate": 8.68067830947336e-06, + "loss": 0.3374, + "step": 8949 + }, + { + "epoch": 0.26, + "grad_norm": 1.4408155377046892, + "learning_rate": 8.680360379116258e-06, + "loss": 0.3708, + "step": 8950 + }, + { + "epoch": 0.26, + "grad_norm": 1.7668137651043172, + "learning_rate": 8.680042416279934e-06, + "loss": 0.339, + "step": 8951 + }, + { + "epoch": 0.26, + "grad_norm": 1.2520914257281288, + "learning_rate": 8.679724420967197e-06, + "loss": 0.3192, + "step": 8952 + }, + { + "epoch": 0.26, + "grad_norm": 1.4666282005734368, + "learning_rate": 8.67940639318085e-06, + "loss": 0.3708, + "step": 8953 + }, + { + "epoch": 0.26, + "grad_norm": 1.3316524232033742, + "learning_rate": 8.6790883329237e-06, + "loss": 0.3797, + "step": 8954 + }, + { + "epoch": 0.26, + "grad_norm": 1.4467160891916702, + "learning_rate": 8.678770240198556e-06, + "loss": 0.3443, + "step": 8955 + }, + { + "epoch": 0.26, + "grad_norm": 1.6870654282430864, + "learning_rate": 8.678452115008223e-06, + "loss": 0.3631, + "step": 8956 + }, + { + "epoch": 0.26, + "grad_norm": 1.3916559875366081, + "learning_rate": 8.678133957355508e-06, + "loss": 0.3618, + "step": 8957 + }, + { + "epoch": 0.26, + "grad_norm": 1.4296225063203645, + "learning_rate": 8.677815767243222e-06, + "loss": 0.3669, + "step": 8958 + }, + { + "epoch": 0.26, + "grad_norm": 1.7501890822862531, + "learning_rate": 8.67749754467417e-06, + "loss": 0.388, + "step": 8959 + }, + { + "epoch": 0.26, + "grad_norm": 1.5915641805663292, + "learning_rate": 8.677179289651163e-06, + "loss": 0.3769, + "step": 8960 + }, + { + "epoch": 0.26, + "grad_norm": 1.5069144042534246, + "learning_rate": 8.676861002177007e-06, + "loss": 0.3552, + "step": 8961 + }, + { + "epoch": 0.26, + "grad_norm": 1.5826931956994912, + "learning_rate": 8.676542682254509e-06, + "loss": 0.3384, + "step": 8962 + }, + { + "epoch": 0.26, + "grad_norm": 2.13251600527443, + "learning_rate": 8.676224329886484e-06, + "loss": 0.3427, + "step": 8963 + }, + { + "epoch": 0.26, + "grad_norm": 1.3195282030771014, + "learning_rate": 8.67590594507574e-06, + "loss": 0.357, + "step": 8964 + }, + { + "epoch": 0.26, + "grad_norm": 1.032879412702867, + "learning_rate": 8.675587527825083e-06, + "loss": 0.5932, + "step": 8965 + }, + { + "epoch": 0.26, + "grad_norm": 0.9481499343548314, + "learning_rate": 8.675269078137326e-06, + "loss": 0.596, + "step": 8966 + }, + { + "epoch": 0.26, + "grad_norm": 0.9236856922257913, + "learning_rate": 8.674950596015276e-06, + "loss": 0.6452, + "step": 8967 + }, + { + "epoch": 0.26, + "grad_norm": 1.7765668865149857, + "learning_rate": 8.67463208146175e-06, + "loss": 0.3626, + "step": 8968 + }, + { + "epoch": 0.26, + "grad_norm": 1.5914890303794216, + "learning_rate": 8.674313534479553e-06, + "loss": 0.3736, + "step": 8969 + }, + { + "epoch": 0.26, + "grad_norm": 2.1318988294677257, + "learning_rate": 8.6739949550715e-06, + "loss": 0.3615, + "step": 8970 + }, + { + "epoch": 0.26, + "grad_norm": 1.7228779317431255, + "learning_rate": 8.673676343240399e-06, + "loss": 0.3731, + "step": 8971 + }, + { + "epoch": 0.26, + "grad_norm": 1.3021956219712922, + "learning_rate": 8.673357698989064e-06, + "loss": 0.3584, + "step": 8972 + }, + { + "epoch": 0.26, + "grad_norm": 1.3742153767550445, + "learning_rate": 8.673039022320307e-06, + "loss": 0.3367, + "step": 8973 + }, + { + "epoch": 0.26, + "grad_norm": 1.8611580189546975, + "learning_rate": 8.67272031323694e-06, + "loss": 0.3669, + "step": 8974 + }, + { + "epoch": 0.26, + "grad_norm": 1.5045507589041462, + "learning_rate": 8.672401571741776e-06, + "loss": 0.3677, + "step": 8975 + }, + { + "epoch": 0.26, + "grad_norm": 1.633874125269726, + "learning_rate": 8.672082797837625e-06, + "loss": 0.3733, + "step": 8976 + }, + { + "epoch": 0.26, + "grad_norm": 1.3126625182478173, + "learning_rate": 8.671763991527304e-06, + "loss": 0.3685, + "step": 8977 + }, + { + "epoch": 0.26, + "grad_norm": 1.3777780544943994, + "learning_rate": 8.671445152813627e-06, + "loss": 0.3562, + "step": 8978 + }, + { + "epoch": 0.26, + "grad_norm": 1.7904247651122127, + "learning_rate": 8.671126281699403e-06, + "loss": 0.3678, + "step": 8979 + }, + { + "epoch": 0.26, + "grad_norm": 1.3286854858120352, + "learning_rate": 8.670807378187449e-06, + "loss": 0.3759, + "step": 8980 + }, + { + "epoch": 0.26, + "grad_norm": 1.3793204288242515, + "learning_rate": 8.670488442280581e-06, + "loss": 0.3432, + "step": 8981 + }, + { + "epoch": 0.26, + "grad_norm": 1.453343699177997, + "learning_rate": 8.670169473981611e-06, + "loss": 0.3441, + "step": 8982 + }, + { + "epoch": 0.26, + "grad_norm": 1.4263979272640899, + "learning_rate": 8.669850473293354e-06, + "loss": 0.3728, + "step": 8983 + }, + { + "epoch": 0.26, + "grad_norm": 1.5545746714451818, + "learning_rate": 8.669531440218625e-06, + "loss": 0.3596, + "step": 8984 + }, + { + "epoch": 0.26, + "grad_norm": 1.608014568589907, + "learning_rate": 8.669212374760244e-06, + "loss": 0.3587, + "step": 8985 + }, + { + "epoch": 0.26, + "grad_norm": 1.3901353585733058, + "learning_rate": 8.66889327692102e-06, + "loss": 0.3921, + "step": 8986 + }, + { + "epoch": 0.26, + "grad_norm": 1.6560392092782337, + "learning_rate": 8.668574146703771e-06, + "loss": 0.3614, + "step": 8987 + }, + { + "epoch": 0.26, + "grad_norm": 4.47502863617103, + "learning_rate": 8.668254984111317e-06, + "loss": 0.3605, + "step": 8988 + }, + { + "epoch": 0.26, + "grad_norm": 1.803898802306155, + "learning_rate": 8.667935789146471e-06, + "loss": 0.3686, + "step": 8989 + }, + { + "epoch": 0.26, + "grad_norm": 1.4353548063262669, + "learning_rate": 8.667616561812051e-06, + "loss": 0.3583, + "step": 8990 + }, + { + "epoch": 0.26, + "grad_norm": 1.3072354583997143, + "learning_rate": 8.667297302110875e-06, + "loss": 0.3243, + "step": 8991 + }, + { + "epoch": 0.26, + "grad_norm": 1.5052753148932565, + "learning_rate": 8.666978010045758e-06, + "loss": 0.3686, + "step": 8992 + }, + { + "epoch": 0.26, + "grad_norm": 1.9034806562241002, + "learning_rate": 8.666658685619523e-06, + "loss": 0.369, + "step": 8993 + }, + { + "epoch": 0.26, + "grad_norm": 1.4749126868183926, + "learning_rate": 8.66633932883498e-06, + "loss": 0.3099, + "step": 8994 + }, + { + "epoch": 0.26, + "grad_norm": 1.283240214694567, + "learning_rate": 8.666019939694955e-06, + "loss": 0.3523, + "step": 8995 + }, + { + "epoch": 0.26, + "grad_norm": 1.365303409067601, + "learning_rate": 8.665700518202262e-06, + "loss": 0.3364, + "step": 8996 + }, + { + "epoch": 0.26, + "grad_norm": 1.4933464059115673, + "learning_rate": 8.665381064359721e-06, + "loss": 0.3662, + "step": 8997 + }, + { + "epoch": 0.26, + "grad_norm": 1.544395753466916, + "learning_rate": 8.665061578170152e-06, + "loss": 0.373, + "step": 8998 + }, + { + "epoch": 0.26, + "grad_norm": 1.3446717360131133, + "learning_rate": 8.664742059636373e-06, + "loss": 0.366, + "step": 8999 + }, + { + "epoch": 0.26, + "grad_norm": 1.3075610121707928, + "learning_rate": 8.664422508761205e-06, + "loss": 0.3571, + "step": 9000 + }, + { + "epoch": 0.26, + "grad_norm": 1.4157009121840842, + "learning_rate": 8.664102925547468e-06, + "loss": 0.3906, + "step": 9001 + }, + { + "epoch": 0.26, + "grad_norm": 1.4314610812322208, + "learning_rate": 8.663783309997982e-06, + "loss": 0.3485, + "step": 9002 + }, + { + "epoch": 0.26, + "grad_norm": 1.6081889598275294, + "learning_rate": 8.663463662115568e-06, + "loss": 0.3725, + "step": 9003 + }, + { + "epoch": 0.26, + "grad_norm": 1.6118490759331707, + "learning_rate": 8.663143981903047e-06, + "loss": 0.341, + "step": 9004 + }, + { + "epoch": 0.26, + "grad_norm": 1.5422583020272915, + "learning_rate": 8.662824269363238e-06, + "loss": 0.3593, + "step": 9005 + }, + { + "epoch": 0.26, + "grad_norm": 1.4774880766818759, + "learning_rate": 8.662504524498965e-06, + "loss": 0.3723, + "step": 9006 + }, + { + "epoch": 0.26, + "grad_norm": 2.9268170675926064, + "learning_rate": 8.66218474731305e-06, + "loss": 0.3646, + "step": 9007 + }, + { + "epoch": 0.26, + "grad_norm": 1.3094300370388219, + "learning_rate": 8.661864937808314e-06, + "loss": 0.3291, + "step": 9008 + }, + { + "epoch": 0.26, + "grad_norm": 1.6436366130922326, + "learning_rate": 8.661545095987578e-06, + "loss": 0.3743, + "step": 9009 + }, + { + "epoch": 0.26, + "grad_norm": 1.5268981121574252, + "learning_rate": 8.661225221853666e-06, + "loss": 0.3552, + "step": 9010 + }, + { + "epoch": 0.26, + "grad_norm": 1.2901129441557473, + "learning_rate": 8.660905315409402e-06, + "loss": 0.3753, + "step": 9011 + }, + { + "epoch": 0.26, + "grad_norm": 2.574576601955063, + "learning_rate": 8.660585376657607e-06, + "loss": 0.3557, + "step": 9012 + }, + { + "epoch": 0.26, + "grad_norm": 1.7124690485263914, + "learning_rate": 8.660265405601105e-06, + "loss": 0.3434, + "step": 9013 + }, + { + "epoch": 0.26, + "grad_norm": 1.6439170954793365, + "learning_rate": 8.659945402242719e-06, + "loss": 0.3751, + "step": 9014 + }, + { + "epoch": 0.26, + "grad_norm": 1.4019355079434541, + "learning_rate": 8.659625366585278e-06, + "loss": 0.3727, + "step": 9015 + }, + { + "epoch": 0.26, + "grad_norm": 1.4928233571494907, + "learning_rate": 8.6593052986316e-06, + "loss": 0.3747, + "step": 9016 + }, + { + "epoch": 0.26, + "grad_norm": 1.5874297171140561, + "learning_rate": 8.658985198384513e-06, + "loss": 0.3552, + "step": 9017 + }, + { + "epoch": 0.26, + "grad_norm": 1.4613681735787034, + "learning_rate": 8.65866506584684e-06, + "loss": 0.3477, + "step": 9018 + }, + { + "epoch": 0.26, + "grad_norm": 1.3051028098850015, + "learning_rate": 8.658344901021408e-06, + "loss": 0.3618, + "step": 9019 + }, + { + "epoch": 0.26, + "grad_norm": 2.057359649074224, + "learning_rate": 8.658024703911042e-06, + "loss": 0.3639, + "step": 9020 + }, + { + "epoch": 0.26, + "grad_norm": 1.6541881605858832, + "learning_rate": 8.657704474518566e-06, + "loss": 0.379, + "step": 9021 + }, + { + "epoch": 0.26, + "grad_norm": 1.3430332532478966, + "learning_rate": 8.657384212846809e-06, + "loss": 0.3466, + "step": 9022 + }, + { + "epoch": 0.26, + "grad_norm": 1.36540897134131, + "learning_rate": 8.657063918898595e-06, + "loss": 0.3558, + "step": 9023 + }, + { + "epoch": 0.26, + "grad_norm": 1.345355937584334, + "learning_rate": 8.65674359267675e-06, + "loss": 0.3361, + "step": 9024 + }, + { + "epoch": 0.26, + "grad_norm": 1.823939229500799, + "learning_rate": 8.656423234184105e-06, + "loss": 0.3601, + "step": 9025 + }, + { + "epoch": 0.26, + "grad_norm": 1.6360122520479878, + "learning_rate": 8.656102843423483e-06, + "loss": 0.37, + "step": 9026 + }, + { + "epoch": 0.26, + "grad_norm": 1.2867051243737626, + "learning_rate": 8.655782420397714e-06, + "loss": 0.3329, + "step": 9027 + }, + { + "epoch": 0.26, + "grad_norm": 1.6575583503247195, + "learning_rate": 8.655461965109623e-06, + "loss": 0.3578, + "step": 9028 + }, + { + "epoch": 0.26, + "grad_norm": 1.2894088267212855, + "learning_rate": 8.65514147756204e-06, + "loss": 0.3413, + "step": 9029 + }, + { + "epoch": 0.26, + "grad_norm": 1.3457477255878996, + "learning_rate": 8.654820957757792e-06, + "loss": 0.3542, + "step": 9030 + }, + { + "epoch": 0.26, + "grad_norm": 1.6652079855740056, + "learning_rate": 8.654500405699711e-06, + "loss": 0.3718, + "step": 9031 + }, + { + "epoch": 0.26, + "grad_norm": 1.399020354157621, + "learning_rate": 8.65417982139062e-06, + "loss": 0.3824, + "step": 9032 + }, + { + "epoch": 0.26, + "grad_norm": 2.175120020550393, + "learning_rate": 8.653859204833354e-06, + "loss": 0.3745, + "step": 9033 + }, + { + "epoch": 0.26, + "grad_norm": 1.3288082418958367, + "learning_rate": 8.653538556030741e-06, + "loss": 0.6001, + "step": 9034 + }, + { + "epoch": 0.26, + "grad_norm": 1.7556878937764127, + "learning_rate": 8.653217874985607e-06, + "loss": 0.3373, + "step": 9035 + }, + { + "epoch": 0.26, + "grad_norm": 1.4662644802733826, + "learning_rate": 8.652897161700787e-06, + "loss": 0.3765, + "step": 9036 + }, + { + "epoch": 0.26, + "grad_norm": 1.2912191488129283, + "learning_rate": 8.652576416179107e-06, + "loss": 0.3461, + "step": 9037 + }, + { + "epoch": 0.26, + "grad_norm": 1.4820030331940872, + "learning_rate": 8.652255638423402e-06, + "loss": 0.3623, + "step": 9038 + }, + { + "epoch": 0.26, + "grad_norm": 1.6154643335099157, + "learning_rate": 8.651934828436497e-06, + "loss": 0.3534, + "step": 9039 + }, + { + "epoch": 0.26, + "grad_norm": 1.4271078006394429, + "learning_rate": 8.651613986221229e-06, + "loss": 0.3512, + "step": 9040 + }, + { + "epoch": 0.26, + "grad_norm": 1.786434805512141, + "learning_rate": 8.651293111780428e-06, + "loss": 0.3707, + "step": 9041 + }, + { + "epoch": 0.26, + "grad_norm": 1.5522292734024459, + "learning_rate": 8.650972205116923e-06, + "loss": 0.3755, + "step": 9042 + }, + { + "epoch": 0.26, + "grad_norm": 3.0698106232345865, + "learning_rate": 8.650651266233547e-06, + "loss": 0.356, + "step": 9043 + }, + { + "epoch": 0.26, + "grad_norm": 1.4634319043177006, + "learning_rate": 8.650330295133136e-06, + "loss": 0.3456, + "step": 9044 + }, + { + "epoch": 0.26, + "grad_norm": 1.4209146749870356, + "learning_rate": 8.650009291818517e-06, + "loss": 0.3777, + "step": 9045 + }, + { + "epoch": 0.26, + "grad_norm": 1.6220194970423785, + "learning_rate": 8.649688256292527e-06, + "loss": 0.3843, + "step": 9046 + }, + { + "epoch": 0.26, + "grad_norm": 1.2981621001541117, + "learning_rate": 8.649367188557996e-06, + "loss": 0.3708, + "step": 9047 + }, + { + "epoch": 0.26, + "grad_norm": 1.4811847585251432, + "learning_rate": 8.649046088617761e-06, + "loss": 0.3537, + "step": 9048 + }, + { + "epoch": 0.26, + "grad_norm": 1.4293960841464395, + "learning_rate": 8.648724956474652e-06, + "loss": 0.38, + "step": 9049 + }, + { + "epoch": 0.26, + "grad_norm": 1.3137733549502038, + "learning_rate": 8.648403792131507e-06, + "loss": 0.3647, + "step": 9050 + }, + { + "epoch": 0.26, + "grad_norm": 2.329987988617172, + "learning_rate": 8.648082595591156e-06, + "loss": 0.3766, + "step": 9051 + }, + { + "epoch": 0.26, + "grad_norm": 1.5848686134392924, + "learning_rate": 8.647761366856438e-06, + "loss": 0.3315, + "step": 9052 + }, + { + "epoch": 0.26, + "grad_norm": 1.3886361750762986, + "learning_rate": 8.647440105930183e-06, + "loss": 0.4419, + "step": 9053 + }, + { + "epoch": 0.26, + "grad_norm": 1.4433994159078336, + "learning_rate": 8.64711881281523e-06, + "loss": 0.3329, + "step": 9054 + }, + { + "epoch": 0.26, + "grad_norm": 1.6135865809216206, + "learning_rate": 8.646797487514412e-06, + "loss": 0.3751, + "step": 9055 + }, + { + "epoch": 0.26, + "grad_norm": 1.761569578464996, + "learning_rate": 8.646476130030567e-06, + "loss": 0.3599, + "step": 9056 + }, + { + "epoch": 0.26, + "grad_norm": 1.4853002356421783, + "learning_rate": 8.646154740366528e-06, + "loss": 0.3613, + "step": 9057 + }, + { + "epoch": 0.26, + "grad_norm": 1.516635539414164, + "learning_rate": 8.645833318525134e-06, + "loss": 0.3521, + "step": 9058 + }, + { + "epoch": 0.26, + "grad_norm": 1.2965757905743396, + "learning_rate": 8.645511864509221e-06, + "loss": 0.3604, + "step": 9059 + }, + { + "epoch": 0.26, + "grad_norm": 1.4456386872266642, + "learning_rate": 8.645190378321624e-06, + "loss": 0.3525, + "step": 9060 + }, + { + "epoch": 0.26, + "grad_norm": 1.5829795693465776, + "learning_rate": 8.644868859965184e-06, + "loss": 0.3988, + "step": 9061 + }, + { + "epoch": 0.26, + "grad_norm": 1.8358243809766672, + "learning_rate": 8.644547309442734e-06, + "loss": 0.372, + "step": 9062 + }, + { + "epoch": 0.26, + "grad_norm": 1.672059466125895, + "learning_rate": 8.644225726757112e-06, + "loss": 0.3562, + "step": 9063 + }, + { + "epoch": 0.26, + "grad_norm": 1.4066509143591863, + "learning_rate": 8.64390411191116e-06, + "loss": 0.384, + "step": 9064 + }, + { + "epoch": 0.26, + "grad_norm": 1.735564812993709, + "learning_rate": 8.643582464907713e-06, + "loss": 0.369, + "step": 9065 + }, + { + "epoch": 0.26, + "grad_norm": 1.5367756651951958, + "learning_rate": 8.64326078574961e-06, + "loss": 0.3605, + "step": 9066 + }, + { + "epoch": 0.26, + "grad_norm": 1.2940673940210314, + "learning_rate": 8.642939074439691e-06, + "loss": 0.3451, + "step": 9067 + }, + { + "epoch": 0.26, + "grad_norm": 1.8939194292680384, + "learning_rate": 8.642617330980793e-06, + "loss": 0.3761, + "step": 9068 + }, + { + "epoch": 0.26, + "grad_norm": 0.9918109707589915, + "learning_rate": 8.64229555537576e-06, + "loss": 0.5862, + "step": 9069 + }, + { + "epoch": 0.26, + "grad_norm": 1.7069352862120972, + "learning_rate": 8.641973747627423e-06, + "loss": 0.3607, + "step": 9070 + }, + { + "epoch": 0.26, + "grad_norm": 1.7860267885496077, + "learning_rate": 8.641651907738631e-06, + "loss": 0.3348, + "step": 9071 + }, + { + "epoch": 0.26, + "grad_norm": 2.0067505721402377, + "learning_rate": 8.641330035712219e-06, + "loss": 0.3434, + "step": 9072 + }, + { + "epoch": 0.26, + "grad_norm": 1.3321182745712044, + "learning_rate": 8.64100813155103e-06, + "loss": 0.3571, + "step": 9073 + }, + { + "epoch": 0.26, + "grad_norm": 1.469847795383522, + "learning_rate": 8.640686195257903e-06, + "loss": 0.3596, + "step": 9074 + }, + { + "epoch": 0.26, + "grad_norm": 1.7325999327026083, + "learning_rate": 8.64036422683568e-06, + "loss": 0.3776, + "step": 9075 + }, + { + "epoch": 0.26, + "grad_norm": 1.3797965312768272, + "learning_rate": 8.640042226287203e-06, + "loss": 0.3903, + "step": 9076 + }, + { + "epoch": 0.26, + "grad_norm": 1.642903795121439, + "learning_rate": 8.639720193615312e-06, + "loss": 0.35, + "step": 9077 + }, + { + "epoch": 0.26, + "grad_norm": 1.0025426306626197, + "learning_rate": 8.639398128822853e-06, + "loss": 0.5785, + "step": 9078 + }, + { + "epoch": 0.26, + "grad_norm": 1.5430975535930538, + "learning_rate": 8.63907603191266e-06, + "loss": 0.363, + "step": 9079 + }, + { + "epoch": 0.26, + "grad_norm": 1.6815492661872067, + "learning_rate": 8.638753902887585e-06, + "loss": 0.3511, + "step": 9080 + }, + { + "epoch": 0.26, + "grad_norm": 1.361983277301449, + "learning_rate": 8.638431741750465e-06, + "loss": 0.33, + "step": 9081 + }, + { + "epoch": 0.26, + "grad_norm": 1.6804617930583368, + "learning_rate": 8.638109548504145e-06, + "loss": 0.355, + "step": 9082 + }, + { + "epoch": 0.26, + "grad_norm": 1.4510810948181567, + "learning_rate": 8.637787323151467e-06, + "loss": 0.3642, + "step": 9083 + }, + { + "epoch": 0.26, + "grad_norm": 1.3469338640746087, + "learning_rate": 8.637465065695275e-06, + "loss": 0.3336, + "step": 9084 + }, + { + "epoch": 0.26, + "grad_norm": 1.4842052041360547, + "learning_rate": 8.637142776138415e-06, + "loss": 0.3629, + "step": 9085 + }, + { + "epoch": 0.26, + "grad_norm": 1.4307215490316842, + "learning_rate": 8.636820454483727e-06, + "loss": 0.3379, + "step": 9086 + }, + { + "epoch": 0.26, + "grad_norm": 1.3564323703245218, + "learning_rate": 8.636498100734061e-06, + "loss": 0.3675, + "step": 9087 + }, + { + "epoch": 0.26, + "grad_norm": 1.4759362121049593, + "learning_rate": 8.636175714892257e-06, + "loss": 0.3648, + "step": 9088 + }, + { + "epoch": 0.26, + "grad_norm": 1.4220058137657028, + "learning_rate": 8.635853296961164e-06, + "loss": 0.366, + "step": 9089 + }, + { + "epoch": 0.26, + "grad_norm": 1.3399238716259603, + "learning_rate": 8.635530846943624e-06, + "loss": 0.3454, + "step": 9090 + }, + { + "epoch": 0.26, + "grad_norm": 1.4551360045725814, + "learning_rate": 8.635208364842485e-06, + "loss": 0.356, + "step": 9091 + }, + { + "epoch": 0.26, + "grad_norm": 1.419371740652954, + "learning_rate": 8.634885850660591e-06, + "loss": 0.3513, + "step": 9092 + }, + { + "epoch": 0.26, + "grad_norm": 1.3876181603671465, + "learning_rate": 8.634563304400787e-06, + "loss": 0.31, + "step": 9093 + }, + { + "epoch": 0.26, + "grad_norm": 1.3365765082231937, + "learning_rate": 8.634240726065925e-06, + "loss": 0.357, + "step": 9094 + }, + { + "epoch": 0.26, + "grad_norm": 1.4545721963826548, + "learning_rate": 8.633918115658845e-06, + "loss": 0.3307, + "step": 9095 + }, + { + "epoch": 0.26, + "grad_norm": 2.689961880848844, + "learning_rate": 8.6335954731824e-06, + "loss": 0.3623, + "step": 9096 + }, + { + "epoch": 0.26, + "grad_norm": 1.2960277901827002, + "learning_rate": 8.633272798639433e-06, + "loss": 0.3637, + "step": 9097 + }, + { + "epoch": 0.26, + "grad_norm": 1.7762055484561472, + "learning_rate": 8.632950092032796e-06, + "loss": 0.3881, + "step": 9098 + }, + { + "epoch": 0.26, + "grad_norm": 1.4851085889800002, + "learning_rate": 8.632627353365332e-06, + "loss": 0.3614, + "step": 9099 + }, + { + "epoch": 0.26, + "grad_norm": 1.6881184234100481, + "learning_rate": 8.632304582639891e-06, + "loss": 0.3538, + "step": 9100 + }, + { + "epoch": 0.26, + "grad_norm": 1.5384798836548657, + "learning_rate": 8.631981779859322e-06, + "loss": 0.359, + "step": 9101 + }, + { + "epoch": 0.26, + "grad_norm": 1.470765498365647, + "learning_rate": 8.631658945026473e-06, + "loss": 0.3627, + "step": 9102 + }, + { + "epoch": 0.26, + "grad_norm": 1.5220746294137786, + "learning_rate": 8.631336078144195e-06, + "loss": 0.3716, + "step": 9103 + }, + { + "epoch": 0.26, + "grad_norm": 1.432337644332657, + "learning_rate": 8.631013179215336e-06, + "loss": 0.3449, + "step": 9104 + }, + { + "epoch": 0.26, + "grad_norm": 1.5075688481588767, + "learning_rate": 8.630690248242744e-06, + "loss": 0.3675, + "step": 9105 + }, + { + "epoch": 0.26, + "grad_norm": 1.3667269336080077, + "learning_rate": 8.630367285229271e-06, + "loss": 0.3546, + "step": 9106 + }, + { + "epoch": 0.26, + "grad_norm": 1.8478423341772754, + "learning_rate": 8.630044290177766e-06, + "loss": 0.3331, + "step": 9107 + }, + { + "epoch": 0.26, + "grad_norm": 1.4971893348768608, + "learning_rate": 8.62972126309108e-06, + "loss": 0.3683, + "step": 9108 + }, + { + "epoch": 0.26, + "grad_norm": 2.3472456382515228, + "learning_rate": 8.629398203972063e-06, + "loss": 0.3829, + "step": 9109 + }, + { + "epoch": 0.26, + "grad_norm": 1.4399631347301867, + "learning_rate": 8.629075112823567e-06, + "loss": 0.3653, + "step": 9110 + }, + { + "epoch": 0.26, + "grad_norm": 1.3239209906033016, + "learning_rate": 8.628751989648444e-06, + "loss": 0.3583, + "step": 9111 + }, + { + "epoch": 0.26, + "grad_norm": 1.4438855292117445, + "learning_rate": 8.628428834449543e-06, + "loss": 0.3644, + "step": 9112 + }, + { + "epoch": 0.26, + "grad_norm": 1.5005832521644162, + "learning_rate": 8.628105647229718e-06, + "loss": 0.3493, + "step": 9113 + }, + { + "epoch": 0.26, + "grad_norm": 1.518162156645259, + "learning_rate": 8.62778242799182e-06, + "loss": 0.3882, + "step": 9114 + }, + { + "epoch": 0.26, + "grad_norm": 1.6470718530661603, + "learning_rate": 8.627459176738701e-06, + "loss": 0.365, + "step": 9115 + }, + { + "epoch": 0.26, + "grad_norm": 1.3069872533220515, + "learning_rate": 8.627135893473217e-06, + "loss": 0.3724, + "step": 9116 + }, + { + "epoch": 0.26, + "grad_norm": 1.5868462677155195, + "learning_rate": 8.626812578198217e-06, + "loss": 0.3728, + "step": 9117 + }, + { + "epoch": 0.26, + "grad_norm": 1.4646265687360418, + "learning_rate": 8.626489230916556e-06, + "loss": 0.3525, + "step": 9118 + }, + { + "epoch": 0.26, + "grad_norm": 1.3977085414394597, + "learning_rate": 8.626165851631087e-06, + "loss": 0.3677, + "step": 9119 + }, + { + "epoch": 0.26, + "grad_norm": 1.5883014757358809, + "learning_rate": 8.625842440344665e-06, + "loss": 0.3499, + "step": 9120 + }, + { + "epoch": 0.26, + "grad_norm": 7.363849289398156, + "learning_rate": 8.625518997060143e-06, + "loss": 0.3759, + "step": 9121 + }, + { + "epoch": 0.26, + "grad_norm": 1.4201440431518475, + "learning_rate": 8.625195521780374e-06, + "loss": 0.3639, + "step": 9122 + }, + { + "epoch": 0.26, + "grad_norm": 1.5337616055318404, + "learning_rate": 8.624872014508217e-06, + "loss": 0.3614, + "step": 9123 + }, + { + "epoch": 0.26, + "grad_norm": 1.5140089733282747, + "learning_rate": 8.624548475246522e-06, + "loss": 0.369, + "step": 9124 + }, + { + "epoch": 0.26, + "grad_norm": 2.0799551715314917, + "learning_rate": 8.624224903998146e-06, + "loss": 0.3785, + "step": 9125 + }, + { + "epoch": 0.26, + "grad_norm": 0.9691036358511357, + "learning_rate": 8.623901300765946e-06, + "loss": 0.5813, + "step": 9126 + }, + { + "epoch": 0.26, + "grad_norm": 1.716267770161286, + "learning_rate": 8.623577665552778e-06, + "loss": 0.3413, + "step": 9127 + }, + { + "epoch": 0.26, + "grad_norm": 1.5791763763875029, + "learning_rate": 8.623253998361496e-06, + "loss": 0.3684, + "step": 9128 + }, + { + "epoch": 0.26, + "grad_norm": 1.5413007336734228, + "learning_rate": 8.622930299194956e-06, + "loss": 0.3552, + "step": 9129 + }, + { + "epoch": 0.26, + "grad_norm": 1.4102233396882549, + "learning_rate": 8.622606568056018e-06, + "loss": 0.346, + "step": 9130 + }, + { + "epoch": 0.26, + "grad_norm": 2.3542201329251253, + "learning_rate": 8.622282804947537e-06, + "loss": 0.3613, + "step": 9131 + }, + { + "epoch": 0.26, + "grad_norm": 1.325111563058854, + "learning_rate": 8.62195900987237e-06, + "loss": 0.3361, + "step": 9132 + }, + { + "epoch": 0.26, + "grad_norm": 1.4186512707495798, + "learning_rate": 8.621635182833373e-06, + "loss": 0.3472, + "step": 9133 + }, + { + "epoch": 0.26, + "grad_norm": 1.6346244719494651, + "learning_rate": 8.621311323833406e-06, + "loss": 0.3673, + "step": 9134 + }, + { + "epoch": 0.26, + "grad_norm": 1.580466470810993, + "learning_rate": 8.620987432875326e-06, + "loss": 0.3498, + "step": 9135 + }, + { + "epoch": 0.26, + "grad_norm": 1.9386353822792501, + "learning_rate": 8.620663509961992e-06, + "loss": 0.3586, + "step": 9136 + }, + { + "epoch": 0.27, + "grad_norm": 1.8349388761292402, + "learning_rate": 8.620339555096262e-06, + "loss": 0.3407, + "step": 9137 + }, + { + "epoch": 0.27, + "grad_norm": 1.705190713240775, + "learning_rate": 8.620015568280996e-06, + "loss": 0.4017, + "step": 9138 + }, + { + "epoch": 0.27, + "grad_norm": 1.5261083388713748, + "learning_rate": 8.619691549519052e-06, + "loss": 0.3508, + "step": 9139 + }, + { + "epoch": 0.27, + "grad_norm": 1.8065163150517103, + "learning_rate": 8.61936749881329e-06, + "loss": 0.4045, + "step": 9140 + }, + { + "epoch": 0.27, + "grad_norm": 3.5742706815145606, + "learning_rate": 8.61904341616657e-06, + "loss": 0.3606, + "step": 9141 + }, + { + "epoch": 0.27, + "grad_norm": 1.4889363604551336, + "learning_rate": 8.618719301581752e-06, + "loss": 0.3524, + "step": 9142 + }, + { + "epoch": 0.27, + "grad_norm": 1.4594871920541845, + "learning_rate": 8.618395155061695e-06, + "loss": 0.3832, + "step": 9143 + }, + { + "epoch": 0.27, + "grad_norm": 1.3869990359715092, + "learning_rate": 8.61807097660926e-06, + "loss": 0.3676, + "step": 9144 + }, + { + "epoch": 0.27, + "grad_norm": 1.5876724705690308, + "learning_rate": 8.61774676622731e-06, + "loss": 0.3536, + "step": 9145 + }, + { + "epoch": 0.27, + "grad_norm": 1.601017573254863, + "learning_rate": 8.617422523918704e-06, + "loss": 0.3704, + "step": 9146 + }, + { + "epoch": 0.27, + "grad_norm": 1.39729332201378, + "learning_rate": 8.617098249686306e-06, + "loss": 0.3944, + "step": 9147 + }, + { + "epoch": 0.27, + "grad_norm": 1.5540345389611034, + "learning_rate": 8.616773943532975e-06, + "loss": 0.3654, + "step": 9148 + }, + { + "epoch": 0.27, + "grad_norm": 1.5138345040215901, + "learning_rate": 8.616449605461572e-06, + "loss": 0.3521, + "step": 9149 + }, + { + "epoch": 0.27, + "grad_norm": 1.6670846594286088, + "learning_rate": 8.616125235474962e-06, + "loss": 0.3762, + "step": 9150 + }, + { + "epoch": 0.27, + "grad_norm": 1.9239308360889045, + "learning_rate": 8.615800833576008e-06, + "loss": 0.3772, + "step": 9151 + }, + { + "epoch": 0.27, + "grad_norm": 1.458288687349678, + "learning_rate": 8.615476399767569e-06, + "loss": 0.3766, + "step": 9152 + }, + { + "epoch": 0.27, + "grad_norm": 1.3460742847390756, + "learning_rate": 8.615151934052512e-06, + "loss": 0.3578, + "step": 9153 + }, + { + "epoch": 0.27, + "grad_norm": 1.5995112931440882, + "learning_rate": 8.614827436433699e-06, + "loss": 0.3605, + "step": 9154 + }, + { + "epoch": 0.27, + "grad_norm": 1.8443741256526123, + "learning_rate": 8.614502906913995e-06, + "loss": 0.3512, + "step": 9155 + }, + { + "epoch": 0.27, + "grad_norm": 1.8875478325955126, + "learning_rate": 8.61417834549626e-06, + "loss": 0.3732, + "step": 9156 + }, + { + "epoch": 0.27, + "grad_norm": 1.4932794205043394, + "learning_rate": 8.613853752183364e-06, + "loss": 0.3434, + "step": 9157 + }, + { + "epoch": 0.27, + "grad_norm": 1.4249449179066098, + "learning_rate": 8.613529126978166e-06, + "loss": 0.3522, + "step": 9158 + }, + { + "epoch": 0.27, + "grad_norm": 3.263451522554585, + "learning_rate": 8.613204469883535e-06, + "loss": 0.3416, + "step": 9159 + }, + { + "epoch": 0.27, + "grad_norm": 3.7455093445709244, + "learning_rate": 8.612879780902333e-06, + "loss": 0.3801, + "step": 9160 + }, + { + "epoch": 0.27, + "grad_norm": 1.855752599583426, + "learning_rate": 8.612555060037429e-06, + "loss": 0.3705, + "step": 9161 + }, + { + "epoch": 0.27, + "grad_norm": 1.6067476601620112, + "learning_rate": 8.612230307291685e-06, + "loss": 0.3294, + "step": 9162 + }, + { + "epoch": 0.27, + "grad_norm": 1.4743573072227139, + "learning_rate": 8.611905522667968e-06, + "loss": 0.3837, + "step": 9163 + }, + { + "epoch": 0.27, + "grad_norm": 1.9487860246957003, + "learning_rate": 8.611580706169147e-06, + "loss": 0.3491, + "step": 9164 + }, + { + "epoch": 0.27, + "grad_norm": 1.6716559083337272, + "learning_rate": 8.611255857798084e-06, + "loss": 0.3346, + "step": 9165 + }, + { + "epoch": 0.27, + "grad_norm": 1.6849662494574005, + "learning_rate": 8.61093097755765e-06, + "loss": 0.4025, + "step": 9166 + }, + { + "epoch": 0.27, + "grad_norm": 1.498476049445247, + "learning_rate": 8.610606065450707e-06, + "loss": 0.338, + "step": 9167 + }, + { + "epoch": 0.27, + "grad_norm": 3.8356865463791348, + "learning_rate": 8.610281121480128e-06, + "loss": 0.3712, + "step": 9168 + }, + { + "epoch": 0.27, + "grad_norm": 1.4510887611903225, + "learning_rate": 8.609956145648777e-06, + "loss": 0.363, + "step": 9169 + }, + { + "epoch": 0.27, + "grad_norm": 1.7348174453372474, + "learning_rate": 8.609631137959523e-06, + "loss": 0.3493, + "step": 9170 + }, + { + "epoch": 0.27, + "grad_norm": 1.852690982818107, + "learning_rate": 8.609306098415236e-06, + "loss": 0.3865, + "step": 9171 + }, + { + "epoch": 0.27, + "grad_norm": 1.6533236583972828, + "learning_rate": 8.608981027018779e-06, + "loss": 0.3924, + "step": 9172 + }, + { + "epoch": 0.27, + "grad_norm": 1.4341488314567519, + "learning_rate": 8.608655923773027e-06, + "loss": 0.3459, + "step": 9173 + }, + { + "epoch": 0.27, + "grad_norm": 1.4780635472587875, + "learning_rate": 8.608330788680845e-06, + "loss": 0.3391, + "step": 9174 + }, + { + "epoch": 0.27, + "grad_norm": 2.8586501324442106, + "learning_rate": 8.608005621745106e-06, + "loss": 0.3507, + "step": 9175 + }, + { + "epoch": 0.27, + "grad_norm": 1.6341476907245605, + "learning_rate": 8.607680422968676e-06, + "loss": 0.3238, + "step": 9176 + }, + { + "epoch": 0.27, + "grad_norm": 2.7867415102854944, + "learning_rate": 8.607355192354425e-06, + "loss": 0.3598, + "step": 9177 + }, + { + "epoch": 0.27, + "grad_norm": 1.5441067008930838, + "learning_rate": 8.607029929905226e-06, + "loss": 0.3576, + "step": 9178 + }, + { + "epoch": 0.27, + "grad_norm": 2.3945057457475776, + "learning_rate": 8.606704635623948e-06, + "loss": 0.3664, + "step": 9179 + }, + { + "epoch": 0.27, + "grad_norm": 1.7489090027013858, + "learning_rate": 8.606379309513463e-06, + "loss": 0.3383, + "step": 9180 + }, + { + "epoch": 0.27, + "grad_norm": 1.3793286469193826, + "learning_rate": 8.606053951576635e-06, + "loss": 0.3777, + "step": 9181 + }, + { + "epoch": 0.27, + "grad_norm": 4.38140762555986, + "learning_rate": 8.605728561816347e-06, + "loss": 0.3581, + "step": 9182 + }, + { + "epoch": 0.27, + "grad_norm": 1.7864075541588103, + "learning_rate": 8.605403140235461e-06, + "loss": 0.3557, + "step": 9183 + }, + { + "epoch": 0.27, + "grad_norm": 1.3663704726074417, + "learning_rate": 8.605077686836854e-06, + "loss": 0.3331, + "step": 9184 + }, + { + "epoch": 0.27, + "grad_norm": 1.769847130333031, + "learning_rate": 8.604752201623396e-06, + "loss": 0.3784, + "step": 9185 + }, + { + "epoch": 0.27, + "grad_norm": 5.009424421152143, + "learning_rate": 8.604426684597959e-06, + "loss": 0.3523, + "step": 9186 + }, + { + "epoch": 0.27, + "grad_norm": 1.836295711416256, + "learning_rate": 8.604101135763417e-06, + "loss": 0.3625, + "step": 9187 + }, + { + "epoch": 0.27, + "grad_norm": 2.1348658587262417, + "learning_rate": 8.603775555122643e-06, + "loss": 0.3654, + "step": 9188 + }, + { + "epoch": 0.27, + "grad_norm": 1.7968943191302096, + "learning_rate": 8.603449942678509e-06, + "loss": 0.3773, + "step": 9189 + }, + { + "epoch": 0.27, + "grad_norm": 3.7619599505256867, + "learning_rate": 8.60312429843389e-06, + "loss": 0.3479, + "step": 9190 + }, + { + "epoch": 0.27, + "grad_norm": 3.889015063458807, + "learning_rate": 8.602798622391657e-06, + "loss": 0.3588, + "step": 9191 + }, + { + "epoch": 0.27, + "grad_norm": 1.802706043432451, + "learning_rate": 8.60247291455469e-06, + "loss": 0.3427, + "step": 9192 + }, + { + "epoch": 0.27, + "grad_norm": 1.6503971243043811, + "learning_rate": 8.602147174925857e-06, + "loss": 0.3781, + "step": 9193 + }, + { + "epoch": 0.27, + "grad_norm": 1.4190943583333016, + "learning_rate": 8.601821403508034e-06, + "loss": 0.3524, + "step": 9194 + }, + { + "epoch": 0.27, + "grad_norm": 1.6328182690885533, + "learning_rate": 8.6014956003041e-06, + "loss": 0.3543, + "step": 9195 + }, + { + "epoch": 0.27, + "grad_norm": 2.764010744579747, + "learning_rate": 8.601169765316926e-06, + "loss": 0.3381, + "step": 9196 + }, + { + "epoch": 0.27, + "grad_norm": 1.5186166325206385, + "learning_rate": 8.60084389854939e-06, + "loss": 0.3325, + "step": 9197 + }, + { + "epoch": 0.27, + "grad_norm": 2.2001209216353352, + "learning_rate": 8.600518000004365e-06, + "loss": 0.355, + "step": 9198 + }, + { + "epoch": 0.27, + "grad_norm": 2.9945890930559065, + "learning_rate": 8.600192069684729e-06, + "loss": 0.3486, + "step": 9199 + }, + { + "epoch": 0.27, + "grad_norm": 1.5333210482889161, + "learning_rate": 8.599866107593358e-06, + "loss": 0.3669, + "step": 9200 + }, + { + "epoch": 0.27, + "grad_norm": 1.4314960841049575, + "learning_rate": 8.599540113733129e-06, + "loss": 0.3506, + "step": 9201 + }, + { + "epoch": 0.27, + "grad_norm": 3.018456721635327, + "learning_rate": 8.59921408810692e-06, + "loss": 0.3728, + "step": 9202 + }, + { + "epoch": 0.27, + "grad_norm": 2.010966543278546, + "learning_rate": 8.598888030717605e-06, + "loss": 0.3564, + "step": 9203 + }, + { + "epoch": 0.27, + "grad_norm": 1.6283499876977416, + "learning_rate": 8.598561941568064e-06, + "loss": 0.3661, + "step": 9204 + }, + { + "epoch": 0.27, + "grad_norm": 2.0106101457031063, + "learning_rate": 8.598235820661174e-06, + "loss": 0.3718, + "step": 9205 + }, + { + "epoch": 0.27, + "grad_norm": 3.869528676878295, + "learning_rate": 8.59790966799981e-06, + "loss": 0.376, + "step": 9206 + }, + { + "epoch": 0.27, + "grad_norm": 1.467120515244656, + "learning_rate": 8.597583483586858e-06, + "loss": 0.363, + "step": 9207 + }, + { + "epoch": 0.27, + "grad_norm": 1.5175590634304765, + "learning_rate": 8.59725726742519e-06, + "loss": 0.3352, + "step": 9208 + }, + { + "epoch": 0.27, + "grad_norm": 1.8898486521760256, + "learning_rate": 8.596931019517684e-06, + "loss": 0.3427, + "step": 9209 + }, + { + "epoch": 0.27, + "grad_norm": 1.4786116716488487, + "learning_rate": 8.596604739867226e-06, + "loss": 0.3324, + "step": 9210 + }, + { + "epoch": 0.27, + "grad_norm": 1.4884246763339142, + "learning_rate": 8.59627842847669e-06, + "loss": 0.3486, + "step": 9211 + }, + { + "epoch": 0.27, + "grad_norm": 1.836910611900521, + "learning_rate": 8.595952085348955e-06, + "loss": 0.3529, + "step": 9212 + }, + { + "epoch": 0.27, + "grad_norm": 1.7325596715907459, + "learning_rate": 8.595625710486906e-06, + "loss": 0.338, + "step": 9213 + }, + { + "epoch": 0.27, + "grad_norm": 1.5077812532563986, + "learning_rate": 8.595299303893418e-06, + "loss": 0.3577, + "step": 9214 + }, + { + "epoch": 0.27, + "grad_norm": 1.5077272365973953, + "learning_rate": 8.594972865571375e-06, + "loss": 0.3318, + "step": 9215 + }, + { + "epoch": 0.27, + "grad_norm": 3.2591464522349822, + "learning_rate": 8.594646395523656e-06, + "loss": 0.3551, + "step": 9216 + }, + { + "epoch": 0.27, + "grad_norm": 1.5709774819591389, + "learning_rate": 8.594319893753142e-06, + "loss": 0.3444, + "step": 9217 + }, + { + "epoch": 0.27, + "grad_norm": 1.4825216521011406, + "learning_rate": 8.593993360262716e-06, + "loss": 0.3398, + "step": 9218 + }, + { + "epoch": 0.27, + "grad_norm": 1.0869347999234278, + "learning_rate": 8.593666795055257e-06, + "loss": 0.649, + "step": 9219 + }, + { + "epoch": 0.27, + "grad_norm": 1.7201390163650376, + "learning_rate": 8.59334019813365e-06, + "loss": 0.3551, + "step": 9220 + }, + { + "epoch": 0.27, + "grad_norm": 1.407574673431708, + "learning_rate": 8.593013569500777e-06, + "loss": 0.3415, + "step": 9221 + }, + { + "epoch": 0.27, + "grad_norm": 1.6607672980028205, + "learning_rate": 8.592686909159517e-06, + "loss": 0.3746, + "step": 9222 + }, + { + "epoch": 0.27, + "grad_norm": 0.9777181499207626, + "learning_rate": 8.592360217112759e-06, + "loss": 0.6438, + "step": 9223 + }, + { + "epoch": 0.27, + "grad_norm": 1.4458639997604124, + "learning_rate": 8.592033493363378e-06, + "loss": 0.3557, + "step": 9224 + }, + { + "epoch": 0.27, + "grad_norm": 1.7184302745457831, + "learning_rate": 8.591706737914264e-06, + "loss": 0.3219, + "step": 9225 + }, + { + "epoch": 0.27, + "grad_norm": 1.584466672804748, + "learning_rate": 8.591379950768298e-06, + "loss": 0.386, + "step": 9226 + }, + { + "epoch": 0.27, + "grad_norm": 1.8527649093524028, + "learning_rate": 8.591053131928364e-06, + "loss": 0.3602, + "step": 9227 + }, + { + "epoch": 0.27, + "grad_norm": 1.9876169833493844, + "learning_rate": 8.590726281397345e-06, + "loss": 0.3488, + "step": 9228 + }, + { + "epoch": 0.27, + "grad_norm": 1.5601807767145721, + "learning_rate": 8.59039939917813e-06, + "loss": 0.3516, + "step": 9229 + }, + { + "epoch": 0.27, + "grad_norm": 1.5369981383195097, + "learning_rate": 8.590072485273596e-06, + "loss": 0.3339, + "step": 9230 + }, + { + "epoch": 0.27, + "grad_norm": 1.4584032807692382, + "learning_rate": 8.589745539686636e-06, + "loss": 0.3353, + "step": 9231 + }, + { + "epoch": 0.27, + "grad_norm": 1.6760846637654536, + "learning_rate": 8.589418562420128e-06, + "loss": 0.3433, + "step": 9232 + }, + { + "epoch": 0.27, + "grad_norm": 1.9979434063607964, + "learning_rate": 8.589091553476965e-06, + "loss": 0.3554, + "step": 9233 + }, + { + "epoch": 0.27, + "grad_norm": 1.5403087617824487, + "learning_rate": 8.588764512860028e-06, + "loss": 0.3652, + "step": 9234 + }, + { + "epoch": 0.27, + "grad_norm": 1.8814367409094734, + "learning_rate": 8.588437440572206e-06, + "loss": 0.3644, + "step": 9235 + }, + { + "epoch": 0.27, + "grad_norm": 1.8491163429662054, + "learning_rate": 8.58811033661638e-06, + "loss": 0.3722, + "step": 9236 + }, + { + "epoch": 0.27, + "grad_norm": 2.8273771729366444, + "learning_rate": 8.587783200995443e-06, + "loss": 0.3781, + "step": 9237 + }, + { + "epoch": 0.27, + "grad_norm": 1.655701997899344, + "learning_rate": 8.58745603371228e-06, + "loss": 0.3497, + "step": 9238 + }, + { + "epoch": 0.27, + "grad_norm": 4.170276091085929, + "learning_rate": 8.587128834769776e-06, + "loss": 0.3855, + "step": 9239 + }, + { + "epoch": 0.27, + "grad_norm": 1.491687892535382, + "learning_rate": 8.586801604170822e-06, + "loss": 0.3493, + "step": 9240 + }, + { + "epoch": 0.27, + "grad_norm": 1.1028675829370609, + "learning_rate": 8.586474341918302e-06, + "loss": 0.5693, + "step": 9241 + }, + { + "epoch": 0.27, + "grad_norm": 1.9639053209906026, + "learning_rate": 8.586147048015107e-06, + "loss": 0.3522, + "step": 9242 + }, + { + "epoch": 0.27, + "grad_norm": 1.4771023718107519, + "learning_rate": 8.585819722464127e-06, + "loss": 0.3646, + "step": 9243 + }, + { + "epoch": 0.27, + "grad_norm": 1.5618030752442222, + "learning_rate": 8.585492365268245e-06, + "loss": 0.3562, + "step": 9244 + }, + { + "epoch": 0.27, + "grad_norm": 1.6829809019044224, + "learning_rate": 8.585164976430354e-06, + "loss": 0.3616, + "step": 9245 + }, + { + "epoch": 0.27, + "grad_norm": 1.976233354374641, + "learning_rate": 8.584837555953342e-06, + "loss": 0.3518, + "step": 9246 + }, + { + "epoch": 0.27, + "grad_norm": 1.534573882660409, + "learning_rate": 8.5845101038401e-06, + "loss": 0.3216, + "step": 9247 + }, + { + "epoch": 0.27, + "grad_norm": 1.6009691993932034, + "learning_rate": 8.584182620093516e-06, + "loss": 0.333, + "step": 9248 + }, + { + "epoch": 0.27, + "grad_norm": 1.675840665018952, + "learning_rate": 8.583855104716479e-06, + "loss": 0.3383, + "step": 9249 + }, + { + "epoch": 0.27, + "grad_norm": 1.8300901410964787, + "learning_rate": 8.583527557711882e-06, + "loss": 0.3484, + "step": 9250 + }, + { + "epoch": 0.27, + "grad_norm": 1.5557499076872998, + "learning_rate": 8.583199979082616e-06, + "loss": 0.3533, + "step": 9251 + }, + { + "epoch": 0.27, + "grad_norm": 1.4286489191702891, + "learning_rate": 8.582872368831568e-06, + "loss": 0.3403, + "step": 9252 + }, + { + "epoch": 0.27, + "grad_norm": 1.7080915129472585, + "learning_rate": 8.582544726961634e-06, + "loss": 0.3756, + "step": 9253 + }, + { + "epoch": 0.27, + "grad_norm": 1.363235619352683, + "learning_rate": 8.582217053475701e-06, + "loss": 0.344, + "step": 9254 + }, + { + "epoch": 0.27, + "grad_norm": 3.3802870517816506, + "learning_rate": 8.581889348376664e-06, + "loss": 0.3614, + "step": 9255 + }, + { + "epoch": 0.27, + "grad_norm": 1.9407773577944534, + "learning_rate": 8.581561611667414e-06, + "loss": 0.3326, + "step": 9256 + }, + { + "epoch": 0.27, + "grad_norm": 1.5649764260166392, + "learning_rate": 8.581233843350841e-06, + "loss": 0.3595, + "step": 9257 + }, + { + "epoch": 0.27, + "grad_norm": 1.4642633604096313, + "learning_rate": 8.580906043429842e-06, + "loss": 0.3385, + "step": 9258 + }, + { + "epoch": 0.27, + "grad_norm": 1.7015338438587202, + "learning_rate": 8.580578211907306e-06, + "loss": 0.368, + "step": 9259 + }, + { + "epoch": 0.27, + "grad_norm": 1.2993970804463628, + "learning_rate": 8.580250348786128e-06, + "loss": 0.3215, + "step": 9260 + }, + { + "epoch": 0.27, + "grad_norm": 1.7070386727473237, + "learning_rate": 8.5799224540692e-06, + "loss": 0.3555, + "step": 9261 + }, + { + "epoch": 0.27, + "grad_norm": 1.08657725531263, + "learning_rate": 8.579594527759418e-06, + "loss": 0.5872, + "step": 9262 + }, + { + "epoch": 0.27, + "grad_norm": 3.129043023689008, + "learning_rate": 8.579266569859675e-06, + "loss": 0.3431, + "step": 9263 + }, + { + "epoch": 0.27, + "grad_norm": 2.0204036058706305, + "learning_rate": 8.578938580372865e-06, + "loss": 0.3379, + "step": 9264 + }, + { + "epoch": 0.27, + "grad_norm": 1.3480323067531652, + "learning_rate": 8.578610559301881e-06, + "loss": 0.3496, + "step": 9265 + }, + { + "epoch": 0.27, + "grad_norm": 1.6205281365958804, + "learning_rate": 8.57828250664962e-06, + "loss": 0.3766, + "step": 9266 + }, + { + "epoch": 0.27, + "grad_norm": 1.5420926828897554, + "learning_rate": 8.577954422418975e-06, + "loss": 0.3674, + "step": 9267 + }, + { + "epoch": 0.27, + "grad_norm": 1.4266829139063464, + "learning_rate": 8.577626306612842e-06, + "loss": 0.3548, + "step": 9268 + }, + { + "epoch": 0.27, + "grad_norm": 2.288970712656842, + "learning_rate": 8.57729815923412e-06, + "loss": 0.3269, + "step": 9269 + }, + { + "epoch": 0.27, + "grad_norm": 1.578955483571764, + "learning_rate": 8.5769699802857e-06, + "loss": 0.3511, + "step": 9270 + }, + { + "epoch": 0.27, + "grad_norm": 1.5892992169139912, + "learning_rate": 8.57664176977048e-06, + "loss": 0.3802, + "step": 9271 + }, + { + "epoch": 0.27, + "grad_norm": 1.5465083814693263, + "learning_rate": 8.576313527691358e-06, + "loss": 0.3695, + "step": 9272 + }, + { + "epoch": 0.27, + "grad_norm": 1.9400300603202631, + "learning_rate": 8.575985254051228e-06, + "loss": 0.3654, + "step": 9273 + }, + { + "epoch": 0.27, + "grad_norm": 2.9278677228710346, + "learning_rate": 8.57565694885299e-06, + "loss": 0.3782, + "step": 9274 + }, + { + "epoch": 0.27, + "grad_norm": 1.428549504109866, + "learning_rate": 8.575328612099539e-06, + "loss": 0.357, + "step": 9275 + }, + { + "epoch": 0.27, + "grad_norm": 1.5932370271437726, + "learning_rate": 8.575000243793773e-06, + "loss": 0.363, + "step": 9276 + }, + { + "epoch": 0.27, + "grad_norm": 1.65596521594348, + "learning_rate": 8.574671843938592e-06, + "loss": 0.3418, + "step": 9277 + }, + { + "epoch": 0.27, + "grad_norm": 1.4212059700565625, + "learning_rate": 8.57434341253689e-06, + "loss": 0.3348, + "step": 9278 + }, + { + "epoch": 0.27, + "grad_norm": 1.6539555360371776, + "learning_rate": 8.574014949591567e-06, + "loss": 0.355, + "step": 9279 + }, + { + "epoch": 0.27, + "grad_norm": 1.637146551153646, + "learning_rate": 8.573686455105526e-06, + "loss": 0.3707, + "step": 9280 + }, + { + "epoch": 0.27, + "grad_norm": 1.800770573066113, + "learning_rate": 8.57335792908166e-06, + "loss": 0.3311, + "step": 9281 + }, + { + "epoch": 0.27, + "grad_norm": 1.5811278616783513, + "learning_rate": 8.57302937152287e-06, + "loss": 0.3611, + "step": 9282 + }, + { + "epoch": 0.27, + "grad_norm": 2.530838170584207, + "learning_rate": 8.572700782432057e-06, + "loss": 0.368, + "step": 9283 + }, + { + "epoch": 0.27, + "grad_norm": 1.4424634900528526, + "learning_rate": 8.572372161812122e-06, + "loss": 0.3469, + "step": 9284 + }, + { + "epoch": 0.27, + "grad_norm": 1.7281587486970191, + "learning_rate": 8.57204350966596e-06, + "loss": 0.371, + "step": 9285 + }, + { + "epoch": 0.27, + "grad_norm": 1.4287358119991689, + "learning_rate": 8.571714825996475e-06, + "loss": 0.337, + "step": 9286 + }, + { + "epoch": 0.27, + "grad_norm": 1.4689668463949115, + "learning_rate": 8.571386110806569e-06, + "loss": 0.3569, + "step": 9287 + }, + { + "epoch": 0.27, + "grad_norm": 1.6567983423121384, + "learning_rate": 8.571057364099141e-06, + "loss": 0.3764, + "step": 9288 + }, + { + "epoch": 0.27, + "grad_norm": 1.6014060551735234, + "learning_rate": 8.570728585877088e-06, + "loss": 0.34, + "step": 9289 + }, + { + "epoch": 0.27, + "grad_norm": 1.6608049274255847, + "learning_rate": 8.570399776143319e-06, + "loss": 0.3236, + "step": 9290 + }, + { + "epoch": 0.27, + "grad_norm": 1.6278180432183424, + "learning_rate": 8.570070934900732e-06, + "loss": 0.3928, + "step": 9291 + }, + { + "epoch": 0.27, + "grad_norm": 1.5605241032795156, + "learning_rate": 8.569742062152229e-06, + "loss": 0.3577, + "step": 9292 + }, + { + "epoch": 0.27, + "grad_norm": 1.6936935542355904, + "learning_rate": 8.569413157900713e-06, + "loss": 0.3364, + "step": 9293 + }, + { + "epoch": 0.27, + "grad_norm": 1.4342440240348542, + "learning_rate": 8.569084222149087e-06, + "loss": 0.3713, + "step": 9294 + }, + { + "epoch": 0.27, + "grad_norm": 1.0363505521693033, + "learning_rate": 8.568755254900253e-06, + "loss": 0.6317, + "step": 9295 + }, + { + "epoch": 0.27, + "grad_norm": 1.4112646579366392, + "learning_rate": 8.568426256157112e-06, + "loss": 0.3511, + "step": 9296 + }, + { + "epoch": 0.27, + "grad_norm": 4.3765571067546265, + "learning_rate": 8.568097225922571e-06, + "loss": 0.334, + "step": 9297 + }, + { + "epoch": 0.27, + "grad_norm": 1.90865374042588, + "learning_rate": 8.567768164199533e-06, + "loss": 0.349, + "step": 9298 + }, + { + "epoch": 0.27, + "grad_norm": 1.822081963175954, + "learning_rate": 8.567439070990902e-06, + "loss": 0.388, + "step": 9299 + }, + { + "epoch": 0.27, + "grad_norm": 1.4946254843165658, + "learning_rate": 8.567109946299579e-06, + "loss": 0.3587, + "step": 9300 + }, + { + "epoch": 0.27, + "grad_norm": 1.513186220635361, + "learning_rate": 8.566780790128474e-06, + "loss": 0.3496, + "step": 9301 + }, + { + "epoch": 0.27, + "grad_norm": 2.380380174725705, + "learning_rate": 8.566451602480488e-06, + "loss": 0.3563, + "step": 9302 + }, + { + "epoch": 0.27, + "grad_norm": 1.4666425301082977, + "learning_rate": 8.566122383358526e-06, + "loss": 0.3301, + "step": 9303 + }, + { + "epoch": 0.27, + "grad_norm": 1.3587370879815346, + "learning_rate": 8.565793132765495e-06, + "loss": 0.3518, + "step": 9304 + }, + { + "epoch": 0.27, + "grad_norm": 1.841459384178346, + "learning_rate": 8.565463850704301e-06, + "loss": 0.3504, + "step": 9305 + }, + { + "epoch": 0.27, + "grad_norm": 1.013449130632785, + "learning_rate": 8.565134537177848e-06, + "loss": 0.5524, + "step": 9306 + }, + { + "epoch": 0.27, + "grad_norm": 1.6478516951886255, + "learning_rate": 8.564805192189043e-06, + "loss": 0.3459, + "step": 9307 + }, + { + "epoch": 0.27, + "grad_norm": 1.3011163934044574, + "learning_rate": 8.564475815740793e-06, + "loss": 0.3424, + "step": 9308 + }, + { + "epoch": 0.27, + "grad_norm": 1.466127870104297, + "learning_rate": 8.564146407836006e-06, + "loss": 0.3576, + "step": 9309 + }, + { + "epoch": 0.27, + "grad_norm": 1.8384627579932296, + "learning_rate": 8.563816968477585e-06, + "loss": 0.3628, + "step": 9310 + }, + { + "epoch": 0.27, + "grad_norm": 2.173571762207276, + "learning_rate": 8.56348749766844e-06, + "loss": 0.3643, + "step": 9311 + }, + { + "epoch": 0.27, + "grad_norm": 1.4611455619717557, + "learning_rate": 8.563157995411478e-06, + "loss": 0.3669, + "step": 9312 + }, + { + "epoch": 0.27, + "grad_norm": 2.211030782645733, + "learning_rate": 8.56282846170961e-06, + "loss": 0.3477, + "step": 9313 + }, + { + "epoch": 0.27, + "grad_norm": 1.5176383222003191, + "learning_rate": 8.56249889656574e-06, + "loss": 0.3386, + "step": 9314 + }, + { + "epoch": 0.27, + "grad_norm": 1.584882776529929, + "learning_rate": 8.562169299982776e-06, + "loss": 0.339, + "step": 9315 + }, + { + "epoch": 0.27, + "grad_norm": 1.6891416329891904, + "learning_rate": 8.561839671963629e-06, + "loss": 0.3534, + "step": 9316 + }, + { + "epoch": 0.27, + "grad_norm": 1.7068746257455354, + "learning_rate": 8.561510012511207e-06, + "loss": 0.3965, + "step": 9317 + }, + { + "epoch": 0.27, + "grad_norm": 1.4829281577394864, + "learning_rate": 8.561180321628421e-06, + "loss": 0.3411, + "step": 9318 + }, + { + "epoch": 0.27, + "grad_norm": 1.5377289012538415, + "learning_rate": 8.560850599318178e-06, + "loss": 0.3477, + "step": 9319 + }, + { + "epoch": 0.27, + "grad_norm": 1.2980729995819877, + "learning_rate": 8.56052084558339e-06, + "loss": 0.3706, + "step": 9320 + }, + { + "epoch": 0.27, + "grad_norm": 1.526060133849727, + "learning_rate": 8.560191060426966e-06, + "loss": 0.3839, + "step": 9321 + }, + { + "epoch": 0.27, + "grad_norm": 2.9589889443771025, + "learning_rate": 8.559861243851816e-06, + "loss": 0.3439, + "step": 9322 + }, + { + "epoch": 0.27, + "grad_norm": 1.4785347073065602, + "learning_rate": 8.559531395860851e-06, + "loss": 0.3224, + "step": 9323 + }, + { + "epoch": 0.27, + "grad_norm": 1.4200709820028028, + "learning_rate": 8.559201516456981e-06, + "loss": 0.3407, + "step": 9324 + }, + { + "epoch": 0.27, + "grad_norm": 1.512870302875328, + "learning_rate": 8.55887160564312e-06, + "loss": 0.3823, + "step": 9325 + }, + { + "epoch": 0.27, + "grad_norm": 1.4221579191007279, + "learning_rate": 8.558541663422178e-06, + "loss": 0.3249, + "step": 9326 + }, + { + "epoch": 0.27, + "grad_norm": 1.419441454470426, + "learning_rate": 8.558211689797063e-06, + "loss": 0.3605, + "step": 9327 + }, + { + "epoch": 0.27, + "grad_norm": 1.632446297550761, + "learning_rate": 8.557881684770692e-06, + "loss": 0.3386, + "step": 9328 + }, + { + "epoch": 0.27, + "grad_norm": 1.5941645031334195, + "learning_rate": 8.557551648345977e-06, + "loss": 0.3456, + "step": 9329 + }, + { + "epoch": 0.27, + "grad_norm": 1.461508489130223, + "learning_rate": 8.557221580525829e-06, + "loss": 0.3463, + "step": 9330 + }, + { + "epoch": 0.27, + "grad_norm": 1.7113873661483743, + "learning_rate": 8.55689148131316e-06, + "loss": 0.3614, + "step": 9331 + }, + { + "epoch": 0.27, + "grad_norm": 1.3675898572154737, + "learning_rate": 8.556561350710884e-06, + "loss": 0.3432, + "step": 9332 + }, + { + "epoch": 0.27, + "grad_norm": 1.436933537742672, + "learning_rate": 8.556231188721915e-06, + "loss": 0.3853, + "step": 9333 + }, + { + "epoch": 0.27, + "grad_norm": 1.7099791403255047, + "learning_rate": 8.555900995349167e-06, + "loss": 0.3701, + "step": 9334 + }, + { + "epoch": 0.27, + "grad_norm": 1.442505815169921, + "learning_rate": 8.555570770595553e-06, + "loss": 0.3419, + "step": 9335 + }, + { + "epoch": 0.27, + "grad_norm": 1.53995923456843, + "learning_rate": 8.555240514463987e-06, + "loss": 0.3516, + "step": 9336 + }, + { + "epoch": 0.27, + "grad_norm": 1.4850934559464724, + "learning_rate": 8.554910226957385e-06, + "loss": 0.3423, + "step": 9337 + }, + { + "epoch": 0.27, + "grad_norm": 1.4164126275586508, + "learning_rate": 8.55457990807866e-06, + "loss": 0.3393, + "step": 9338 + }, + { + "epoch": 0.27, + "grad_norm": 1.694151469915545, + "learning_rate": 8.554249557830728e-06, + "loss": 0.35, + "step": 9339 + }, + { + "epoch": 0.27, + "grad_norm": 1.5275966821059763, + "learning_rate": 8.553919176216504e-06, + "loss": 0.3431, + "step": 9340 + }, + { + "epoch": 0.27, + "grad_norm": 1.31960058988569, + "learning_rate": 8.553588763238905e-06, + "loss": 0.3613, + "step": 9341 + }, + { + "epoch": 0.27, + "grad_norm": 1.638752574317273, + "learning_rate": 8.553258318900844e-06, + "loss": 0.3462, + "step": 9342 + }, + { + "epoch": 0.27, + "grad_norm": 1.7090336601434002, + "learning_rate": 8.55292784320524e-06, + "loss": 0.3419, + "step": 9343 + }, + { + "epoch": 0.27, + "grad_norm": 1.3879087064567355, + "learning_rate": 8.552597336155008e-06, + "loss": 0.3579, + "step": 9344 + }, + { + "epoch": 0.27, + "grad_norm": 1.4516372865729983, + "learning_rate": 8.552266797753066e-06, + "loss": 0.3407, + "step": 9345 + }, + { + "epoch": 0.27, + "grad_norm": 1.4155406190403905, + "learning_rate": 8.551936228002328e-06, + "loss": 0.3524, + "step": 9346 + }, + { + "epoch": 0.27, + "grad_norm": 1.6460297629691791, + "learning_rate": 8.551605626905717e-06, + "loss": 0.3846, + "step": 9347 + }, + { + "epoch": 0.27, + "grad_norm": 1.6122340764918994, + "learning_rate": 8.551274994466143e-06, + "loss": 0.3447, + "step": 9348 + }, + { + "epoch": 0.27, + "grad_norm": 1.5896996478192167, + "learning_rate": 8.55094433068653e-06, + "loss": 0.3408, + "step": 9349 + }, + { + "epoch": 0.27, + "grad_norm": 2.054767712432073, + "learning_rate": 8.550613635569793e-06, + "loss": 0.3299, + "step": 9350 + }, + { + "epoch": 0.27, + "grad_norm": 1.3935724047511062, + "learning_rate": 8.550282909118852e-06, + "loss": 0.3524, + "step": 9351 + }, + { + "epoch": 0.27, + "grad_norm": 1.7162709797912175, + "learning_rate": 8.549952151336625e-06, + "loss": 0.3548, + "step": 9352 + }, + { + "epoch": 0.27, + "grad_norm": 0.9521917639363431, + "learning_rate": 8.54962136222603e-06, + "loss": 0.5725, + "step": 9353 + }, + { + "epoch": 0.27, + "grad_norm": 1.3184663023139325, + "learning_rate": 8.549290541789987e-06, + "loss": 0.3421, + "step": 9354 + }, + { + "epoch": 0.27, + "grad_norm": 1.5181413511543744, + "learning_rate": 8.548959690031415e-06, + "loss": 0.3474, + "step": 9355 + }, + { + "epoch": 0.27, + "grad_norm": 1.495387394028079, + "learning_rate": 8.548628806953237e-06, + "loss": 0.3508, + "step": 9356 + }, + { + "epoch": 0.27, + "grad_norm": 1.5788471529890988, + "learning_rate": 8.548297892558369e-06, + "loss": 0.3667, + "step": 9357 + }, + { + "epoch": 0.27, + "grad_norm": 1.428635071476256, + "learning_rate": 8.547966946849733e-06, + "loss": 0.3444, + "step": 9358 + }, + { + "epoch": 0.27, + "grad_norm": 1.6991597305351342, + "learning_rate": 8.547635969830248e-06, + "loss": 0.3543, + "step": 9359 + }, + { + "epoch": 0.27, + "grad_norm": 1.4874956712865655, + "learning_rate": 8.547304961502838e-06, + "loss": 0.3621, + "step": 9360 + }, + { + "epoch": 0.27, + "grad_norm": 1.5825446591624017, + "learning_rate": 8.546973921870421e-06, + "loss": 0.3465, + "step": 9361 + }, + { + "epoch": 0.27, + "grad_norm": 1.4291130441861035, + "learning_rate": 8.54664285093592e-06, + "loss": 0.365, + "step": 9362 + }, + { + "epoch": 0.27, + "grad_norm": 1.4239721733248856, + "learning_rate": 8.546311748702258e-06, + "loss": 0.345, + "step": 9363 + }, + { + "epoch": 0.27, + "grad_norm": 1.5653410687493208, + "learning_rate": 8.545980615172353e-06, + "loss": 0.3775, + "step": 9364 + }, + { + "epoch": 0.27, + "grad_norm": 1.5952400422468362, + "learning_rate": 8.545649450349131e-06, + "loss": 0.3681, + "step": 9365 + }, + { + "epoch": 0.27, + "grad_norm": 1.8100069838030208, + "learning_rate": 8.545318254235515e-06, + "loss": 0.3921, + "step": 9366 + }, + { + "epoch": 0.27, + "grad_norm": 1.4613602883135564, + "learning_rate": 8.544987026834424e-06, + "loss": 0.3528, + "step": 9367 + }, + { + "epoch": 0.27, + "grad_norm": 1.3708589206147201, + "learning_rate": 8.544655768148782e-06, + "loss": 0.3485, + "step": 9368 + }, + { + "epoch": 0.27, + "grad_norm": 1.4427993570891444, + "learning_rate": 8.544324478181516e-06, + "loss": 0.3495, + "step": 9369 + }, + { + "epoch": 0.27, + "grad_norm": 0.9982283510916772, + "learning_rate": 8.543993156935546e-06, + "loss": 0.562, + "step": 9370 + }, + { + "epoch": 0.27, + "grad_norm": 1.5632196386296324, + "learning_rate": 8.543661804413798e-06, + "loss": 0.3296, + "step": 9371 + }, + { + "epoch": 0.27, + "grad_norm": 1.6073508944374018, + "learning_rate": 8.543330420619193e-06, + "loss": 0.3639, + "step": 9372 + }, + { + "epoch": 0.27, + "grad_norm": 1.5765509858274063, + "learning_rate": 8.542999005554659e-06, + "loss": 0.349, + "step": 9373 + }, + { + "epoch": 0.27, + "grad_norm": 1.6750718160749836, + "learning_rate": 8.54266755922312e-06, + "loss": 0.3488, + "step": 9374 + }, + { + "epoch": 0.27, + "grad_norm": 1.5276762916563893, + "learning_rate": 8.542336081627501e-06, + "loss": 0.3555, + "step": 9375 + }, + { + "epoch": 0.27, + "grad_norm": 1.7614718234404987, + "learning_rate": 8.542004572770726e-06, + "loss": 0.3548, + "step": 9376 + }, + { + "epoch": 0.27, + "grad_norm": 1.561818591255594, + "learning_rate": 8.541673032655722e-06, + "loss": 0.3777, + "step": 9377 + }, + { + "epoch": 0.27, + "grad_norm": 1.6262660573480043, + "learning_rate": 8.541341461285413e-06, + "loss": 0.3502, + "step": 9378 + }, + { + "epoch": 0.27, + "grad_norm": 1.394708101831117, + "learning_rate": 8.541009858662728e-06, + "loss": 0.3852, + "step": 9379 + }, + { + "epoch": 0.27, + "grad_norm": 1.8868976374845852, + "learning_rate": 8.54067822479059e-06, + "loss": 0.3325, + "step": 9380 + }, + { + "epoch": 0.27, + "grad_norm": 0.9602592098555732, + "learning_rate": 8.540346559671929e-06, + "loss": 0.6099, + "step": 9381 + }, + { + "epoch": 0.27, + "grad_norm": 1.7068094008321026, + "learning_rate": 8.540014863309669e-06, + "loss": 0.3535, + "step": 9382 + }, + { + "epoch": 0.27, + "grad_norm": 1.8680202446746241, + "learning_rate": 8.539683135706737e-06, + "loss": 0.3547, + "step": 9383 + }, + { + "epoch": 0.27, + "grad_norm": 3.03362493607758, + "learning_rate": 8.539351376866066e-06, + "loss": 0.3756, + "step": 9384 + }, + { + "epoch": 0.27, + "grad_norm": 1.3781417503878504, + "learning_rate": 8.539019586790578e-06, + "loss": 0.35, + "step": 9385 + }, + { + "epoch": 0.27, + "grad_norm": 1.4000001645775093, + "learning_rate": 8.538687765483203e-06, + "loss": 0.3591, + "step": 9386 + }, + { + "epoch": 0.27, + "grad_norm": 1.5462856908252705, + "learning_rate": 8.538355912946867e-06, + "loss": 0.3356, + "step": 9387 + }, + { + "epoch": 0.27, + "grad_norm": 1.9948590494009273, + "learning_rate": 8.538024029184504e-06, + "loss": 0.3424, + "step": 9388 + }, + { + "epoch": 0.27, + "grad_norm": 1.7543049513583306, + "learning_rate": 8.53769211419904e-06, + "loss": 0.3394, + "step": 9389 + }, + { + "epoch": 0.27, + "grad_norm": 2.1801620522696363, + "learning_rate": 8.537360167993401e-06, + "loss": 0.3535, + "step": 9390 + }, + { + "epoch": 0.27, + "grad_norm": 1.4865506176169054, + "learning_rate": 8.53702819057052e-06, + "loss": 0.3616, + "step": 9391 + }, + { + "epoch": 0.27, + "grad_norm": 1.6163760125965871, + "learning_rate": 8.536696181933326e-06, + "loss": 0.3499, + "step": 9392 + }, + { + "epoch": 0.27, + "grad_norm": 1.449181005054388, + "learning_rate": 8.53636414208475e-06, + "loss": 0.3504, + "step": 9393 + }, + { + "epoch": 0.27, + "grad_norm": 1.3946894458567327, + "learning_rate": 8.53603207102772e-06, + "loss": 0.3578, + "step": 9394 + }, + { + "epoch": 0.27, + "grad_norm": 1.4197501606682024, + "learning_rate": 8.535699968765169e-06, + "loss": 0.3542, + "step": 9395 + }, + { + "epoch": 0.27, + "grad_norm": 1.4452351676581499, + "learning_rate": 8.535367835300026e-06, + "loss": 0.3503, + "step": 9396 + }, + { + "epoch": 0.27, + "grad_norm": 1.7282350018608892, + "learning_rate": 8.535035670635223e-06, + "loss": 0.3785, + "step": 9397 + }, + { + "epoch": 0.27, + "grad_norm": 1.532101471308527, + "learning_rate": 8.53470347477369e-06, + "loss": 0.3492, + "step": 9398 + }, + { + "epoch": 0.27, + "grad_norm": 1.358044541676241, + "learning_rate": 8.53437124771836e-06, + "loss": 0.3533, + "step": 9399 + }, + { + "epoch": 0.27, + "grad_norm": 1.4711679542647584, + "learning_rate": 8.534038989472165e-06, + "loss": 0.3425, + "step": 9400 + }, + { + "epoch": 0.27, + "grad_norm": 1.5131142693454234, + "learning_rate": 8.533706700038037e-06, + "loss": 0.3595, + "step": 9401 + }, + { + "epoch": 0.27, + "grad_norm": 1.6666271673325945, + "learning_rate": 8.533374379418908e-06, + "loss": 0.3641, + "step": 9402 + }, + { + "epoch": 0.27, + "grad_norm": 1.628069782394233, + "learning_rate": 8.53304202761771e-06, + "loss": 0.332, + "step": 9403 + }, + { + "epoch": 0.27, + "grad_norm": 3.5657642731244055, + "learning_rate": 8.532709644637377e-06, + "loss": 0.3391, + "step": 9404 + }, + { + "epoch": 0.27, + "grad_norm": 1.4598915888486268, + "learning_rate": 8.532377230480843e-06, + "loss": 0.3407, + "step": 9405 + }, + { + "epoch": 0.27, + "grad_norm": 1.6173222943082952, + "learning_rate": 8.532044785151042e-06, + "loss": 0.3591, + "step": 9406 + }, + { + "epoch": 0.27, + "grad_norm": 1.7365578380559532, + "learning_rate": 8.531712308650904e-06, + "loss": 0.3723, + "step": 9407 + }, + { + "epoch": 0.27, + "grad_norm": 1.5774061380525386, + "learning_rate": 8.531379800983366e-06, + "loss": 0.3612, + "step": 9408 + }, + { + "epoch": 0.27, + "grad_norm": 2.021661322891923, + "learning_rate": 8.531047262151365e-06, + "loss": 0.3627, + "step": 9409 + }, + { + "epoch": 0.27, + "grad_norm": 1.6728263635520988, + "learning_rate": 8.53071469215783e-06, + "loss": 0.3571, + "step": 9410 + }, + { + "epoch": 0.27, + "grad_norm": 1.535304582640804, + "learning_rate": 8.530382091005697e-06, + "loss": 0.362, + "step": 9411 + }, + { + "epoch": 0.27, + "grad_norm": 1.4220489222383699, + "learning_rate": 8.530049458697906e-06, + "loss": 0.3508, + "step": 9412 + }, + { + "epoch": 0.27, + "grad_norm": 1.718076967235593, + "learning_rate": 8.529716795237388e-06, + "loss": 0.3341, + "step": 9413 + }, + { + "epoch": 0.27, + "grad_norm": 1.705747602981327, + "learning_rate": 8.52938410062708e-06, + "loss": 0.3276, + "step": 9414 + }, + { + "epoch": 0.27, + "grad_norm": 1.6752524936477364, + "learning_rate": 8.52905137486992e-06, + "loss": 0.3251, + "step": 9415 + }, + { + "epoch": 0.27, + "grad_norm": 1.370280827889081, + "learning_rate": 8.52871861796884e-06, + "loss": 0.3289, + "step": 9416 + }, + { + "epoch": 0.27, + "grad_norm": 0.9903501862501567, + "learning_rate": 8.528385829926782e-06, + "loss": 0.587, + "step": 9417 + }, + { + "epoch": 0.27, + "grad_norm": 1.3580029344842108, + "learning_rate": 8.528053010746678e-06, + "loss": 0.35, + "step": 9418 + }, + { + "epoch": 0.27, + "grad_norm": 1.5966855923039496, + "learning_rate": 8.527720160431467e-06, + "loss": 0.3858, + "step": 9419 + }, + { + "epoch": 0.27, + "grad_norm": 1.6634586348701208, + "learning_rate": 8.527387278984088e-06, + "loss": 0.3615, + "step": 9420 + }, + { + "epoch": 0.27, + "grad_norm": 1.6081219052944906, + "learning_rate": 8.527054366407475e-06, + "loss": 0.3727, + "step": 9421 + }, + { + "epoch": 0.27, + "grad_norm": 1.3912807465570167, + "learning_rate": 8.526721422704569e-06, + "loss": 0.3691, + "step": 9422 + }, + { + "epoch": 0.27, + "grad_norm": 1.4143076933058991, + "learning_rate": 8.526388447878306e-06, + "loss": 0.3545, + "step": 9423 + }, + { + "epoch": 0.27, + "grad_norm": 1.3643571184679222, + "learning_rate": 8.526055441931627e-06, + "loss": 0.3413, + "step": 9424 + }, + { + "epoch": 0.27, + "grad_norm": 2.2637997382319255, + "learning_rate": 8.52572240486747e-06, + "loss": 0.3496, + "step": 9425 + }, + { + "epoch": 0.27, + "grad_norm": 1.6797801013839118, + "learning_rate": 8.525389336688774e-06, + "loss": 0.3202, + "step": 9426 + }, + { + "epoch": 0.27, + "grad_norm": 1.51828430113671, + "learning_rate": 8.525056237398477e-06, + "loss": 0.3313, + "step": 9427 + }, + { + "epoch": 0.27, + "grad_norm": 1.6291186721149271, + "learning_rate": 8.524723106999519e-06, + "loss": 0.3566, + "step": 9428 + }, + { + "epoch": 0.27, + "grad_norm": 1.392202556043193, + "learning_rate": 8.524389945494841e-06, + "loss": 0.3557, + "step": 9429 + }, + { + "epoch": 0.27, + "grad_norm": 1.8416364057729917, + "learning_rate": 8.524056752887385e-06, + "loss": 0.4025, + "step": 9430 + }, + { + "epoch": 0.27, + "grad_norm": 1.8348407809120881, + "learning_rate": 8.523723529180087e-06, + "loss": 0.3527, + "step": 9431 + }, + { + "epoch": 0.27, + "grad_norm": 1.3635461175812629, + "learning_rate": 8.523390274375891e-06, + "loss": 0.3254, + "step": 9432 + }, + { + "epoch": 0.27, + "grad_norm": 0.890776237549546, + "learning_rate": 8.523056988477733e-06, + "loss": 0.554, + "step": 9433 + }, + { + "epoch": 0.27, + "grad_norm": 1.680859363662678, + "learning_rate": 8.522723671488562e-06, + "loss": 0.3617, + "step": 9434 + }, + { + "epoch": 0.27, + "grad_norm": 1.8405728367035576, + "learning_rate": 8.522390323411314e-06, + "loss": 0.3526, + "step": 9435 + }, + { + "epoch": 0.27, + "grad_norm": 1.8518714254959303, + "learning_rate": 8.522056944248933e-06, + "loss": 0.3961, + "step": 9436 + }, + { + "epoch": 0.27, + "grad_norm": 1.9183565242832377, + "learning_rate": 8.521723534004359e-06, + "loss": 0.3692, + "step": 9437 + }, + { + "epoch": 0.27, + "grad_norm": 1.5450840576344593, + "learning_rate": 8.521390092680536e-06, + "loss": 0.328, + "step": 9438 + }, + { + "epoch": 0.27, + "grad_norm": 1.511195634263701, + "learning_rate": 8.521056620280408e-06, + "loss": 0.376, + "step": 9439 + }, + { + "epoch": 0.27, + "grad_norm": 1.647390013177964, + "learning_rate": 8.520723116806915e-06, + "loss": 0.3715, + "step": 9440 + }, + { + "epoch": 0.27, + "grad_norm": 1.6613537707303137, + "learning_rate": 8.520389582263001e-06, + "loss": 0.345, + "step": 9441 + }, + { + "epoch": 0.27, + "grad_norm": 1.5841750038790046, + "learning_rate": 8.52005601665161e-06, + "loss": 0.3402, + "step": 9442 + }, + { + "epoch": 0.27, + "grad_norm": 1.4730591255054324, + "learning_rate": 8.519722419975687e-06, + "loss": 0.3611, + "step": 9443 + }, + { + "epoch": 0.27, + "grad_norm": 1.4563619948236588, + "learning_rate": 8.519388792238173e-06, + "loss": 0.3664, + "step": 9444 + }, + { + "epoch": 0.27, + "grad_norm": 3.1172968366611253, + "learning_rate": 8.519055133442016e-06, + "loss": 0.3546, + "step": 9445 + }, + { + "epoch": 0.27, + "grad_norm": 1.998457657960631, + "learning_rate": 8.518721443590156e-06, + "loss": 0.3135, + "step": 9446 + }, + { + "epoch": 0.27, + "grad_norm": 1.6880956655585402, + "learning_rate": 8.518387722685542e-06, + "loss": 0.3384, + "step": 9447 + }, + { + "epoch": 0.27, + "grad_norm": 1.440733864630488, + "learning_rate": 8.518053970731117e-06, + "loss": 0.3567, + "step": 9448 + }, + { + "epoch": 0.27, + "grad_norm": 1.7816360430442555, + "learning_rate": 8.517720187729826e-06, + "loss": 0.3545, + "step": 9449 + }, + { + "epoch": 0.27, + "grad_norm": 1.7187556384296703, + "learning_rate": 8.517386373684615e-06, + "loss": 0.378, + "step": 9450 + }, + { + "epoch": 0.27, + "grad_norm": 2.051208945881084, + "learning_rate": 8.517052528598431e-06, + "loss": 0.3529, + "step": 9451 + }, + { + "epoch": 0.27, + "grad_norm": 1.682839429786782, + "learning_rate": 8.516718652474218e-06, + "loss": 0.3807, + "step": 9452 + }, + { + "epoch": 0.27, + "grad_norm": 1.6670977949320036, + "learning_rate": 8.516384745314926e-06, + "loss": 0.3573, + "step": 9453 + }, + { + "epoch": 0.27, + "grad_norm": 2.075359803032021, + "learning_rate": 8.516050807123499e-06, + "loss": 0.3382, + "step": 9454 + }, + { + "epoch": 0.27, + "grad_norm": 1.883165546491816, + "learning_rate": 8.515716837902883e-06, + "loss": 0.3561, + "step": 9455 + }, + { + "epoch": 0.27, + "grad_norm": 1.539527248419372, + "learning_rate": 8.51538283765603e-06, + "loss": 0.342, + "step": 9456 + }, + { + "epoch": 0.27, + "grad_norm": 1.7500791019714896, + "learning_rate": 8.515048806385881e-06, + "loss": 0.3684, + "step": 9457 + }, + { + "epoch": 0.27, + "grad_norm": 1.6657049217792987, + "learning_rate": 8.51471474409539e-06, + "loss": 0.3511, + "step": 9458 + }, + { + "epoch": 0.27, + "grad_norm": 2.516661334264238, + "learning_rate": 8.514380650787502e-06, + "loss": 0.3745, + "step": 9459 + }, + { + "epoch": 0.27, + "grad_norm": 1.8692785955275486, + "learning_rate": 8.514046526465166e-06, + "loss": 0.3558, + "step": 9460 + }, + { + "epoch": 0.27, + "grad_norm": 2.6130229319605203, + "learning_rate": 8.513712371131329e-06, + "loss": 0.3554, + "step": 9461 + }, + { + "epoch": 0.27, + "grad_norm": 1.697478720373499, + "learning_rate": 8.51337818478894e-06, + "loss": 0.3792, + "step": 9462 + }, + { + "epoch": 0.27, + "grad_norm": 1.8101914277688032, + "learning_rate": 8.513043967440953e-06, + "loss": 0.3392, + "step": 9463 + }, + { + "epoch": 0.27, + "grad_norm": 1.4122146623390903, + "learning_rate": 8.512709719090312e-06, + "loss": 0.3524, + "step": 9464 + }, + { + "epoch": 0.27, + "grad_norm": 1.4402732842618176, + "learning_rate": 8.512375439739968e-06, + "loss": 0.3391, + "step": 9465 + }, + { + "epoch": 0.27, + "grad_norm": 1.4788459499081126, + "learning_rate": 8.512041129392875e-06, + "loss": 0.3366, + "step": 9466 + }, + { + "epoch": 0.27, + "grad_norm": 2.2178483886898794, + "learning_rate": 8.511706788051976e-06, + "loss": 0.3894, + "step": 9467 + }, + { + "epoch": 0.27, + "grad_norm": 1.8067846935991478, + "learning_rate": 8.51137241572023e-06, + "loss": 0.3499, + "step": 9468 + }, + { + "epoch": 0.27, + "grad_norm": 1.5512051854735949, + "learning_rate": 8.51103801240058e-06, + "loss": 0.3351, + "step": 9469 + }, + { + "epoch": 0.27, + "grad_norm": 1.6422647807339694, + "learning_rate": 8.510703578095983e-06, + "loss": 0.3599, + "step": 9470 + }, + { + "epoch": 0.27, + "grad_norm": 1.691064383690915, + "learning_rate": 8.510369112809385e-06, + "loss": 0.3499, + "step": 9471 + }, + { + "epoch": 0.27, + "grad_norm": 1.5041617789034132, + "learning_rate": 8.510034616543744e-06, + "loss": 0.3626, + "step": 9472 + }, + { + "epoch": 0.27, + "grad_norm": 1.4802411406786131, + "learning_rate": 8.509700089302007e-06, + "loss": 0.3466, + "step": 9473 + }, + { + "epoch": 0.27, + "grad_norm": 1.4161163711100204, + "learning_rate": 8.509365531087126e-06, + "loss": 0.3587, + "step": 9474 + }, + { + "epoch": 0.27, + "grad_norm": 1.860463932597522, + "learning_rate": 8.509030941902058e-06, + "loss": 0.353, + "step": 9475 + }, + { + "epoch": 0.27, + "grad_norm": 1.5123291301438768, + "learning_rate": 8.508696321749752e-06, + "loss": 0.3407, + "step": 9476 + }, + { + "epoch": 0.27, + "grad_norm": 2.165696014363303, + "learning_rate": 8.508361670633162e-06, + "loss": 0.3413, + "step": 9477 + }, + { + "epoch": 0.27, + "grad_norm": 1.4849186578023095, + "learning_rate": 8.508026988555241e-06, + "loss": 0.3504, + "step": 9478 + }, + { + "epoch": 0.27, + "grad_norm": 1.5528455330209616, + "learning_rate": 8.507692275518943e-06, + "loss": 0.3591, + "step": 9479 + }, + { + "epoch": 0.27, + "grad_norm": 1.7408388479015624, + "learning_rate": 8.507357531527223e-06, + "loss": 0.3371, + "step": 9480 + }, + { + "epoch": 0.27, + "grad_norm": 1.4769154806003786, + "learning_rate": 8.507022756583032e-06, + "loss": 0.3297, + "step": 9481 + }, + { + "epoch": 0.28, + "grad_norm": 1.3997678251496246, + "learning_rate": 8.506687950689328e-06, + "loss": 0.3411, + "step": 9482 + }, + { + "epoch": 0.28, + "grad_norm": 1.4030971503221754, + "learning_rate": 8.506353113849062e-06, + "loss": 0.3671, + "step": 9483 + }, + { + "epoch": 0.28, + "grad_norm": 2.014731263910815, + "learning_rate": 8.506018246065193e-06, + "loss": 0.3626, + "step": 9484 + }, + { + "epoch": 0.28, + "grad_norm": 1.67549303960686, + "learning_rate": 8.505683347340672e-06, + "loss": 0.3617, + "step": 9485 + }, + { + "epoch": 0.28, + "grad_norm": 1.6814600822379022, + "learning_rate": 8.505348417678458e-06, + "loss": 0.3648, + "step": 9486 + }, + { + "epoch": 0.28, + "grad_norm": 1.5344154276760837, + "learning_rate": 8.505013457081506e-06, + "loss": 0.3429, + "step": 9487 + }, + { + "epoch": 0.28, + "grad_norm": 1.7910950493754356, + "learning_rate": 8.504678465552769e-06, + "loss": 0.3406, + "step": 9488 + }, + { + "epoch": 0.28, + "grad_norm": 1.5067626388791853, + "learning_rate": 8.504343443095208e-06, + "loss": 0.3422, + "step": 9489 + }, + { + "epoch": 0.28, + "grad_norm": 1.6921235085514408, + "learning_rate": 8.504008389711775e-06, + "loss": 0.3503, + "step": 9490 + }, + { + "epoch": 0.28, + "grad_norm": 1.4794260283502827, + "learning_rate": 8.503673305405431e-06, + "loss": 0.3659, + "step": 9491 + }, + { + "epoch": 0.28, + "grad_norm": 1.459538122821982, + "learning_rate": 8.503338190179132e-06, + "loss": 0.3393, + "step": 9492 + }, + { + "epoch": 0.28, + "grad_norm": 1.3548487420339546, + "learning_rate": 8.503003044035833e-06, + "loss": 0.3379, + "step": 9493 + }, + { + "epoch": 0.28, + "grad_norm": 1.4875855409607348, + "learning_rate": 8.502667866978492e-06, + "loss": 0.3578, + "step": 9494 + }, + { + "epoch": 0.28, + "grad_norm": 1.756488204982764, + "learning_rate": 8.50233265901007e-06, + "loss": 0.3838, + "step": 9495 + }, + { + "epoch": 0.28, + "grad_norm": 1.745449378015664, + "learning_rate": 8.501997420133523e-06, + "loss": 0.349, + "step": 9496 + }, + { + "epoch": 0.28, + "grad_norm": 1.5651589443570995, + "learning_rate": 8.50166215035181e-06, + "loss": 0.3573, + "step": 9497 + }, + { + "epoch": 0.28, + "grad_norm": 2.123768498908178, + "learning_rate": 8.50132684966789e-06, + "loss": 0.358, + "step": 9498 + }, + { + "epoch": 0.28, + "grad_norm": 2.2986182432735798, + "learning_rate": 8.50099151808472e-06, + "loss": 0.3443, + "step": 9499 + }, + { + "epoch": 0.28, + "grad_norm": 1.548398632809067, + "learning_rate": 8.500656155605264e-06, + "loss": 0.3522, + "step": 9500 + }, + { + "epoch": 0.28, + "grad_norm": 1.462378010388843, + "learning_rate": 8.500320762232476e-06, + "loss": 0.3945, + "step": 9501 + }, + { + "epoch": 0.28, + "grad_norm": 1.3414996065878835, + "learning_rate": 8.499985337969321e-06, + "loss": 0.354, + "step": 9502 + }, + { + "epoch": 0.28, + "grad_norm": 1.6062628501961826, + "learning_rate": 8.499649882818754e-06, + "loss": 0.366, + "step": 9503 + }, + { + "epoch": 0.28, + "grad_norm": 1.877873577692208, + "learning_rate": 8.49931439678374e-06, + "loss": 0.3404, + "step": 9504 + }, + { + "epoch": 0.28, + "grad_norm": 1.8463566027053897, + "learning_rate": 8.498978879867235e-06, + "loss": 0.4217, + "step": 9505 + }, + { + "epoch": 0.28, + "grad_norm": 1.5515856029805013, + "learning_rate": 8.498643332072206e-06, + "loss": 0.3494, + "step": 9506 + }, + { + "epoch": 0.28, + "grad_norm": 1.8055530046283441, + "learning_rate": 8.498307753401607e-06, + "loss": 0.3463, + "step": 9507 + }, + { + "epoch": 0.28, + "grad_norm": 1.4566964621568814, + "learning_rate": 8.497972143858406e-06, + "loss": 0.3428, + "step": 9508 + }, + { + "epoch": 0.28, + "grad_norm": 1.6165126933856655, + "learning_rate": 8.49763650344556e-06, + "loss": 0.3702, + "step": 9509 + }, + { + "epoch": 0.28, + "grad_norm": 1.974083810147678, + "learning_rate": 8.497300832166036e-06, + "loss": 0.3375, + "step": 9510 + }, + { + "epoch": 0.28, + "grad_norm": 4.033216421154716, + "learning_rate": 8.496965130022792e-06, + "loss": 0.3615, + "step": 9511 + }, + { + "epoch": 0.28, + "grad_norm": 1.6853833783439454, + "learning_rate": 8.49662939701879e-06, + "loss": 0.3406, + "step": 9512 + }, + { + "epoch": 0.28, + "grad_norm": 1.7434992063012538, + "learning_rate": 8.496293633156997e-06, + "loss": 0.3225, + "step": 9513 + }, + { + "epoch": 0.28, + "grad_norm": 1.5753777890920262, + "learning_rate": 8.495957838440373e-06, + "loss": 0.353, + "step": 9514 + }, + { + "epoch": 0.28, + "grad_norm": 1.4394047914704922, + "learning_rate": 8.495622012871883e-06, + "loss": 0.3506, + "step": 9515 + }, + { + "epoch": 0.28, + "grad_norm": 1.6324593412886312, + "learning_rate": 8.49528615645449e-06, + "loss": 0.3705, + "step": 9516 + }, + { + "epoch": 0.28, + "grad_norm": 1.447727812203155, + "learning_rate": 8.494950269191157e-06, + "loss": 0.3782, + "step": 9517 + }, + { + "epoch": 0.28, + "grad_norm": 0.9410726586985937, + "learning_rate": 8.494614351084848e-06, + "loss": 0.5697, + "step": 9518 + }, + { + "epoch": 0.28, + "grad_norm": 1.5138501421417594, + "learning_rate": 8.494278402138532e-06, + "loss": 0.3579, + "step": 9519 + }, + { + "epoch": 0.28, + "grad_norm": 1.4484334686506697, + "learning_rate": 8.493942422355168e-06, + "loss": 0.3512, + "step": 9520 + }, + { + "epoch": 0.28, + "grad_norm": 1.4322044145683686, + "learning_rate": 8.493606411737724e-06, + "loss": 0.3452, + "step": 9521 + }, + { + "epoch": 0.28, + "grad_norm": 1.5308042780181264, + "learning_rate": 8.493270370289164e-06, + "loss": 0.3758, + "step": 9522 + }, + { + "epoch": 0.28, + "grad_norm": 1.5150680092715536, + "learning_rate": 8.492934298012453e-06, + "loss": 0.3469, + "step": 9523 + }, + { + "epoch": 0.28, + "grad_norm": 1.8534368648433686, + "learning_rate": 8.492598194910562e-06, + "loss": 0.3442, + "step": 9524 + }, + { + "epoch": 0.28, + "grad_norm": 1.636732092903565, + "learning_rate": 8.49226206098645e-06, + "loss": 0.3512, + "step": 9525 + }, + { + "epoch": 0.28, + "grad_norm": 1.862937621644449, + "learning_rate": 8.491925896243087e-06, + "loss": 0.3708, + "step": 9526 + }, + { + "epoch": 0.28, + "grad_norm": 1.3702212887862149, + "learning_rate": 8.49158970068344e-06, + "loss": 0.3543, + "step": 9527 + }, + { + "epoch": 0.28, + "grad_norm": 1.484780124635157, + "learning_rate": 8.491253474310474e-06, + "loss": 0.3568, + "step": 9528 + }, + { + "epoch": 0.28, + "grad_norm": 1.4116752985449879, + "learning_rate": 8.49091721712716e-06, + "loss": 0.3495, + "step": 9529 + }, + { + "epoch": 0.28, + "grad_norm": 1.428050036018875, + "learning_rate": 8.490580929136462e-06, + "loss": 0.3445, + "step": 9530 + }, + { + "epoch": 0.28, + "grad_norm": 2.1904894512043267, + "learning_rate": 8.490244610341348e-06, + "loss": 0.3481, + "step": 9531 + }, + { + "epoch": 0.28, + "grad_norm": 1.663635978419435, + "learning_rate": 8.489908260744786e-06, + "loss": 0.3519, + "step": 9532 + }, + { + "epoch": 0.28, + "grad_norm": 1.452352130934454, + "learning_rate": 8.489571880349745e-06, + "loss": 0.3563, + "step": 9533 + }, + { + "epoch": 0.28, + "grad_norm": 1.418610369787248, + "learning_rate": 8.489235469159192e-06, + "loss": 0.3412, + "step": 9534 + }, + { + "epoch": 0.28, + "grad_norm": 1.4557497566941993, + "learning_rate": 8.4888990271761e-06, + "loss": 0.3624, + "step": 9535 + }, + { + "epoch": 0.28, + "grad_norm": 1.9808871693304706, + "learning_rate": 8.488562554403435e-06, + "loss": 0.3342, + "step": 9536 + }, + { + "epoch": 0.28, + "grad_norm": 1.4828829281858684, + "learning_rate": 8.488226050844165e-06, + "loss": 0.332, + "step": 9537 + }, + { + "epoch": 0.28, + "grad_norm": 1.4448316324968056, + "learning_rate": 8.487889516501262e-06, + "loss": 0.3582, + "step": 9538 + }, + { + "epoch": 0.28, + "grad_norm": 1.5665648402958754, + "learning_rate": 8.487552951377696e-06, + "loss": 0.3606, + "step": 9539 + }, + { + "epoch": 0.28, + "grad_norm": 1.6436627013965714, + "learning_rate": 8.487216355476436e-06, + "loss": 0.3439, + "step": 9540 + }, + { + "epoch": 0.28, + "grad_norm": 1.5327313435730257, + "learning_rate": 8.486879728800454e-06, + "loss": 0.3715, + "step": 9541 + }, + { + "epoch": 0.28, + "grad_norm": 1.534211691458022, + "learning_rate": 8.486543071352717e-06, + "loss": 0.3439, + "step": 9542 + }, + { + "epoch": 0.28, + "grad_norm": 1.7974976399615021, + "learning_rate": 8.486206383136202e-06, + "loss": 0.4064, + "step": 9543 + }, + { + "epoch": 0.28, + "grad_norm": 1.540110438675906, + "learning_rate": 8.485869664153877e-06, + "loss": 0.3637, + "step": 9544 + }, + { + "epoch": 0.28, + "grad_norm": 1.493291700390252, + "learning_rate": 8.485532914408712e-06, + "loss": 0.3302, + "step": 9545 + }, + { + "epoch": 0.28, + "grad_norm": 1.0224136269967563, + "learning_rate": 8.48519613390368e-06, + "loss": 0.6284, + "step": 9546 + }, + { + "epoch": 0.28, + "grad_norm": 1.5002383127440746, + "learning_rate": 8.484859322641755e-06, + "loss": 0.3462, + "step": 9547 + }, + { + "epoch": 0.28, + "grad_norm": 1.648231868329907, + "learning_rate": 8.484522480625905e-06, + "loss": 0.3275, + "step": 9548 + }, + { + "epoch": 0.28, + "grad_norm": 1.3385578019287467, + "learning_rate": 8.484185607859108e-06, + "loss": 0.362, + "step": 9549 + }, + { + "epoch": 0.28, + "grad_norm": 1.7654236305232827, + "learning_rate": 8.483848704344333e-06, + "loss": 0.355, + "step": 9550 + }, + { + "epoch": 0.28, + "grad_norm": 1.4470105813616672, + "learning_rate": 8.483511770084555e-06, + "loss": 0.3659, + "step": 9551 + }, + { + "epoch": 0.28, + "grad_norm": 1.6139823082561906, + "learning_rate": 8.483174805082747e-06, + "loss": 0.3521, + "step": 9552 + }, + { + "epoch": 0.28, + "grad_norm": 1.3952924325644434, + "learning_rate": 8.482837809341884e-06, + "loss": 0.3442, + "step": 9553 + }, + { + "epoch": 0.28, + "grad_norm": 1.3797437797336223, + "learning_rate": 8.482500782864937e-06, + "loss": 0.323, + "step": 9554 + }, + { + "epoch": 0.28, + "grad_norm": 1.4264641489951657, + "learning_rate": 8.482163725654883e-06, + "loss": 0.3381, + "step": 9555 + }, + { + "epoch": 0.28, + "grad_norm": 1.5877133774885233, + "learning_rate": 8.481826637714694e-06, + "loss": 0.3541, + "step": 9556 + }, + { + "epoch": 0.28, + "grad_norm": 1.4790747697482467, + "learning_rate": 8.481489519047348e-06, + "loss": 0.3444, + "step": 9557 + }, + { + "epoch": 0.28, + "grad_norm": 1.4325828090649788, + "learning_rate": 8.481152369655818e-06, + "loss": 0.3624, + "step": 9558 + }, + { + "epoch": 0.28, + "grad_norm": 1.3681105711856916, + "learning_rate": 8.48081518954308e-06, + "loss": 0.3426, + "step": 9559 + }, + { + "epoch": 0.28, + "grad_norm": 1.453928232039822, + "learning_rate": 8.480477978712108e-06, + "loss": 0.3392, + "step": 9560 + }, + { + "epoch": 0.28, + "grad_norm": 1.3850924441540589, + "learning_rate": 8.48014073716588e-06, + "loss": 0.3314, + "step": 9561 + }, + { + "epoch": 0.28, + "grad_norm": 1.4362955440737637, + "learning_rate": 8.479803464907372e-06, + "loss": 0.3399, + "step": 9562 + }, + { + "epoch": 0.28, + "grad_norm": 1.6549070802535037, + "learning_rate": 8.479466161939558e-06, + "loss": 0.339, + "step": 9563 + }, + { + "epoch": 0.28, + "grad_norm": 1.457160831005892, + "learning_rate": 8.479128828265418e-06, + "loss": 0.3544, + "step": 9564 + }, + { + "epoch": 0.28, + "grad_norm": 1.8958428362066964, + "learning_rate": 8.478791463887927e-06, + "loss": 0.3402, + "step": 9565 + }, + { + "epoch": 0.28, + "grad_norm": 1.6395997716450588, + "learning_rate": 8.478454068810064e-06, + "loss": 0.3562, + "step": 9566 + }, + { + "epoch": 0.28, + "grad_norm": 1.807565269339109, + "learning_rate": 8.478116643034804e-06, + "loss": 0.3517, + "step": 9567 + }, + { + "epoch": 0.28, + "grad_norm": 1.5172098242773155, + "learning_rate": 8.477779186565125e-06, + "loss": 0.3531, + "step": 9568 + }, + { + "epoch": 0.28, + "grad_norm": 1.4873205750627985, + "learning_rate": 8.477441699404007e-06, + "loss": 0.3515, + "step": 9569 + }, + { + "epoch": 0.28, + "grad_norm": 1.616843739597172, + "learning_rate": 8.47710418155443e-06, + "loss": 0.355, + "step": 9570 + }, + { + "epoch": 0.28, + "grad_norm": 1.3366129664218123, + "learning_rate": 8.476766633019367e-06, + "loss": 0.3583, + "step": 9571 + }, + { + "epoch": 0.28, + "grad_norm": 1.6654063587743262, + "learning_rate": 8.4764290538018e-06, + "loss": 0.3528, + "step": 9572 + }, + { + "epoch": 0.28, + "grad_norm": 1.7190405872704262, + "learning_rate": 8.47609144390471e-06, + "loss": 0.3593, + "step": 9573 + }, + { + "epoch": 0.28, + "grad_norm": 2.3955484110148135, + "learning_rate": 8.475753803331073e-06, + "loss": 0.3373, + "step": 9574 + }, + { + "epoch": 0.28, + "grad_norm": 1.5392644530789013, + "learning_rate": 8.475416132083869e-06, + "loss": 0.3794, + "step": 9575 + }, + { + "epoch": 0.28, + "grad_norm": 1.7741319870387335, + "learning_rate": 8.475078430166081e-06, + "loss": 0.3297, + "step": 9576 + }, + { + "epoch": 0.28, + "grad_norm": 1.4258839793217732, + "learning_rate": 8.474740697580687e-06, + "loss": 0.3492, + "step": 9577 + }, + { + "epoch": 0.28, + "grad_norm": 1.6244258907431561, + "learning_rate": 8.474402934330669e-06, + "loss": 0.3758, + "step": 9578 + }, + { + "epoch": 0.28, + "grad_norm": 1.5468438008327219, + "learning_rate": 8.474065140419003e-06, + "loss": 0.3477, + "step": 9579 + }, + { + "epoch": 0.28, + "grad_norm": 1.3184995812523364, + "learning_rate": 8.473727315848676e-06, + "loss": 0.3428, + "step": 9580 + }, + { + "epoch": 0.28, + "grad_norm": 1.704516297401831, + "learning_rate": 8.473389460622667e-06, + "loss": 0.3514, + "step": 9581 + }, + { + "epoch": 0.28, + "grad_norm": 3.529926185285455, + "learning_rate": 8.473051574743955e-06, + "loss": 0.3495, + "step": 9582 + }, + { + "epoch": 0.28, + "grad_norm": 1.9119177543341983, + "learning_rate": 8.472713658215527e-06, + "loss": 0.3414, + "step": 9583 + }, + { + "epoch": 0.28, + "grad_norm": 1.2856550704050687, + "learning_rate": 8.472375711040361e-06, + "loss": 0.3539, + "step": 9584 + }, + { + "epoch": 0.28, + "grad_norm": 2.393542412829026, + "learning_rate": 8.472037733221444e-06, + "loss": 0.3314, + "step": 9585 + }, + { + "epoch": 0.28, + "grad_norm": 1.317951392383433, + "learning_rate": 8.471699724761752e-06, + "loss": 0.3586, + "step": 9586 + }, + { + "epoch": 0.28, + "grad_norm": 1.493643062730206, + "learning_rate": 8.471361685664272e-06, + "loss": 0.3606, + "step": 9587 + }, + { + "epoch": 0.28, + "grad_norm": 1.404361247854483, + "learning_rate": 8.471023615931988e-06, + "loss": 0.338, + "step": 9588 + }, + { + "epoch": 0.28, + "grad_norm": 2.3409757221925536, + "learning_rate": 8.470685515567882e-06, + "loss": 0.3194, + "step": 9589 + }, + { + "epoch": 0.28, + "grad_norm": 1.3885571522550275, + "learning_rate": 8.470347384574936e-06, + "loss": 0.3638, + "step": 9590 + }, + { + "epoch": 0.28, + "grad_norm": 1.4710089618958389, + "learning_rate": 8.470009222956138e-06, + "loss": 0.33, + "step": 9591 + }, + { + "epoch": 0.28, + "grad_norm": 1.5551238480340188, + "learning_rate": 8.469671030714469e-06, + "loss": 0.3347, + "step": 9592 + }, + { + "epoch": 0.28, + "grad_norm": 1.4758105310430778, + "learning_rate": 8.469332807852913e-06, + "loss": 0.3745, + "step": 9593 + }, + { + "epoch": 0.28, + "grad_norm": 1.4707727558717947, + "learning_rate": 8.468994554374458e-06, + "loss": 0.348, + "step": 9594 + }, + { + "epoch": 0.28, + "grad_norm": 1.7056276885535644, + "learning_rate": 8.468656270282088e-06, + "loss": 0.3517, + "step": 9595 + }, + { + "epoch": 0.28, + "grad_norm": 1.3562540282177942, + "learning_rate": 8.468317955578788e-06, + "loss": 0.3366, + "step": 9596 + }, + { + "epoch": 0.28, + "grad_norm": 1.4135456135794888, + "learning_rate": 8.467979610267543e-06, + "loss": 0.354, + "step": 9597 + }, + { + "epoch": 0.28, + "grad_norm": 1.5162884891586514, + "learning_rate": 8.46764123435134e-06, + "loss": 0.3492, + "step": 9598 + }, + { + "epoch": 0.28, + "grad_norm": 1.8929762958245913, + "learning_rate": 8.467302827833163e-06, + "loss": 0.3768, + "step": 9599 + }, + { + "epoch": 0.28, + "grad_norm": 1.7536538227619736, + "learning_rate": 8.466964390716002e-06, + "loss": 0.3412, + "step": 9600 + }, + { + "epoch": 0.28, + "grad_norm": 1.4047084800718836, + "learning_rate": 8.466625923002842e-06, + "loss": 0.3549, + "step": 9601 + }, + { + "epoch": 0.28, + "grad_norm": 1.4767280795179614, + "learning_rate": 8.466287424696668e-06, + "loss": 0.3403, + "step": 9602 + }, + { + "epoch": 0.28, + "grad_norm": 1.7948180522306625, + "learning_rate": 8.46594889580047e-06, + "loss": 0.3474, + "step": 9603 + }, + { + "epoch": 0.28, + "grad_norm": 1.8066018735958091, + "learning_rate": 8.465610336317235e-06, + "loss": 0.3359, + "step": 9604 + }, + { + "epoch": 0.28, + "grad_norm": 1.802736486382421, + "learning_rate": 8.46527174624995e-06, + "loss": 0.3443, + "step": 9605 + }, + { + "epoch": 0.28, + "grad_norm": 1.7520215510524537, + "learning_rate": 8.464933125601602e-06, + "loss": 0.3706, + "step": 9606 + }, + { + "epoch": 0.28, + "grad_norm": 1.674670713470005, + "learning_rate": 8.464594474375183e-06, + "loss": 0.3492, + "step": 9607 + }, + { + "epoch": 0.28, + "grad_norm": 1.6338519133021012, + "learning_rate": 8.464255792573679e-06, + "loss": 0.3329, + "step": 9608 + }, + { + "epoch": 0.28, + "grad_norm": 2.3429666177311614, + "learning_rate": 8.463917080200078e-06, + "loss": 0.3411, + "step": 9609 + }, + { + "epoch": 0.28, + "grad_norm": 1.4663812264839686, + "learning_rate": 8.46357833725737e-06, + "loss": 0.3519, + "step": 9610 + }, + { + "epoch": 0.28, + "grad_norm": 1.406554757941218, + "learning_rate": 8.463239563748547e-06, + "loss": 0.3763, + "step": 9611 + }, + { + "epoch": 0.28, + "grad_norm": 1.4020466494917645, + "learning_rate": 8.462900759676594e-06, + "loss": 0.3785, + "step": 9612 + }, + { + "epoch": 0.28, + "grad_norm": 1.3145927717056638, + "learning_rate": 8.462561925044505e-06, + "loss": 0.367, + "step": 9613 + }, + { + "epoch": 0.28, + "grad_norm": 1.3854249122160693, + "learning_rate": 8.462223059855268e-06, + "loss": 0.3287, + "step": 9614 + }, + { + "epoch": 0.28, + "grad_norm": 1.8510402094815959, + "learning_rate": 8.461884164111874e-06, + "loss": 0.3412, + "step": 9615 + }, + { + "epoch": 0.28, + "grad_norm": 1.513680201014686, + "learning_rate": 8.461545237817314e-06, + "loss": 0.3607, + "step": 9616 + }, + { + "epoch": 0.28, + "grad_norm": 1.5643677437331445, + "learning_rate": 8.461206280974579e-06, + "loss": 0.3477, + "step": 9617 + }, + { + "epoch": 0.28, + "grad_norm": 1.4478903350442955, + "learning_rate": 8.46086729358666e-06, + "loss": 0.3521, + "step": 9618 + }, + { + "epoch": 0.28, + "grad_norm": 1.451135847507794, + "learning_rate": 8.460528275656549e-06, + "loss": 0.331, + "step": 9619 + }, + { + "epoch": 0.28, + "grad_norm": 1.5077606960648537, + "learning_rate": 8.460189227187237e-06, + "loss": 0.3442, + "step": 9620 + }, + { + "epoch": 0.28, + "grad_norm": 1.3838223517235213, + "learning_rate": 8.459850148181716e-06, + "loss": 0.3563, + "step": 9621 + }, + { + "epoch": 0.28, + "grad_norm": 1.3914611653028204, + "learning_rate": 8.45951103864298e-06, + "loss": 0.3387, + "step": 9622 + }, + { + "epoch": 0.28, + "grad_norm": 1.5032710109383363, + "learning_rate": 8.45917189857402e-06, + "loss": 0.3791, + "step": 9623 + }, + { + "epoch": 0.28, + "grad_norm": 1.2728864597214906, + "learning_rate": 8.458832727977832e-06, + "loss": 0.36, + "step": 9624 + }, + { + "epoch": 0.28, + "grad_norm": 1.3566249743988381, + "learning_rate": 8.458493526857405e-06, + "loss": 0.3445, + "step": 9625 + }, + { + "epoch": 0.28, + "grad_norm": 1.335873381752978, + "learning_rate": 8.458154295215735e-06, + "loss": 0.3379, + "step": 9626 + }, + { + "epoch": 0.28, + "grad_norm": 1.2682576576047497, + "learning_rate": 8.457815033055815e-06, + "loss": 0.3487, + "step": 9627 + }, + { + "epoch": 0.28, + "grad_norm": 2.073272301244019, + "learning_rate": 8.457475740380638e-06, + "loss": 0.3718, + "step": 9628 + }, + { + "epoch": 0.28, + "grad_norm": 1.3733756969329687, + "learning_rate": 8.457136417193199e-06, + "loss": 0.3148, + "step": 9629 + }, + { + "epoch": 0.28, + "grad_norm": 1.3655935966051624, + "learning_rate": 8.456797063496494e-06, + "loss": 0.3326, + "step": 9630 + }, + { + "epoch": 0.28, + "grad_norm": 1.4679124380606128, + "learning_rate": 8.456457679293515e-06, + "loss": 0.349, + "step": 9631 + }, + { + "epoch": 0.28, + "grad_norm": 1.401197605107488, + "learning_rate": 8.456118264587261e-06, + "loss": 0.3391, + "step": 9632 + }, + { + "epoch": 0.28, + "grad_norm": 1.7079766483104204, + "learning_rate": 8.455778819380723e-06, + "loss": 0.3428, + "step": 9633 + }, + { + "epoch": 0.28, + "grad_norm": 1.5566320913599683, + "learning_rate": 8.455439343676898e-06, + "loss": 0.4024, + "step": 9634 + }, + { + "epoch": 0.28, + "grad_norm": 1.7433222591583333, + "learning_rate": 8.455099837478784e-06, + "loss": 0.3685, + "step": 9635 + }, + { + "epoch": 0.28, + "grad_norm": 1.574543008910797, + "learning_rate": 8.454760300789375e-06, + "loss": 0.3705, + "step": 9636 + }, + { + "epoch": 0.28, + "grad_norm": 1.6847537949744216, + "learning_rate": 8.45442073361167e-06, + "loss": 0.3484, + "step": 9637 + }, + { + "epoch": 0.28, + "grad_norm": 0.9315911257677872, + "learning_rate": 8.454081135948662e-06, + "loss": 0.6426, + "step": 9638 + }, + { + "epoch": 0.28, + "grad_norm": 1.43936431883957, + "learning_rate": 8.453741507803347e-06, + "loss": 0.3467, + "step": 9639 + }, + { + "epoch": 0.28, + "grad_norm": 1.500756947614975, + "learning_rate": 8.453401849178728e-06, + "loss": 0.3602, + "step": 9640 + }, + { + "epoch": 0.28, + "grad_norm": 2.0468905220740297, + "learning_rate": 8.4530621600778e-06, + "loss": 0.3494, + "step": 9641 + }, + { + "epoch": 0.28, + "grad_norm": 1.368385216451541, + "learning_rate": 8.45272244050356e-06, + "loss": 0.3451, + "step": 9642 + }, + { + "epoch": 0.28, + "grad_norm": 1.4612618349957587, + "learning_rate": 8.452382690459005e-06, + "loss": 0.3631, + "step": 9643 + }, + { + "epoch": 0.28, + "grad_norm": 1.7080843511057324, + "learning_rate": 8.452042909947134e-06, + "loss": 0.3428, + "step": 9644 + }, + { + "epoch": 0.28, + "grad_norm": 1.8389713902050824, + "learning_rate": 8.451703098970947e-06, + "loss": 0.3561, + "step": 9645 + }, + { + "epoch": 0.28, + "grad_norm": 1.7688148155197292, + "learning_rate": 8.451363257533441e-06, + "loss": 0.3412, + "step": 9646 + }, + { + "epoch": 0.28, + "grad_norm": 1.450147932720668, + "learning_rate": 8.451023385637616e-06, + "loss": 0.3617, + "step": 9647 + }, + { + "epoch": 0.28, + "grad_norm": 1.4037564291309637, + "learning_rate": 8.450683483286473e-06, + "loss": 0.3504, + "step": 9648 + }, + { + "epoch": 0.28, + "grad_norm": 2.3059580590544293, + "learning_rate": 8.45034355048301e-06, + "loss": 0.4163, + "step": 9649 + }, + { + "epoch": 0.28, + "grad_norm": 1.348801399856412, + "learning_rate": 8.450003587230224e-06, + "loss": 0.3366, + "step": 9650 + }, + { + "epoch": 0.28, + "grad_norm": 2.063703295312434, + "learning_rate": 8.44966359353112e-06, + "loss": 0.352, + "step": 9651 + }, + { + "epoch": 0.28, + "grad_norm": 1.2750078149415989, + "learning_rate": 8.449323569388695e-06, + "loss": 0.3264, + "step": 9652 + }, + { + "epoch": 0.28, + "grad_norm": 1.50367480060332, + "learning_rate": 8.448983514805953e-06, + "loss": 0.3739, + "step": 9653 + }, + { + "epoch": 0.28, + "grad_norm": 1.2815082350373725, + "learning_rate": 8.448643429785891e-06, + "loss": 0.3201, + "step": 9654 + }, + { + "epoch": 0.28, + "grad_norm": 1.529236763601528, + "learning_rate": 8.448303314331516e-06, + "loss": 0.3689, + "step": 9655 + }, + { + "epoch": 0.28, + "grad_norm": 1.5525786256257736, + "learning_rate": 8.447963168445825e-06, + "loss": 0.3366, + "step": 9656 + }, + { + "epoch": 0.28, + "grad_norm": 1.6285232434144576, + "learning_rate": 8.44762299213182e-06, + "loss": 0.3572, + "step": 9657 + }, + { + "epoch": 0.28, + "grad_norm": 1.6040222775783184, + "learning_rate": 8.447282785392503e-06, + "loss": 0.355, + "step": 9658 + }, + { + "epoch": 0.28, + "grad_norm": 1.4350936247804693, + "learning_rate": 8.44694254823088e-06, + "loss": 0.3525, + "step": 9659 + }, + { + "epoch": 0.28, + "grad_norm": 1.4739329811768636, + "learning_rate": 8.446602280649947e-06, + "loss": 0.3506, + "step": 9660 + }, + { + "epoch": 0.28, + "grad_norm": 1.6478250335704037, + "learning_rate": 8.446261982652713e-06, + "loss": 0.3496, + "step": 9661 + }, + { + "epoch": 0.28, + "grad_norm": 2.715931066939792, + "learning_rate": 8.445921654242178e-06, + "loss": 0.3792, + "step": 9662 + }, + { + "epoch": 0.28, + "grad_norm": 1.4517948322898144, + "learning_rate": 8.445581295421345e-06, + "loss": 0.3425, + "step": 9663 + }, + { + "epoch": 0.28, + "grad_norm": 1.3202868615678194, + "learning_rate": 8.445240906193221e-06, + "loss": 0.3756, + "step": 9664 + }, + { + "epoch": 0.28, + "grad_norm": 1.7090449300852266, + "learning_rate": 8.444900486560808e-06, + "loss": 0.3504, + "step": 9665 + }, + { + "epoch": 0.28, + "grad_norm": 1.806164884821909, + "learning_rate": 8.44456003652711e-06, + "loss": 0.3779, + "step": 9666 + }, + { + "epoch": 0.28, + "grad_norm": 1.4388293372620975, + "learning_rate": 8.44421955609513e-06, + "loss": 0.3501, + "step": 9667 + }, + { + "epoch": 0.28, + "grad_norm": 1.636717868938574, + "learning_rate": 8.443879045267873e-06, + "loss": 0.3822, + "step": 9668 + }, + { + "epoch": 0.28, + "grad_norm": 1.4206193507236757, + "learning_rate": 8.443538504048348e-06, + "loss": 0.3376, + "step": 9669 + }, + { + "epoch": 0.28, + "grad_norm": 1.447040930152595, + "learning_rate": 8.443197932439556e-06, + "loss": 0.3446, + "step": 9670 + }, + { + "epoch": 0.28, + "grad_norm": 1.391873805347286, + "learning_rate": 8.442857330444504e-06, + "loss": 0.3744, + "step": 9671 + }, + { + "epoch": 0.28, + "grad_norm": 1.5891785835885495, + "learning_rate": 8.4425166980662e-06, + "loss": 0.3605, + "step": 9672 + }, + { + "epoch": 0.28, + "grad_norm": 1.3505124395203885, + "learning_rate": 8.442176035307645e-06, + "loss": 0.3226, + "step": 9673 + }, + { + "epoch": 0.28, + "grad_norm": 1.4451178187917284, + "learning_rate": 8.44183534217185e-06, + "loss": 0.3376, + "step": 9674 + }, + { + "epoch": 0.28, + "grad_norm": 1.2518516682242702, + "learning_rate": 8.441494618661818e-06, + "loss": 0.3397, + "step": 9675 + }, + { + "epoch": 0.28, + "grad_norm": 1.7683874432908724, + "learning_rate": 8.44115386478056e-06, + "loss": 0.3666, + "step": 9676 + }, + { + "epoch": 0.28, + "grad_norm": 1.762220542734764, + "learning_rate": 8.440813080531081e-06, + "loss": 0.3496, + "step": 9677 + }, + { + "epoch": 0.28, + "grad_norm": 1.3092854423488287, + "learning_rate": 8.440472265916387e-06, + "loss": 0.3257, + "step": 9678 + }, + { + "epoch": 0.28, + "grad_norm": 1.5236370010162452, + "learning_rate": 8.440131420939488e-06, + "loss": 0.3571, + "step": 9679 + }, + { + "epoch": 0.28, + "grad_norm": 2.346297195663122, + "learning_rate": 8.43979054560339e-06, + "loss": 0.3749, + "step": 9680 + }, + { + "epoch": 0.28, + "grad_norm": 1.4491873516815266, + "learning_rate": 8.439449639911104e-06, + "loss": 0.3707, + "step": 9681 + }, + { + "epoch": 0.28, + "grad_norm": 1.5898427071810544, + "learning_rate": 8.439108703865635e-06, + "loss": 0.3839, + "step": 9682 + }, + { + "epoch": 0.28, + "grad_norm": 1.4591613664025576, + "learning_rate": 8.438767737469995e-06, + "loss": 0.3497, + "step": 9683 + }, + { + "epoch": 0.28, + "grad_norm": 1.5777855569699768, + "learning_rate": 8.438426740727192e-06, + "loss": 0.334, + "step": 9684 + }, + { + "epoch": 0.28, + "grad_norm": 1.496640784561779, + "learning_rate": 8.438085713640234e-06, + "loss": 0.3778, + "step": 9685 + }, + { + "epoch": 0.28, + "grad_norm": 1.6743659585278752, + "learning_rate": 8.43774465621213e-06, + "loss": 0.3462, + "step": 9686 + }, + { + "epoch": 0.28, + "grad_norm": 1.4130391463644698, + "learning_rate": 8.437403568445893e-06, + "loss": 0.3472, + "step": 9687 + }, + { + "epoch": 0.28, + "grad_norm": 1.4739302261290157, + "learning_rate": 8.437062450344532e-06, + "loss": 0.3596, + "step": 9688 + }, + { + "epoch": 0.28, + "grad_norm": 1.4458470268088857, + "learning_rate": 8.436721301911054e-06, + "loss": 0.3761, + "step": 9689 + }, + { + "epoch": 0.28, + "grad_norm": 1.4616593027465679, + "learning_rate": 8.436380123148475e-06, + "loss": 0.3587, + "step": 9690 + }, + { + "epoch": 0.28, + "grad_norm": 1.5017623015314074, + "learning_rate": 8.4360389140598e-06, + "loss": 0.3443, + "step": 9691 + }, + { + "epoch": 0.28, + "grad_norm": 1.463737234555126, + "learning_rate": 8.435697674648047e-06, + "loss": 0.3739, + "step": 9692 + }, + { + "epoch": 0.28, + "grad_norm": 1.4428076583717677, + "learning_rate": 8.435356404916223e-06, + "loss": 0.3696, + "step": 9693 + }, + { + "epoch": 0.28, + "grad_norm": 1.9994703599141936, + "learning_rate": 8.43501510486734e-06, + "loss": 0.374, + "step": 9694 + }, + { + "epoch": 0.28, + "grad_norm": 1.5888273331009692, + "learning_rate": 8.434673774504411e-06, + "loss": 0.3342, + "step": 9695 + }, + { + "epoch": 0.28, + "grad_norm": 1.3768825751632818, + "learning_rate": 8.434332413830448e-06, + "loss": 0.3733, + "step": 9696 + }, + { + "epoch": 0.28, + "grad_norm": 1.391727838132, + "learning_rate": 8.433991022848463e-06, + "loss": 0.3481, + "step": 9697 + }, + { + "epoch": 0.28, + "grad_norm": 1.4306560204966945, + "learning_rate": 8.433649601561468e-06, + "loss": 0.3546, + "step": 9698 + }, + { + "epoch": 0.28, + "grad_norm": 1.5873219821342517, + "learning_rate": 8.43330814997248e-06, + "loss": 0.4036, + "step": 9699 + }, + { + "epoch": 0.28, + "grad_norm": 1.3906991087623959, + "learning_rate": 8.432966668084509e-06, + "loss": 0.3292, + "step": 9700 + }, + { + "epoch": 0.28, + "grad_norm": 1.3963835554306767, + "learning_rate": 8.432625155900567e-06, + "loss": 0.3306, + "step": 9701 + }, + { + "epoch": 0.28, + "grad_norm": 0.9917673820865346, + "learning_rate": 8.43228361342367e-06, + "loss": 0.5876, + "step": 9702 + }, + { + "epoch": 0.28, + "grad_norm": 1.8240786262822262, + "learning_rate": 8.431942040656832e-06, + "loss": 0.3539, + "step": 9703 + }, + { + "epoch": 0.28, + "grad_norm": 1.4276231046458274, + "learning_rate": 8.43160043760307e-06, + "loss": 0.3728, + "step": 9704 + }, + { + "epoch": 0.28, + "grad_norm": 1.408594564395536, + "learning_rate": 8.431258804265395e-06, + "loss": 0.3452, + "step": 9705 + }, + { + "epoch": 0.28, + "grad_norm": 1.39154223947261, + "learning_rate": 8.430917140646821e-06, + "loss": 0.3582, + "step": 9706 + }, + { + "epoch": 0.28, + "grad_norm": 1.5400483361054826, + "learning_rate": 8.430575446750366e-06, + "loss": 0.3561, + "step": 9707 + }, + { + "epoch": 0.28, + "grad_norm": 1.6620477417297346, + "learning_rate": 8.430233722579046e-06, + "loss": 0.3635, + "step": 9708 + }, + { + "epoch": 0.28, + "grad_norm": 1.447677480419156, + "learning_rate": 8.429891968135874e-06, + "loss": 0.3552, + "step": 9709 + }, + { + "epoch": 0.28, + "grad_norm": 1.2824213856846456, + "learning_rate": 8.42955018342387e-06, + "loss": 0.3432, + "step": 9710 + }, + { + "epoch": 0.28, + "grad_norm": 1.4294267943236336, + "learning_rate": 8.429208368446043e-06, + "loss": 0.3655, + "step": 9711 + }, + { + "epoch": 0.28, + "grad_norm": 1.5175034294324194, + "learning_rate": 8.428866523205418e-06, + "loss": 0.3436, + "step": 9712 + }, + { + "epoch": 0.28, + "grad_norm": 1.5885944057384067, + "learning_rate": 8.428524647705007e-06, + "loss": 0.3598, + "step": 9713 + }, + { + "epoch": 0.28, + "grad_norm": 1.3934710491502351, + "learning_rate": 8.428182741947826e-06, + "loss": 0.3485, + "step": 9714 + }, + { + "epoch": 0.28, + "grad_norm": 1.4263546977389283, + "learning_rate": 8.427840805936896e-06, + "loss": 0.3529, + "step": 9715 + }, + { + "epoch": 0.28, + "grad_norm": 0.9523872132196809, + "learning_rate": 8.427498839675233e-06, + "loss": 0.6119, + "step": 9716 + }, + { + "epoch": 0.28, + "grad_norm": 1.5771832945546886, + "learning_rate": 8.427156843165853e-06, + "loss": 0.3981, + "step": 9717 + }, + { + "epoch": 0.28, + "grad_norm": 1.4239401295019165, + "learning_rate": 8.426814816411778e-06, + "loss": 0.3445, + "step": 9718 + }, + { + "epoch": 0.28, + "grad_norm": 1.3623317455747808, + "learning_rate": 8.426472759416023e-06, + "loss": 0.3389, + "step": 9719 + }, + { + "epoch": 0.28, + "grad_norm": 1.458426113389765, + "learning_rate": 8.426130672181609e-06, + "loss": 0.3513, + "step": 9720 + }, + { + "epoch": 0.28, + "grad_norm": 1.4500432351421069, + "learning_rate": 8.425788554711552e-06, + "loss": 0.3609, + "step": 9721 + }, + { + "epoch": 0.28, + "grad_norm": 1.6787844036160828, + "learning_rate": 8.425446407008875e-06, + "loss": 0.3574, + "step": 9722 + }, + { + "epoch": 0.28, + "grad_norm": 2.0784945389452423, + "learning_rate": 8.425104229076594e-06, + "loss": 0.3673, + "step": 9723 + }, + { + "epoch": 0.28, + "grad_norm": 1.329403644165342, + "learning_rate": 8.42476202091773e-06, + "loss": 0.3834, + "step": 9724 + }, + { + "epoch": 0.28, + "grad_norm": 1.338363974510636, + "learning_rate": 8.424419782535305e-06, + "loss": 0.3296, + "step": 9725 + }, + { + "epoch": 0.28, + "grad_norm": 1.3936667112570666, + "learning_rate": 8.424077513932336e-06, + "loss": 0.3575, + "step": 9726 + }, + { + "epoch": 0.28, + "grad_norm": 1.982092015425559, + "learning_rate": 8.423735215111846e-06, + "loss": 0.351, + "step": 9727 + }, + { + "epoch": 0.28, + "grad_norm": 1.6870158996650702, + "learning_rate": 8.423392886076852e-06, + "loss": 0.3446, + "step": 9728 + }, + { + "epoch": 0.28, + "grad_norm": 1.4874138610262162, + "learning_rate": 8.42305052683038e-06, + "loss": 0.348, + "step": 9729 + }, + { + "epoch": 0.28, + "grad_norm": 1.4434230936080237, + "learning_rate": 8.422708137375449e-06, + "loss": 0.41, + "step": 9730 + }, + { + "epoch": 0.28, + "grad_norm": 1.5603526239351853, + "learning_rate": 8.422365717715082e-06, + "loss": 0.3795, + "step": 9731 + }, + { + "epoch": 0.28, + "grad_norm": 1.2873815846448706, + "learning_rate": 8.422023267852297e-06, + "loss": 0.3689, + "step": 9732 + }, + { + "epoch": 0.28, + "grad_norm": 1.4044860320341133, + "learning_rate": 8.421680787790121e-06, + "loss": 0.3667, + "step": 9733 + }, + { + "epoch": 0.28, + "grad_norm": 1.28869661567896, + "learning_rate": 8.421338277531572e-06, + "loss": 0.3431, + "step": 9734 + }, + { + "epoch": 0.28, + "grad_norm": 1.3039453857479801, + "learning_rate": 8.420995737079676e-06, + "loss": 0.3609, + "step": 9735 + }, + { + "epoch": 0.28, + "grad_norm": 1.4325244410745044, + "learning_rate": 8.420653166437453e-06, + "loss": 0.3421, + "step": 9736 + }, + { + "epoch": 0.28, + "grad_norm": 1.598653204878738, + "learning_rate": 8.42031056560793e-06, + "loss": 0.3707, + "step": 9737 + }, + { + "epoch": 0.28, + "grad_norm": 1.2764979388094335, + "learning_rate": 8.419967934594128e-06, + "loss": 0.3386, + "step": 9738 + }, + { + "epoch": 0.28, + "grad_norm": 1.4687364624746222, + "learning_rate": 8.41962527339907e-06, + "loss": 0.3167, + "step": 9739 + }, + { + "epoch": 0.28, + "grad_norm": 1.6098152785789765, + "learning_rate": 8.419282582025783e-06, + "loss": 0.3504, + "step": 9740 + }, + { + "epoch": 0.28, + "grad_norm": 1.4292131775571864, + "learning_rate": 8.418939860477288e-06, + "loss": 0.3288, + "step": 9741 + }, + { + "epoch": 0.28, + "grad_norm": 1.4313745697330251, + "learning_rate": 8.418597108756609e-06, + "loss": 0.3502, + "step": 9742 + }, + { + "epoch": 0.28, + "grad_norm": 1.5120511915234824, + "learning_rate": 8.418254326866775e-06, + "loss": 0.3469, + "step": 9743 + }, + { + "epoch": 0.28, + "grad_norm": 1.4006681849834959, + "learning_rate": 8.417911514810807e-06, + "loss": 0.3653, + "step": 9744 + }, + { + "epoch": 0.28, + "grad_norm": 1.446025124871277, + "learning_rate": 8.417568672591734e-06, + "loss": 0.3679, + "step": 9745 + }, + { + "epoch": 0.28, + "grad_norm": 2.1790525007764723, + "learning_rate": 8.417225800212579e-06, + "loss": 0.3518, + "step": 9746 + }, + { + "epoch": 0.28, + "grad_norm": 1.716133347486243, + "learning_rate": 8.416882897676368e-06, + "loss": 0.3653, + "step": 9747 + }, + { + "epoch": 0.28, + "grad_norm": 1.513228231990273, + "learning_rate": 8.416539964986127e-06, + "loss": 0.3649, + "step": 9748 + }, + { + "epoch": 0.28, + "grad_norm": 1.432158425685002, + "learning_rate": 8.416197002144883e-06, + "loss": 0.3625, + "step": 9749 + }, + { + "epoch": 0.28, + "grad_norm": 1.9936925834377743, + "learning_rate": 8.415854009155663e-06, + "loss": 0.3682, + "step": 9750 + }, + { + "epoch": 0.28, + "grad_norm": 0.996330122571919, + "learning_rate": 8.415510986021493e-06, + "loss": 0.6136, + "step": 9751 + }, + { + "epoch": 0.28, + "grad_norm": 2.036189222918477, + "learning_rate": 8.4151679327454e-06, + "loss": 0.3581, + "step": 9752 + }, + { + "epoch": 0.28, + "grad_norm": 1.4488266240015284, + "learning_rate": 8.414824849330414e-06, + "loss": 0.3546, + "step": 9753 + }, + { + "epoch": 0.28, + "grad_norm": 1.6506436146610626, + "learning_rate": 8.41448173577956e-06, + "loss": 0.3534, + "step": 9754 + }, + { + "epoch": 0.28, + "grad_norm": 0.9979899252396037, + "learning_rate": 8.414138592095868e-06, + "loss": 0.5796, + "step": 9755 + }, + { + "epoch": 0.28, + "grad_norm": 1.7258627848031536, + "learning_rate": 8.413795418282363e-06, + "loss": 0.3767, + "step": 9756 + }, + { + "epoch": 0.28, + "grad_norm": 1.7946223939287285, + "learning_rate": 8.413452214342077e-06, + "loss": 0.3443, + "step": 9757 + }, + { + "epoch": 0.28, + "grad_norm": 1.45500613543854, + "learning_rate": 8.413108980278036e-06, + "loss": 0.344, + "step": 9758 + }, + { + "epoch": 0.28, + "grad_norm": 1.4612404603366118, + "learning_rate": 8.412765716093273e-06, + "loss": 0.3853, + "step": 9759 + }, + { + "epoch": 0.28, + "grad_norm": 1.5384460626538654, + "learning_rate": 8.412422421790811e-06, + "loss": 0.3769, + "step": 9760 + }, + { + "epoch": 0.28, + "grad_norm": 1.3395677340107162, + "learning_rate": 8.412079097373684e-06, + "loss": 0.3185, + "step": 9761 + }, + { + "epoch": 0.28, + "grad_norm": 1.5564202283112645, + "learning_rate": 8.411735742844924e-06, + "loss": 0.3926, + "step": 9762 + }, + { + "epoch": 0.28, + "grad_norm": 1.3376831660206359, + "learning_rate": 8.411392358207555e-06, + "loss": 0.3447, + "step": 9763 + }, + { + "epoch": 0.28, + "grad_norm": 0.9878582450893282, + "learning_rate": 8.411048943464612e-06, + "loss": 0.6151, + "step": 9764 + }, + { + "epoch": 0.28, + "grad_norm": 0.9135509748838977, + "learning_rate": 8.410705498619125e-06, + "loss": 0.5943, + "step": 9765 + }, + { + "epoch": 0.28, + "grad_norm": 1.7117304983128747, + "learning_rate": 8.410362023674123e-06, + "loss": 0.3563, + "step": 9766 + }, + { + "epoch": 0.28, + "grad_norm": 1.3772961739304386, + "learning_rate": 8.41001851863264e-06, + "loss": 0.3968, + "step": 9767 + }, + { + "epoch": 0.28, + "grad_norm": 1.5236828325465341, + "learning_rate": 8.409674983497704e-06, + "loss": 0.382, + "step": 9768 + }, + { + "epoch": 0.28, + "grad_norm": 2.525379317848244, + "learning_rate": 8.409331418272348e-06, + "loss": 0.4015, + "step": 9769 + }, + { + "epoch": 0.28, + "grad_norm": 1.4551080879773586, + "learning_rate": 8.408987822959605e-06, + "loss": 0.3461, + "step": 9770 + }, + { + "epoch": 0.28, + "grad_norm": 1.3635615515924113, + "learning_rate": 8.408644197562508e-06, + "loss": 0.3682, + "step": 9771 + }, + { + "epoch": 0.28, + "grad_norm": 1.5610293949592384, + "learning_rate": 8.408300542084087e-06, + "loss": 0.3858, + "step": 9772 + }, + { + "epoch": 0.28, + "grad_norm": 1.5928345014882834, + "learning_rate": 8.407956856527375e-06, + "loss": 0.3586, + "step": 9773 + }, + { + "epoch": 0.28, + "grad_norm": 1.4739851863768691, + "learning_rate": 8.407613140895407e-06, + "loss": 0.3373, + "step": 9774 + }, + { + "epoch": 0.28, + "grad_norm": 2.224375011539203, + "learning_rate": 8.407269395191216e-06, + "loss": 0.3659, + "step": 9775 + }, + { + "epoch": 0.28, + "grad_norm": 1.5939671895662946, + "learning_rate": 8.406925619417834e-06, + "loss": 0.3664, + "step": 9776 + }, + { + "epoch": 0.28, + "grad_norm": 1.5663671712015252, + "learning_rate": 8.406581813578293e-06, + "loss": 0.3461, + "step": 9777 + }, + { + "epoch": 0.28, + "grad_norm": 1.3876353053362473, + "learning_rate": 8.406237977675633e-06, + "loss": 0.3797, + "step": 9778 + }, + { + "epoch": 0.28, + "grad_norm": 1.5346941163714094, + "learning_rate": 8.405894111712884e-06, + "loss": 0.3368, + "step": 9779 + }, + { + "epoch": 0.28, + "grad_norm": 1.435114457789715, + "learning_rate": 8.405550215693082e-06, + "loss": 0.3504, + "step": 9780 + }, + { + "epoch": 0.28, + "grad_norm": 1.5927034817639347, + "learning_rate": 8.405206289619262e-06, + "loss": 0.3764, + "step": 9781 + }, + { + "epoch": 0.28, + "grad_norm": 2.299284708745121, + "learning_rate": 8.404862333494458e-06, + "loss": 0.3587, + "step": 9782 + }, + { + "epoch": 0.28, + "grad_norm": 1.4711263435474433, + "learning_rate": 8.404518347321708e-06, + "loss": 0.3228, + "step": 9783 + }, + { + "epoch": 0.28, + "grad_norm": 2.117899881167056, + "learning_rate": 8.404174331104043e-06, + "loss": 0.3658, + "step": 9784 + }, + { + "epoch": 0.28, + "grad_norm": 1.549368685985712, + "learning_rate": 8.403830284844503e-06, + "loss": 0.3563, + "step": 9785 + }, + { + "epoch": 0.28, + "grad_norm": 1.5019157361783149, + "learning_rate": 8.403486208546124e-06, + "loss": 0.3578, + "step": 9786 + }, + { + "epoch": 0.28, + "grad_norm": 1.5852689586284043, + "learning_rate": 8.40314210221194e-06, + "loss": 0.3812, + "step": 9787 + }, + { + "epoch": 0.28, + "grad_norm": 1.5760066187570536, + "learning_rate": 8.402797965844989e-06, + "loss": 0.352, + "step": 9788 + }, + { + "epoch": 0.28, + "grad_norm": 1.5339753065578263, + "learning_rate": 8.40245379944831e-06, + "loss": 0.3644, + "step": 9789 + }, + { + "epoch": 0.28, + "grad_norm": 1.5415091275946367, + "learning_rate": 8.402109603024937e-06, + "loss": 0.3417, + "step": 9790 + }, + { + "epoch": 0.28, + "grad_norm": 1.2747021188553518, + "learning_rate": 8.401765376577908e-06, + "loss": 0.3481, + "step": 9791 + }, + { + "epoch": 0.28, + "grad_norm": 1.4927252219400176, + "learning_rate": 8.401421120110264e-06, + "loss": 0.3574, + "step": 9792 + }, + { + "epoch": 0.28, + "grad_norm": 1.2763295492196431, + "learning_rate": 8.40107683362504e-06, + "loss": 0.3499, + "step": 9793 + }, + { + "epoch": 0.28, + "grad_norm": 1.3044768834538405, + "learning_rate": 8.400732517125274e-06, + "loss": 0.3584, + "step": 9794 + }, + { + "epoch": 0.28, + "grad_norm": 1.319740204299502, + "learning_rate": 8.400388170614008e-06, + "loss": 0.3433, + "step": 9795 + }, + { + "epoch": 0.28, + "grad_norm": 1.5359704135899335, + "learning_rate": 8.400043794094278e-06, + "loss": 0.3514, + "step": 9796 + }, + { + "epoch": 0.28, + "grad_norm": 1.37680204714527, + "learning_rate": 8.399699387569123e-06, + "loss": 0.3497, + "step": 9797 + }, + { + "epoch": 0.28, + "grad_norm": 1.490733404557511, + "learning_rate": 8.399354951041584e-06, + "loss": 0.3545, + "step": 9798 + }, + { + "epoch": 0.28, + "grad_norm": 1.286596584722024, + "learning_rate": 8.3990104845147e-06, + "loss": 0.3463, + "step": 9799 + }, + { + "epoch": 0.28, + "grad_norm": 1.2970007938025363, + "learning_rate": 8.39866598799151e-06, + "loss": 0.3486, + "step": 9800 + }, + { + "epoch": 0.28, + "grad_norm": 1.7354913891883927, + "learning_rate": 8.398321461475054e-06, + "loss": 0.3404, + "step": 9801 + }, + { + "epoch": 0.28, + "grad_norm": 1.8065374965600502, + "learning_rate": 8.397976904968375e-06, + "loss": 0.3627, + "step": 9802 + }, + { + "epoch": 0.28, + "grad_norm": 1.1244226839094018, + "learning_rate": 8.397632318474512e-06, + "loss": 0.6347, + "step": 9803 + }, + { + "epoch": 0.28, + "grad_norm": 1.114666998891092, + "learning_rate": 8.397287701996507e-06, + "loss": 0.6258, + "step": 9804 + }, + { + "epoch": 0.28, + "grad_norm": 2.025283537966071, + "learning_rate": 8.3969430555374e-06, + "loss": 0.391, + "step": 9805 + }, + { + "epoch": 0.28, + "grad_norm": 1.486810596230682, + "learning_rate": 8.39659837910023e-06, + "loss": 0.3399, + "step": 9806 + }, + { + "epoch": 0.28, + "grad_norm": 1.5035726946137415, + "learning_rate": 8.396253672688043e-06, + "loss": 0.3761, + "step": 9807 + }, + { + "epoch": 0.28, + "grad_norm": 2.0083496023414034, + "learning_rate": 8.395908936303881e-06, + "loss": 0.3541, + "step": 9808 + }, + { + "epoch": 0.28, + "grad_norm": 1.955050308188439, + "learning_rate": 8.395564169950786e-06, + "loss": 0.3472, + "step": 9809 + }, + { + "epoch": 0.28, + "grad_norm": 1.6869008815112239, + "learning_rate": 8.395219373631795e-06, + "loss": 0.3547, + "step": 9810 + }, + { + "epoch": 0.28, + "grad_norm": 1.5353928600622986, + "learning_rate": 8.39487454734996e-06, + "loss": 0.3503, + "step": 9811 + }, + { + "epoch": 0.28, + "grad_norm": 1.63547841705816, + "learning_rate": 8.394529691108317e-06, + "loss": 0.3521, + "step": 9812 + }, + { + "epoch": 0.28, + "grad_norm": 1.4479587981944377, + "learning_rate": 8.394184804909914e-06, + "loss": 0.36, + "step": 9813 + }, + { + "epoch": 0.28, + "grad_norm": 1.6188684987209516, + "learning_rate": 8.39383988875779e-06, + "loss": 0.4009, + "step": 9814 + }, + { + "epoch": 0.28, + "grad_norm": 1.8322437997616268, + "learning_rate": 8.393494942654992e-06, + "loss": 0.3433, + "step": 9815 + }, + { + "epoch": 0.28, + "grad_norm": 1.3334835577658217, + "learning_rate": 8.393149966604563e-06, + "loss": 0.3554, + "step": 9816 + }, + { + "epoch": 0.28, + "grad_norm": 1.3452389372538418, + "learning_rate": 8.39280496060955e-06, + "loss": 0.5692, + "step": 9817 + }, + { + "epoch": 0.28, + "grad_norm": 1.3057096262497874, + "learning_rate": 8.392459924672993e-06, + "loss": 0.3467, + "step": 9818 + }, + { + "epoch": 0.28, + "grad_norm": 1.5348912392980973, + "learning_rate": 8.392114858797942e-06, + "loss": 0.3622, + "step": 9819 + }, + { + "epoch": 0.28, + "grad_norm": 1.4303579034766698, + "learning_rate": 8.391769762987438e-06, + "loss": 0.3317, + "step": 9820 + }, + { + "epoch": 0.28, + "grad_norm": 1.647401509741087, + "learning_rate": 8.391424637244528e-06, + "loss": 0.3841, + "step": 9821 + }, + { + "epoch": 0.28, + "grad_norm": 1.2364881566367307, + "learning_rate": 8.39107948157226e-06, + "loss": 0.3441, + "step": 9822 + }, + { + "epoch": 0.28, + "grad_norm": 3.51412109153112, + "learning_rate": 8.390734295973675e-06, + "loss": 0.3529, + "step": 9823 + }, + { + "epoch": 0.28, + "grad_norm": 1.3434005975324441, + "learning_rate": 8.390389080451825e-06, + "loss": 0.3405, + "step": 9824 + }, + { + "epoch": 0.28, + "grad_norm": 1.3826733514933698, + "learning_rate": 8.390043835009752e-06, + "loss": 0.3409, + "step": 9825 + }, + { + "epoch": 0.29, + "grad_norm": 1.3650976573591562, + "learning_rate": 8.389698559650506e-06, + "loss": 0.3801, + "step": 9826 + }, + { + "epoch": 0.29, + "grad_norm": 1.4640972165929647, + "learning_rate": 8.389353254377131e-06, + "loss": 0.3513, + "step": 9827 + }, + { + "epoch": 0.29, + "grad_norm": 1.3937936112056493, + "learning_rate": 8.389007919192676e-06, + "loss": 0.3613, + "step": 9828 + }, + { + "epoch": 0.29, + "grad_norm": 1.3943438158106742, + "learning_rate": 8.388662554100191e-06, + "loss": 0.3556, + "step": 9829 + }, + { + "epoch": 0.29, + "grad_norm": 1.3873608658749113, + "learning_rate": 8.388317159102719e-06, + "loss": 0.3791, + "step": 9830 + }, + { + "epoch": 0.29, + "grad_norm": 1.4687066562436617, + "learning_rate": 8.387971734203311e-06, + "loss": 0.3593, + "step": 9831 + }, + { + "epoch": 0.29, + "grad_norm": 1.3609944808284298, + "learning_rate": 8.387626279405014e-06, + "loss": 0.3911, + "step": 9832 + }, + { + "epoch": 0.29, + "grad_norm": 1.607874649008231, + "learning_rate": 8.387280794710879e-06, + "loss": 0.3473, + "step": 9833 + }, + { + "epoch": 0.29, + "grad_norm": 1.2904487855060272, + "learning_rate": 8.386935280123952e-06, + "loss": 0.3484, + "step": 9834 + }, + { + "epoch": 0.29, + "grad_norm": 1.5081648275513895, + "learning_rate": 8.386589735647287e-06, + "loss": 0.412, + "step": 9835 + }, + { + "epoch": 0.29, + "grad_norm": 1.6541141742343328, + "learning_rate": 8.386244161283924e-06, + "loss": 0.3443, + "step": 9836 + }, + { + "epoch": 0.29, + "grad_norm": 1.5624911802774393, + "learning_rate": 8.385898557036923e-06, + "loss": 0.3486, + "step": 9837 + }, + { + "epoch": 0.29, + "grad_norm": 1.5442738626619033, + "learning_rate": 8.385552922909329e-06, + "loss": 0.3414, + "step": 9838 + }, + { + "epoch": 0.29, + "grad_norm": 1.4564361809322968, + "learning_rate": 8.385207258904194e-06, + "loss": 0.352, + "step": 9839 + }, + { + "epoch": 0.29, + "grad_norm": 1.3587613569168027, + "learning_rate": 8.384861565024564e-06, + "loss": 0.3392, + "step": 9840 + }, + { + "epoch": 0.29, + "grad_norm": 2.186951608037063, + "learning_rate": 8.384515841273498e-06, + "loss": 0.347, + "step": 9841 + }, + { + "epoch": 0.29, + "grad_norm": 1.676234168355597, + "learning_rate": 8.384170087654037e-06, + "loss": 0.3586, + "step": 9842 + }, + { + "epoch": 0.29, + "grad_norm": 1.7273236975231834, + "learning_rate": 8.383824304169242e-06, + "loss": 0.343, + "step": 9843 + }, + { + "epoch": 0.29, + "grad_norm": 2.6386549347017927, + "learning_rate": 8.383478490822157e-06, + "loss": 0.3466, + "step": 9844 + }, + { + "epoch": 0.29, + "grad_norm": 1.4635361575781771, + "learning_rate": 8.383132647615837e-06, + "loss": 0.3619, + "step": 9845 + }, + { + "epoch": 0.29, + "grad_norm": 1.4900167257794579, + "learning_rate": 8.382786774553335e-06, + "loss": 0.3605, + "step": 9846 + }, + { + "epoch": 0.29, + "grad_norm": 1.2115420445747753, + "learning_rate": 8.3824408716377e-06, + "loss": 0.3209, + "step": 9847 + }, + { + "epoch": 0.29, + "grad_norm": 2.615246818289893, + "learning_rate": 8.382094938871989e-06, + "loss": 0.4357, + "step": 9848 + }, + { + "epoch": 0.29, + "grad_norm": 1.3776173212777105, + "learning_rate": 8.381748976259253e-06, + "loss": 0.3631, + "step": 9849 + }, + { + "epoch": 0.29, + "grad_norm": 1.3754487815588048, + "learning_rate": 8.381402983802542e-06, + "loss": 0.3634, + "step": 9850 + }, + { + "epoch": 0.29, + "grad_norm": 2.249804919719771, + "learning_rate": 8.381056961504915e-06, + "loss": 0.3637, + "step": 9851 + }, + { + "epoch": 0.29, + "grad_norm": 1.5098918836468083, + "learning_rate": 8.380710909369422e-06, + "loss": 0.3411, + "step": 9852 + }, + { + "epoch": 0.29, + "grad_norm": 1.9835605185169611, + "learning_rate": 8.380364827399118e-06, + "loss": 0.3703, + "step": 9853 + }, + { + "epoch": 0.29, + "grad_norm": 1.8605264841932025, + "learning_rate": 8.380018715597056e-06, + "loss": 0.3455, + "step": 9854 + }, + { + "epoch": 0.29, + "grad_norm": 1.3760056718802973, + "learning_rate": 8.379672573966292e-06, + "loss": 0.3586, + "step": 9855 + }, + { + "epoch": 0.29, + "grad_norm": 1.4742270629336967, + "learning_rate": 8.37932640250988e-06, + "loss": 0.3343, + "step": 9856 + }, + { + "epoch": 0.29, + "grad_norm": 1.6160616453543153, + "learning_rate": 8.378980201230875e-06, + "loss": 0.3593, + "step": 9857 + }, + { + "epoch": 0.29, + "grad_norm": 1.3957179632770769, + "learning_rate": 8.37863397013233e-06, + "loss": 0.3417, + "step": 9858 + }, + { + "epoch": 0.29, + "grad_norm": 1.3308440645969144, + "learning_rate": 8.378287709217307e-06, + "loss": 0.3624, + "step": 9859 + }, + { + "epoch": 0.29, + "grad_norm": 1.6009443076590517, + "learning_rate": 8.377941418488854e-06, + "loss": 0.3334, + "step": 9860 + }, + { + "epoch": 0.29, + "grad_norm": 1.4847403816975029, + "learning_rate": 8.377595097950032e-06, + "loss": 0.3544, + "step": 9861 + }, + { + "epoch": 0.29, + "grad_norm": 1.9038272897834405, + "learning_rate": 8.377248747603896e-06, + "loss": 0.3481, + "step": 9862 + }, + { + "epoch": 0.29, + "grad_norm": 1.4714214876607445, + "learning_rate": 8.3769023674535e-06, + "loss": 0.3455, + "step": 9863 + }, + { + "epoch": 0.29, + "grad_norm": 2.0205350962681607, + "learning_rate": 8.376555957501906e-06, + "loss": 0.3342, + "step": 9864 + }, + { + "epoch": 0.29, + "grad_norm": 1.2866450001930194, + "learning_rate": 8.376209517752168e-06, + "loss": 0.3354, + "step": 9865 + }, + { + "epoch": 0.29, + "grad_norm": 1.2977361825342852, + "learning_rate": 8.375863048207343e-06, + "loss": 0.3315, + "step": 9866 + }, + { + "epoch": 0.29, + "grad_norm": 1.2337744929392036, + "learning_rate": 8.375516548870489e-06, + "loss": 0.3212, + "step": 9867 + }, + { + "epoch": 0.29, + "grad_norm": 1.4894456840852626, + "learning_rate": 8.375170019744664e-06, + "loss": 0.3549, + "step": 9868 + }, + { + "epoch": 0.29, + "grad_norm": 1.3438377443788376, + "learning_rate": 8.374823460832926e-06, + "loss": 0.3493, + "step": 9869 + }, + { + "epoch": 0.29, + "grad_norm": 1.1972031967492853, + "learning_rate": 8.374476872138333e-06, + "loss": 0.339, + "step": 9870 + }, + { + "epoch": 0.29, + "grad_norm": 1.6072836897749, + "learning_rate": 8.374130253663945e-06, + "loss": 0.3534, + "step": 9871 + }, + { + "epoch": 0.29, + "grad_norm": 1.448436097685525, + "learning_rate": 8.37378360541282e-06, + "loss": 0.3703, + "step": 9872 + }, + { + "epoch": 0.29, + "grad_norm": 1.4967178563242525, + "learning_rate": 8.373436927388017e-06, + "loss": 0.345, + "step": 9873 + }, + { + "epoch": 0.29, + "grad_norm": 2.9250542023129364, + "learning_rate": 8.373090219592596e-06, + "loss": 0.3586, + "step": 9874 + }, + { + "epoch": 0.29, + "grad_norm": 1.4019863989328123, + "learning_rate": 8.372743482029616e-06, + "loss": 0.3242, + "step": 9875 + }, + { + "epoch": 0.29, + "grad_norm": 1.7764745009734324, + "learning_rate": 8.372396714702137e-06, + "loss": 0.348, + "step": 9876 + }, + { + "epoch": 0.29, + "grad_norm": 1.5009518424627417, + "learning_rate": 8.37204991761322e-06, + "loss": 0.3803, + "step": 9877 + }, + { + "epoch": 0.29, + "grad_norm": 1.2904665675387692, + "learning_rate": 8.371703090765926e-06, + "loss": 0.3588, + "step": 9878 + }, + { + "epoch": 0.29, + "grad_norm": 1.3779676622301567, + "learning_rate": 8.371356234163314e-06, + "loss": 0.3412, + "step": 9879 + }, + { + "epoch": 0.29, + "grad_norm": 1.373752354919962, + "learning_rate": 8.371009347808445e-06, + "loss": 0.3633, + "step": 9880 + }, + { + "epoch": 0.29, + "grad_norm": 1.6042308071589826, + "learning_rate": 8.370662431704384e-06, + "loss": 0.3575, + "step": 9881 + }, + { + "epoch": 0.29, + "grad_norm": 1.4025967379926996, + "learning_rate": 8.370315485854186e-06, + "loss": 0.3418, + "step": 9882 + }, + { + "epoch": 0.29, + "grad_norm": 1.2275041727200737, + "learning_rate": 8.36996851026092e-06, + "loss": 0.3418, + "step": 9883 + }, + { + "epoch": 0.29, + "grad_norm": 1.4269798279048767, + "learning_rate": 8.36962150492764e-06, + "loss": 0.345, + "step": 9884 + }, + { + "epoch": 0.29, + "grad_norm": 1.4503414433111945, + "learning_rate": 8.369274469857418e-06, + "loss": 0.3806, + "step": 9885 + }, + { + "epoch": 0.29, + "grad_norm": 1.9494202964645757, + "learning_rate": 8.368927405053308e-06, + "loss": 0.3598, + "step": 9886 + }, + { + "epoch": 0.29, + "grad_norm": 1.5324034637855097, + "learning_rate": 8.368580310518377e-06, + "loss": 0.3562, + "step": 9887 + }, + { + "epoch": 0.29, + "grad_norm": 1.7656834945661124, + "learning_rate": 8.368233186255688e-06, + "loss": 0.3588, + "step": 9888 + }, + { + "epoch": 0.29, + "grad_norm": 1.3248546181409413, + "learning_rate": 8.367886032268301e-06, + "loss": 0.351, + "step": 9889 + }, + { + "epoch": 0.29, + "grad_norm": 1.4445100442978194, + "learning_rate": 8.367538848559287e-06, + "loss": 0.3487, + "step": 9890 + }, + { + "epoch": 0.29, + "grad_norm": 1.5372561647564469, + "learning_rate": 8.367191635131701e-06, + "loss": 0.3446, + "step": 9891 + }, + { + "epoch": 0.29, + "grad_norm": 1.7147349374012384, + "learning_rate": 8.366844391988614e-06, + "loss": 0.3658, + "step": 9892 + }, + { + "epoch": 0.29, + "grad_norm": 1.6254319579232455, + "learning_rate": 8.366497119133087e-06, + "loss": 0.3604, + "step": 9893 + }, + { + "epoch": 0.29, + "grad_norm": 1.4834056609686737, + "learning_rate": 8.366149816568186e-06, + "loss": 0.3528, + "step": 9894 + }, + { + "epoch": 0.29, + "grad_norm": 1.9939393031658474, + "learning_rate": 8.365802484296977e-06, + "loss": 0.3904, + "step": 9895 + }, + { + "epoch": 0.29, + "grad_norm": 1.7189137942725952, + "learning_rate": 8.365455122322522e-06, + "loss": 0.4137, + "step": 9896 + }, + { + "epoch": 0.29, + "grad_norm": 1.4093385144063233, + "learning_rate": 8.365107730647888e-06, + "loss": 0.3659, + "step": 9897 + }, + { + "epoch": 0.29, + "grad_norm": 1.041991146063776, + "learning_rate": 8.364760309276141e-06, + "loss": 0.5524, + "step": 9898 + }, + { + "epoch": 0.29, + "grad_norm": 1.6042950913342071, + "learning_rate": 8.364412858210349e-06, + "loss": 0.345, + "step": 9899 + }, + { + "epoch": 0.29, + "grad_norm": 1.3115134805867992, + "learning_rate": 8.364065377453574e-06, + "loss": 0.3746, + "step": 9900 + }, + { + "epoch": 0.29, + "grad_norm": 1.436000014566991, + "learning_rate": 8.363717867008885e-06, + "loss": 0.3385, + "step": 9901 + }, + { + "epoch": 0.29, + "grad_norm": 1.4591285120326807, + "learning_rate": 8.363370326879347e-06, + "loss": 0.325, + "step": 9902 + }, + { + "epoch": 0.29, + "grad_norm": 1.4137464565305353, + "learning_rate": 8.36302275706803e-06, + "loss": 0.3566, + "step": 9903 + }, + { + "epoch": 0.29, + "grad_norm": 1.3280412373650052, + "learning_rate": 8.362675157578002e-06, + "loss": 0.3473, + "step": 9904 + }, + { + "epoch": 0.29, + "grad_norm": 1.3111579112249603, + "learning_rate": 8.362327528412326e-06, + "loss": 0.3284, + "step": 9905 + }, + { + "epoch": 0.29, + "grad_norm": 1.5386285739730199, + "learning_rate": 8.361979869574073e-06, + "loss": 0.3526, + "step": 9906 + }, + { + "epoch": 0.29, + "grad_norm": 1.606178809120963, + "learning_rate": 8.36163218106631e-06, + "loss": 0.3772, + "step": 9907 + }, + { + "epoch": 0.29, + "grad_norm": 2.8121277683063246, + "learning_rate": 8.361284462892105e-06, + "loss": 0.3701, + "step": 9908 + }, + { + "epoch": 0.29, + "grad_norm": 1.4582097985661027, + "learning_rate": 8.360936715054531e-06, + "loss": 0.3362, + "step": 9909 + }, + { + "epoch": 0.29, + "grad_norm": 1.5661578896542239, + "learning_rate": 8.360588937556649e-06, + "loss": 0.3754, + "step": 9910 + }, + { + "epoch": 0.29, + "grad_norm": 1.4600216139933164, + "learning_rate": 8.360241130401534e-06, + "loss": 0.3508, + "step": 9911 + }, + { + "epoch": 0.29, + "grad_norm": 1.5469033407232107, + "learning_rate": 8.359893293592253e-06, + "loss": 0.3606, + "step": 9912 + }, + { + "epoch": 0.29, + "grad_norm": 4.750864777689399, + "learning_rate": 8.359545427131876e-06, + "loss": 0.3568, + "step": 9913 + }, + { + "epoch": 0.29, + "grad_norm": 2.5647486462324474, + "learning_rate": 8.359197531023476e-06, + "loss": 0.3557, + "step": 9914 + }, + { + "epoch": 0.29, + "grad_norm": 1.616567723055247, + "learning_rate": 8.358849605270118e-06, + "loss": 0.387, + "step": 9915 + }, + { + "epoch": 0.29, + "grad_norm": 1.4130401240179287, + "learning_rate": 8.358501649874876e-06, + "loss": 0.3366, + "step": 9916 + }, + { + "epoch": 0.29, + "grad_norm": 1.3730214722766168, + "learning_rate": 8.35815366484082e-06, + "loss": 0.3351, + "step": 9917 + }, + { + "epoch": 0.29, + "grad_norm": 1.4393934507492763, + "learning_rate": 8.35780565017102e-06, + "loss": 0.3592, + "step": 9918 + }, + { + "epoch": 0.29, + "grad_norm": 1.4110825782287728, + "learning_rate": 8.357457605868547e-06, + "loss": 0.3508, + "step": 9919 + }, + { + "epoch": 0.29, + "grad_norm": 1.602721591231212, + "learning_rate": 8.357109531936475e-06, + "loss": 0.3548, + "step": 9920 + }, + { + "epoch": 0.29, + "grad_norm": 1.5733946749182797, + "learning_rate": 8.356761428377873e-06, + "loss": 0.3708, + "step": 9921 + }, + { + "epoch": 0.29, + "grad_norm": 1.3791900044737575, + "learning_rate": 8.356413295195816e-06, + "loss": 0.3269, + "step": 9922 + }, + { + "epoch": 0.29, + "grad_norm": 1.6482025134574436, + "learning_rate": 8.356065132393373e-06, + "loss": 0.3485, + "step": 9923 + }, + { + "epoch": 0.29, + "grad_norm": 1.3274204987521925, + "learning_rate": 8.355716939973618e-06, + "loss": 0.3356, + "step": 9924 + }, + { + "epoch": 0.29, + "grad_norm": 1.4427664146883559, + "learning_rate": 8.355368717939624e-06, + "loss": 0.3552, + "step": 9925 + }, + { + "epoch": 0.29, + "grad_norm": 1.520525107748722, + "learning_rate": 8.355020466294463e-06, + "loss": 0.3572, + "step": 9926 + }, + { + "epoch": 0.29, + "grad_norm": 0.9784598183915412, + "learning_rate": 8.35467218504121e-06, + "loss": 0.6663, + "step": 9927 + }, + { + "epoch": 0.29, + "grad_norm": 1.4015811625942538, + "learning_rate": 8.354323874182937e-06, + "loss": 0.3397, + "step": 9928 + }, + { + "epoch": 0.29, + "grad_norm": 1.38134145565907, + "learning_rate": 8.35397553372272e-06, + "loss": 0.3516, + "step": 9929 + }, + { + "epoch": 0.29, + "grad_norm": 1.3312861620547392, + "learning_rate": 8.353627163663631e-06, + "loss": 0.3384, + "step": 9930 + }, + { + "epoch": 0.29, + "grad_norm": 2.9931285738812208, + "learning_rate": 8.353278764008745e-06, + "loss": 0.3556, + "step": 9931 + }, + { + "epoch": 0.29, + "grad_norm": 1.5567358932165705, + "learning_rate": 8.352930334761136e-06, + "loss": 0.36, + "step": 9932 + }, + { + "epoch": 0.29, + "grad_norm": 1.8242344682358227, + "learning_rate": 8.352581875923879e-06, + "loss": 0.3589, + "step": 9933 + }, + { + "epoch": 0.29, + "grad_norm": 1.5540040659955763, + "learning_rate": 8.352233387500053e-06, + "loss": 0.3525, + "step": 9934 + }, + { + "epoch": 0.29, + "grad_norm": 0.9941465960391885, + "learning_rate": 8.351884869492727e-06, + "loss": 0.619, + "step": 9935 + }, + { + "epoch": 0.29, + "grad_norm": 1.4329924651711308, + "learning_rate": 8.351536321904983e-06, + "loss": 0.3835, + "step": 9936 + }, + { + "epoch": 0.29, + "grad_norm": 1.4616117309894097, + "learning_rate": 8.351187744739892e-06, + "loss": 0.3472, + "step": 9937 + }, + { + "epoch": 0.29, + "grad_norm": 1.5097818772210223, + "learning_rate": 8.35083913800053e-06, + "loss": 0.3693, + "step": 9938 + }, + { + "epoch": 0.29, + "grad_norm": 4.224212944701451, + "learning_rate": 8.350490501689978e-06, + "loss": 0.3336, + "step": 9939 + }, + { + "epoch": 0.29, + "grad_norm": 3.728880524228701, + "learning_rate": 8.35014183581131e-06, + "loss": 0.4587, + "step": 9940 + }, + { + "epoch": 0.29, + "grad_norm": 1.6084403463916757, + "learning_rate": 8.349793140367603e-06, + "loss": 0.334, + "step": 9941 + }, + { + "epoch": 0.29, + "grad_norm": 1.3690465701103374, + "learning_rate": 8.349444415361934e-06, + "loss": 0.3627, + "step": 9942 + }, + { + "epoch": 0.29, + "grad_norm": 1.5210525124237513, + "learning_rate": 8.34909566079738e-06, + "loss": 0.3446, + "step": 9943 + }, + { + "epoch": 0.29, + "grad_norm": 1.3652243505660098, + "learning_rate": 8.348746876677021e-06, + "loss": 0.3278, + "step": 9944 + }, + { + "epoch": 0.29, + "grad_norm": 1.3819217188895507, + "learning_rate": 8.348398063003933e-06, + "loss": 0.3496, + "step": 9945 + }, + { + "epoch": 0.29, + "grad_norm": 1.4768436796423274, + "learning_rate": 8.348049219781195e-06, + "loss": 0.34, + "step": 9946 + }, + { + "epoch": 0.29, + "grad_norm": 1.3554077603862325, + "learning_rate": 8.347700347011885e-06, + "loss": 0.3287, + "step": 9947 + }, + { + "epoch": 0.29, + "grad_norm": 1.7479313938241792, + "learning_rate": 8.347351444699084e-06, + "loss": 0.3569, + "step": 9948 + }, + { + "epoch": 0.29, + "grad_norm": 1.4416720175064048, + "learning_rate": 8.347002512845868e-06, + "loss": 0.3489, + "step": 9949 + }, + { + "epoch": 0.29, + "grad_norm": 1.4121416698739615, + "learning_rate": 8.346653551455317e-06, + "loss": 0.3315, + "step": 9950 + }, + { + "epoch": 0.29, + "grad_norm": 1.508168280197038, + "learning_rate": 8.346304560530513e-06, + "loss": 0.3786, + "step": 9951 + }, + { + "epoch": 0.29, + "grad_norm": 1.699725376976626, + "learning_rate": 8.345955540074532e-06, + "loss": 0.3511, + "step": 9952 + }, + { + "epoch": 0.29, + "grad_norm": 2.2347476081012037, + "learning_rate": 8.34560649009046e-06, + "loss": 0.3662, + "step": 9953 + }, + { + "epoch": 0.29, + "grad_norm": 1.5784240983081959, + "learning_rate": 8.34525741058137e-06, + "loss": 0.3588, + "step": 9954 + }, + { + "epoch": 0.29, + "grad_norm": 1.4588054843729197, + "learning_rate": 8.344908301550347e-06, + "loss": 0.3679, + "step": 9955 + }, + { + "epoch": 0.29, + "grad_norm": 1.5378932711320616, + "learning_rate": 8.344559163000472e-06, + "loss": 0.3336, + "step": 9956 + }, + { + "epoch": 0.29, + "grad_norm": 1.6994907973512492, + "learning_rate": 8.344209994934824e-06, + "loss": 0.3619, + "step": 9957 + }, + { + "epoch": 0.29, + "grad_norm": 1.2575890543391983, + "learning_rate": 8.343860797356486e-06, + "loss": 0.3413, + "step": 9958 + }, + { + "epoch": 0.29, + "grad_norm": 1.4824131932311366, + "learning_rate": 8.343511570268541e-06, + "loss": 0.3571, + "step": 9959 + }, + { + "epoch": 0.29, + "grad_norm": 1.3517286002084221, + "learning_rate": 8.343162313674069e-06, + "loss": 0.3394, + "step": 9960 + }, + { + "epoch": 0.29, + "grad_norm": 1.4653565750339954, + "learning_rate": 8.342813027576152e-06, + "loss": 0.3636, + "step": 9961 + }, + { + "epoch": 0.29, + "grad_norm": 1.7263304991933706, + "learning_rate": 8.342463711977871e-06, + "loss": 0.3411, + "step": 9962 + }, + { + "epoch": 0.29, + "grad_norm": 1.5767606640260212, + "learning_rate": 8.342114366882312e-06, + "loss": 0.3481, + "step": 9963 + }, + { + "epoch": 0.29, + "grad_norm": 1.3908381699172883, + "learning_rate": 8.341764992292559e-06, + "loss": 0.3853, + "step": 9964 + }, + { + "epoch": 0.29, + "grad_norm": 0.9572114901770423, + "learning_rate": 8.34141558821169e-06, + "loss": 0.5512, + "step": 9965 + }, + { + "epoch": 0.29, + "grad_norm": 1.5917001394925454, + "learning_rate": 8.341066154642792e-06, + "loss": 0.3519, + "step": 9966 + }, + { + "epoch": 0.29, + "grad_norm": 1.7395533557553233, + "learning_rate": 8.340716691588949e-06, + "loss": 0.3847, + "step": 9967 + }, + { + "epoch": 0.29, + "grad_norm": 1.5994663635264161, + "learning_rate": 8.340367199053243e-06, + "loss": 0.3324, + "step": 9968 + }, + { + "epoch": 0.29, + "grad_norm": 1.608985906729877, + "learning_rate": 8.34001767703876e-06, + "loss": 0.353, + "step": 9969 + }, + { + "epoch": 0.29, + "grad_norm": 1.4153144784096674, + "learning_rate": 8.339668125548584e-06, + "loss": 0.3576, + "step": 9970 + }, + { + "epoch": 0.29, + "grad_norm": 0.9328017654572777, + "learning_rate": 8.339318544585799e-06, + "loss": 0.6226, + "step": 9971 + }, + { + "epoch": 0.29, + "grad_norm": 1.4705356197596764, + "learning_rate": 8.338968934153491e-06, + "loss": 0.3818, + "step": 9972 + }, + { + "epoch": 0.29, + "grad_norm": 1.6547305789273232, + "learning_rate": 8.338619294254747e-06, + "loss": 0.3607, + "step": 9973 + }, + { + "epoch": 0.29, + "grad_norm": 1.5990828696646096, + "learning_rate": 8.338269624892648e-06, + "loss": 0.3654, + "step": 9974 + }, + { + "epoch": 0.29, + "grad_norm": 1.535594814764778, + "learning_rate": 8.337919926070283e-06, + "loss": 0.3684, + "step": 9975 + }, + { + "epoch": 0.29, + "grad_norm": 0.8963816227185676, + "learning_rate": 8.337570197790739e-06, + "loss": 0.5903, + "step": 9976 + }, + { + "epoch": 0.29, + "grad_norm": 1.6515543714345318, + "learning_rate": 8.3372204400571e-06, + "loss": 0.3517, + "step": 9977 + }, + { + "epoch": 0.29, + "grad_norm": 1.6577046384731724, + "learning_rate": 8.336870652872453e-06, + "loss": 0.3526, + "step": 9978 + }, + { + "epoch": 0.29, + "grad_norm": 1.705034616998216, + "learning_rate": 8.336520836239888e-06, + "loss": 0.3638, + "step": 9979 + }, + { + "epoch": 0.29, + "grad_norm": 1.7493249980704726, + "learning_rate": 8.336170990162487e-06, + "loss": 0.3336, + "step": 9980 + }, + { + "epoch": 0.29, + "grad_norm": 0.9921761814213936, + "learning_rate": 8.335821114643341e-06, + "loss": 0.676, + "step": 9981 + }, + { + "epoch": 0.29, + "grad_norm": 1.480842492181252, + "learning_rate": 8.335471209685538e-06, + "loss": 0.3539, + "step": 9982 + }, + { + "epoch": 0.29, + "grad_norm": 1.4708056487976697, + "learning_rate": 8.335121275292162e-06, + "loss": 0.3531, + "step": 9983 + }, + { + "epoch": 0.29, + "grad_norm": 1.4987613662624995, + "learning_rate": 8.334771311466306e-06, + "loss": 0.4203, + "step": 9984 + }, + { + "epoch": 0.29, + "grad_norm": 1.4768912190901597, + "learning_rate": 8.334421318211054e-06, + "loss": 0.3292, + "step": 9985 + }, + { + "epoch": 0.29, + "grad_norm": 1.6657509243138686, + "learning_rate": 8.334071295529498e-06, + "loss": 0.3473, + "step": 9986 + }, + { + "epoch": 0.29, + "grad_norm": 1.4259781169387977, + "learning_rate": 8.333721243424726e-06, + "loss": 0.3652, + "step": 9987 + }, + { + "epoch": 0.29, + "grad_norm": 1.3618627945347497, + "learning_rate": 8.333371161899826e-06, + "loss": 0.3253, + "step": 9988 + }, + { + "epoch": 0.29, + "grad_norm": 1.7547913540885627, + "learning_rate": 8.33302105095789e-06, + "loss": 0.3483, + "step": 9989 + }, + { + "epoch": 0.29, + "grad_norm": 3.0823097600487688, + "learning_rate": 8.332670910602006e-06, + "loss": 0.3522, + "step": 9990 + }, + { + "epoch": 0.29, + "grad_norm": 1.6533063030570663, + "learning_rate": 8.332320740835264e-06, + "loss": 0.3689, + "step": 9991 + }, + { + "epoch": 0.29, + "grad_norm": 2.32521085064468, + "learning_rate": 8.331970541660755e-06, + "loss": 0.3394, + "step": 9992 + }, + { + "epoch": 0.29, + "grad_norm": 1.4613143829159105, + "learning_rate": 8.331620313081568e-06, + "loss": 0.3542, + "step": 9993 + }, + { + "epoch": 0.29, + "grad_norm": 1.625277367780657, + "learning_rate": 8.331270055100795e-06, + "loss": 0.3712, + "step": 9994 + }, + { + "epoch": 0.29, + "grad_norm": 1.385337230705145, + "learning_rate": 8.330919767721528e-06, + "loss": 0.334, + "step": 9995 + }, + { + "epoch": 0.29, + "grad_norm": 1.4683794454167094, + "learning_rate": 8.330569450946854e-06, + "loss": 0.3268, + "step": 9996 + }, + { + "epoch": 0.29, + "grad_norm": 1.015057286362849, + "learning_rate": 8.33021910477987e-06, + "loss": 0.6122, + "step": 9997 + }, + { + "epoch": 0.29, + "grad_norm": 1.6331999832593755, + "learning_rate": 8.329868729223667e-06, + "loss": 0.328, + "step": 9998 + }, + { + "epoch": 0.29, + "grad_norm": 1.3391400799804265, + "learning_rate": 8.329518324281332e-06, + "loss": 0.3355, + "step": 9999 + }, + { + "epoch": 0.29, + "grad_norm": 1.669443225896886, + "learning_rate": 8.329167889955962e-06, + "loss": 0.3387, + "step": 10000 + }, + { + "epoch": 0.29, + "grad_norm": 0.9618847318912415, + "learning_rate": 8.32881742625065e-06, + "loss": 0.5881, + "step": 10001 + }, + { + "epoch": 0.29, + "grad_norm": 1.369033943007747, + "learning_rate": 8.328466933168487e-06, + "loss": 0.346, + "step": 10002 + }, + { + "epoch": 0.29, + "grad_norm": 1.3307984771717425, + "learning_rate": 8.328116410712566e-06, + "loss": 0.3559, + "step": 10003 + }, + { + "epoch": 0.29, + "grad_norm": 1.51267210341629, + "learning_rate": 8.327765858885978e-06, + "loss": 0.3647, + "step": 10004 + }, + { + "epoch": 0.29, + "grad_norm": 0.8678718045297321, + "learning_rate": 8.327415277691824e-06, + "loss": 0.5934, + "step": 10005 + }, + { + "epoch": 0.29, + "grad_norm": 1.342291330373326, + "learning_rate": 8.32706466713319e-06, + "loss": 0.3307, + "step": 10006 + }, + { + "epoch": 0.29, + "grad_norm": 1.6906431342233434, + "learning_rate": 8.326714027213176e-06, + "loss": 0.3756, + "step": 10007 + }, + { + "epoch": 0.29, + "grad_norm": 1.6395282632508146, + "learning_rate": 8.32636335793487e-06, + "loss": 0.3509, + "step": 10008 + }, + { + "epoch": 0.29, + "grad_norm": 1.4112267607202016, + "learning_rate": 8.326012659301375e-06, + "loss": 0.35, + "step": 10009 + }, + { + "epoch": 0.29, + "grad_norm": 1.4323554652156214, + "learning_rate": 8.325661931315779e-06, + "loss": 0.3233, + "step": 10010 + }, + { + "epoch": 0.29, + "grad_norm": 1.4692306807944044, + "learning_rate": 8.325311173981179e-06, + "loss": 0.3344, + "step": 10011 + }, + { + "epoch": 0.29, + "grad_norm": 1.4508489623792142, + "learning_rate": 8.324960387300672e-06, + "loss": 0.3339, + "step": 10012 + }, + { + "epoch": 0.29, + "grad_norm": 1.3398988797870712, + "learning_rate": 8.32460957127735e-06, + "loss": 0.3713, + "step": 10013 + }, + { + "epoch": 0.29, + "grad_norm": 1.3922400663578431, + "learning_rate": 8.324258725914316e-06, + "loss": 0.3493, + "step": 10014 + }, + { + "epoch": 0.29, + "grad_norm": 1.4581349024046104, + "learning_rate": 8.32390785121466e-06, + "loss": 0.3414, + "step": 10015 + }, + { + "epoch": 0.29, + "grad_norm": 1.362843937473314, + "learning_rate": 8.32355694718148e-06, + "loss": 0.326, + "step": 10016 + }, + { + "epoch": 0.29, + "grad_norm": 1.9315581123219516, + "learning_rate": 8.323206013817872e-06, + "loss": 0.3496, + "step": 10017 + }, + { + "epoch": 0.29, + "grad_norm": 1.6901547087561184, + "learning_rate": 8.322855051126937e-06, + "loss": 0.3391, + "step": 10018 + }, + { + "epoch": 0.29, + "grad_norm": 2.054491975644463, + "learning_rate": 8.322504059111766e-06, + "loss": 0.3658, + "step": 10019 + }, + { + "epoch": 0.29, + "grad_norm": 1.7278056275210008, + "learning_rate": 8.322153037775462e-06, + "loss": 0.3543, + "step": 10020 + }, + { + "epoch": 0.29, + "grad_norm": 1.603712807944425, + "learning_rate": 8.32180198712112e-06, + "loss": 0.3489, + "step": 10021 + }, + { + "epoch": 0.29, + "grad_norm": 1.3298296043686244, + "learning_rate": 8.32145090715184e-06, + "loss": 0.3846, + "step": 10022 + }, + { + "epoch": 0.29, + "grad_norm": 1.7009524697235079, + "learning_rate": 8.321099797870718e-06, + "loss": 0.3765, + "step": 10023 + }, + { + "epoch": 0.29, + "grad_norm": 1.2946143447468146, + "learning_rate": 8.320748659280853e-06, + "loss": 0.346, + "step": 10024 + }, + { + "epoch": 0.29, + "grad_norm": 1.492136333761515, + "learning_rate": 8.320397491385344e-06, + "loss": 0.3633, + "step": 10025 + }, + { + "epoch": 0.29, + "grad_norm": 1.4299699895970774, + "learning_rate": 8.320046294187291e-06, + "loss": 0.3694, + "step": 10026 + }, + { + "epoch": 0.29, + "grad_norm": 1.386922575430122, + "learning_rate": 8.319695067689792e-06, + "loss": 0.3712, + "step": 10027 + }, + { + "epoch": 0.29, + "grad_norm": 1.3542217707927766, + "learning_rate": 8.319343811895946e-06, + "loss": 0.3486, + "step": 10028 + }, + { + "epoch": 0.29, + "grad_norm": 0.9195622556710865, + "learning_rate": 8.318992526808856e-06, + "loss": 0.5668, + "step": 10029 + }, + { + "epoch": 0.29, + "grad_norm": 1.5333906212471586, + "learning_rate": 8.31864121243162e-06, + "loss": 0.3252, + "step": 10030 + }, + { + "epoch": 0.29, + "grad_norm": 1.5546420138614587, + "learning_rate": 8.318289868767338e-06, + "loss": 0.35, + "step": 10031 + }, + { + "epoch": 0.29, + "grad_norm": 2.3593818347960616, + "learning_rate": 8.31793849581911e-06, + "loss": 0.359, + "step": 10032 + }, + { + "epoch": 0.29, + "grad_norm": 1.4374092172284954, + "learning_rate": 8.31758709359004e-06, + "loss": 0.3377, + "step": 10033 + }, + { + "epoch": 0.29, + "grad_norm": 1.5090342146892262, + "learning_rate": 8.317235662083227e-06, + "loss": 0.3179, + "step": 10034 + }, + { + "epoch": 0.29, + "grad_norm": 1.3779051645596827, + "learning_rate": 8.316884201301772e-06, + "loss": 0.3614, + "step": 10035 + }, + { + "epoch": 0.29, + "grad_norm": 1.4891409868529186, + "learning_rate": 8.316532711248777e-06, + "loss": 0.372, + "step": 10036 + }, + { + "epoch": 0.29, + "grad_norm": 1.0273806317043848, + "learning_rate": 8.316181191927344e-06, + "loss": 0.5962, + "step": 10037 + }, + { + "epoch": 0.29, + "grad_norm": 1.3718726516234192, + "learning_rate": 8.315829643340573e-06, + "loss": 0.3445, + "step": 10038 + }, + { + "epoch": 0.29, + "grad_norm": 2.920493050779776, + "learning_rate": 8.315478065491573e-06, + "loss": 0.3505, + "step": 10039 + }, + { + "epoch": 0.29, + "grad_norm": 1.3314528180447367, + "learning_rate": 8.315126458383439e-06, + "loss": 0.3343, + "step": 10040 + }, + { + "epoch": 0.29, + "grad_norm": 1.493939644349456, + "learning_rate": 8.314774822019278e-06, + "loss": 0.3465, + "step": 10041 + }, + { + "epoch": 0.29, + "grad_norm": 1.7276916089341525, + "learning_rate": 8.314423156402193e-06, + "loss": 0.3683, + "step": 10042 + }, + { + "epoch": 0.29, + "grad_norm": 1.4356842440220914, + "learning_rate": 8.314071461535286e-06, + "loss": 0.3694, + "step": 10043 + }, + { + "epoch": 0.29, + "grad_norm": 1.5089526438832965, + "learning_rate": 8.313719737421661e-06, + "loss": 0.3493, + "step": 10044 + }, + { + "epoch": 0.29, + "grad_norm": 1.4419627028954762, + "learning_rate": 8.313367984064425e-06, + "loss": 0.3602, + "step": 10045 + }, + { + "epoch": 0.29, + "grad_norm": 1.2276640884609085, + "learning_rate": 8.313016201466678e-06, + "loss": 0.342, + "step": 10046 + }, + { + "epoch": 0.29, + "grad_norm": 1.3533630564393957, + "learning_rate": 8.312664389631526e-06, + "loss": 0.3161, + "step": 10047 + }, + { + "epoch": 0.29, + "grad_norm": 1.6444418659322728, + "learning_rate": 8.312312548562075e-06, + "loss": 0.3788, + "step": 10048 + }, + { + "epoch": 0.29, + "grad_norm": 2.6593844856995985, + "learning_rate": 8.311960678261431e-06, + "loss": 0.3346, + "step": 10049 + }, + { + "epoch": 0.29, + "grad_norm": 1.533938677174282, + "learning_rate": 8.311608778732693e-06, + "loss": 0.3453, + "step": 10050 + }, + { + "epoch": 0.29, + "grad_norm": 1.6176834965274882, + "learning_rate": 8.311256849978974e-06, + "loss": 0.3507, + "step": 10051 + }, + { + "epoch": 0.29, + "grad_norm": 1.8816943295339248, + "learning_rate": 8.310904892003374e-06, + "loss": 0.3622, + "step": 10052 + }, + { + "epoch": 0.29, + "grad_norm": 1.8017583731910984, + "learning_rate": 8.310552904809004e-06, + "loss": 0.3693, + "step": 10053 + }, + { + "epoch": 0.29, + "grad_norm": 1.4591577683570203, + "learning_rate": 8.310200888398967e-06, + "loss": 0.3217, + "step": 10054 + }, + { + "epoch": 0.29, + "grad_norm": 1.390715806500531, + "learning_rate": 8.30984884277637e-06, + "loss": 0.3421, + "step": 10055 + }, + { + "epoch": 0.29, + "grad_norm": 1.4489493783399705, + "learning_rate": 8.30949676794432e-06, + "loss": 0.3505, + "step": 10056 + }, + { + "epoch": 0.29, + "grad_norm": 1.756805539574105, + "learning_rate": 8.309144663905925e-06, + "loss": 0.3497, + "step": 10057 + }, + { + "epoch": 0.29, + "grad_norm": 1.4747509028848858, + "learning_rate": 8.308792530664292e-06, + "loss": 0.3677, + "step": 10058 + }, + { + "epoch": 0.29, + "grad_norm": 1.4966267249559617, + "learning_rate": 8.308440368222528e-06, + "loss": 0.3415, + "step": 10059 + }, + { + "epoch": 0.29, + "grad_norm": 1.4895987880412882, + "learning_rate": 8.30808817658374e-06, + "loss": 0.3754, + "step": 10060 + }, + { + "epoch": 0.29, + "grad_norm": 1.3809595404940427, + "learning_rate": 8.307735955751038e-06, + "loss": 0.3661, + "step": 10061 + }, + { + "epoch": 0.29, + "grad_norm": 0.9510182536498006, + "learning_rate": 8.30738370572753e-06, + "loss": 0.6024, + "step": 10062 + }, + { + "epoch": 0.29, + "grad_norm": 1.3981018574423563, + "learning_rate": 8.307031426516324e-06, + "loss": 0.3469, + "step": 10063 + }, + { + "epoch": 0.29, + "grad_norm": 1.7122333563238072, + "learning_rate": 8.306679118120527e-06, + "loss": 0.3564, + "step": 10064 + }, + { + "epoch": 0.29, + "grad_norm": 1.4987264215370133, + "learning_rate": 8.306326780543253e-06, + "loss": 0.3883, + "step": 10065 + }, + { + "epoch": 0.29, + "grad_norm": 0.9371046095366978, + "learning_rate": 8.305974413787606e-06, + "loss": 0.6142, + "step": 10066 + }, + { + "epoch": 0.29, + "grad_norm": 1.4486420280008443, + "learning_rate": 8.305622017856699e-06, + "loss": 0.3647, + "step": 10067 + }, + { + "epoch": 0.29, + "grad_norm": 4.053340449021809, + "learning_rate": 8.305269592753641e-06, + "loss": 0.3301, + "step": 10068 + }, + { + "epoch": 0.29, + "grad_norm": 1.7028703639488942, + "learning_rate": 8.304917138481542e-06, + "loss": 0.3606, + "step": 10069 + }, + { + "epoch": 0.29, + "grad_norm": 1.3543074545399156, + "learning_rate": 8.304564655043514e-06, + "loss": 0.3541, + "step": 10070 + }, + { + "epoch": 0.29, + "grad_norm": 1.3012592411254242, + "learning_rate": 8.304212142442665e-06, + "loss": 0.364, + "step": 10071 + }, + { + "epoch": 0.29, + "grad_norm": 1.9536118869496641, + "learning_rate": 8.303859600682107e-06, + "loss": 0.3446, + "step": 10072 + }, + { + "epoch": 0.29, + "grad_norm": 1.7790908782299504, + "learning_rate": 8.303507029764953e-06, + "loss": 0.3278, + "step": 10073 + }, + { + "epoch": 0.29, + "grad_norm": 2.6752118032040455, + "learning_rate": 8.303154429694311e-06, + "loss": 0.342, + "step": 10074 + }, + { + "epoch": 0.29, + "grad_norm": 1.555869293134245, + "learning_rate": 8.302801800473295e-06, + "loss": 0.3584, + "step": 10075 + }, + { + "epoch": 0.29, + "grad_norm": 1.3536735727020235, + "learning_rate": 8.302449142105016e-06, + "loss": 0.3441, + "step": 10076 + }, + { + "epoch": 0.29, + "grad_norm": 1.3358001259238448, + "learning_rate": 8.302096454592588e-06, + "loss": 0.3494, + "step": 10077 + }, + { + "epoch": 0.29, + "grad_norm": 1.5413922959484325, + "learning_rate": 8.301743737939122e-06, + "loss": 0.3607, + "step": 10078 + }, + { + "epoch": 0.29, + "grad_norm": 1.7011686250703293, + "learning_rate": 8.30139099214773e-06, + "loss": 0.3647, + "step": 10079 + }, + { + "epoch": 0.29, + "grad_norm": 1.4536003581039798, + "learning_rate": 8.301038217221526e-06, + "loss": 0.3751, + "step": 10080 + }, + { + "epoch": 0.29, + "grad_norm": 1.3374617400689466, + "learning_rate": 8.300685413163623e-06, + "loss": 0.3459, + "step": 10081 + }, + { + "epoch": 0.29, + "grad_norm": 1.651422144498401, + "learning_rate": 8.300332579977135e-06, + "loss": 0.3555, + "step": 10082 + }, + { + "epoch": 0.29, + "grad_norm": 1.4429015697040808, + "learning_rate": 8.299979717665176e-06, + "loss": 0.3415, + "step": 10083 + }, + { + "epoch": 0.29, + "grad_norm": 1.402130364204299, + "learning_rate": 8.299626826230857e-06, + "loss": 0.3494, + "step": 10084 + }, + { + "epoch": 0.29, + "grad_norm": 1.535286668157626, + "learning_rate": 8.299273905677297e-06, + "loss": 0.345, + "step": 10085 + }, + { + "epoch": 0.29, + "grad_norm": 1.871126176646377, + "learning_rate": 8.298920956007608e-06, + "loss": 0.3389, + "step": 10086 + }, + { + "epoch": 0.29, + "grad_norm": 2.2818497289604083, + "learning_rate": 8.298567977224903e-06, + "loss": 0.3158, + "step": 10087 + }, + { + "epoch": 0.29, + "grad_norm": 1.5871071531634398, + "learning_rate": 8.2982149693323e-06, + "loss": 0.3662, + "step": 10088 + }, + { + "epoch": 0.29, + "grad_norm": 1.4700000960832895, + "learning_rate": 8.297861932332916e-06, + "loss": 0.3319, + "step": 10089 + }, + { + "epoch": 0.29, + "grad_norm": 1.3907984343317932, + "learning_rate": 8.297508866229861e-06, + "loss": 0.3425, + "step": 10090 + }, + { + "epoch": 0.29, + "grad_norm": 1.5302544391110453, + "learning_rate": 8.297155771026254e-06, + "loss": 0.3956, + "step": 10091 + }, + { + "epoch": 0.29, + "grad_norm": 1.4880047317831118, + "learning_rate": 8.296802646725212e-06, + "loss": 0.3469, + "step": 10092 + }, + { + "epoch": 0.29, + "grad_norm": 1.4395570561001594, + "learning_rate": 8.296449493329849e-06, + "loss": 0.3501, + "step": 10093 + }, + { + "epoch": 0.29, + "grad_norm": 0.9141796361614386, + "learning_rate": 8.296096310843283e-06, + "loss": 0.6403, + "step": 10094 + }, + { + "epoch": 0.29, + "grad_norm": 1.5929782953747302, + "learning_rate": 8.29574309926863e-06, + "loss": 0.3576, + "step": 10095 + }, + { + "epoch": 0.29, + "grad_norm": 1.3942646010845519, + "learning_rate": 8.295389858609008e-06, + "loss": 0.3461, + "step": 10096 + }, + { + "epoch": 0.29, + "grad_norm": 0.9291976049481957, + "learning_rate": 8.295036588867533e-06, + "loss": 0.5835, + "step": 10097 + }, + { + "epoch": 0.29, + "grad_norm": 0.9321279104539372, + "learning_rate": 8.294683290047324e-06, + "loss": 0.6157, + "step": 10098 + }, + { + "epoch": 0.29, + "grad_norm": 2.0487414112815148, + "learning_rate": 8.2943299621515e-06, + "loss": 0.3521, + "step": 10099 + }, + { + "epoch": 0.29, + "grad_norm": 2.84379753493545, + "learning_rate": 8.293976605183175e-06, + "loss": 0.3485, + "step": 10100 + }, + { + "epoch": 0.29, + "grad_norm": 1.3920952463592335, + "learning_rate": 8.293623219145473e-06, + "loss": 0.3566, + "step": 10101 + }, + { + "epoch": 0.29, + "grad_norm": 2.081905003601059, + "learning_rate": 8.293269804041506e-06, + "loss": 0.3662, + "step": 10102 + }, + { + "epoch": 0.29, + "grad_norm": 1.553962306561227, + "learning_rate": 8.2929163598744e-06, + "loss": 0.3504, + "step": 10103 + }, + { + "epoch": 0.29, + "grad_norm": 1.3946392721223233, + "learning_rate": 8.292562886647267e-06, + "loss": 0.352, + "step": 10104 + }, + { + "epoch": 0.29, + "grad_norm": 1.3285516537674913, + "learning_rate": 8.292209384363232e-06, + "loss": 0.3724, + "step": 10105 + }, + { + "epoch": 0.29, + "grad_norm": 1.5259793864480498, + "learning_rate": 8.291855853025414e-06, + "loss": 0.3501, + "step": 10106 + }, + { + "epoch": 0.29, + "grad_norm": 1.3398750798791288, + "learning_rate": 8.29150229263693e-06, + "loss": 0.3336, + "step": 10107 + }, + { + "epoch": 0.29, + "grad_norm": 1.4122725591264997, + "learning_rate": 8.291148703200902e-06, + "loss": 0.3509, + "step": 10108 + }, + { + "epoch": 0.29, + "grad_norm": 1.3414021576831217, + "learning_rate": 8.290795084720449e-06, + "loss": 0.3299, + "step": 10109 + }, + { + "epoch": 0.29, + "grad_norm": 1.7317862321441124, + "learning_rate": 8.290441437198693e-06, + "loss": 0.3701, + "step": 10110 + }, + { + "epoch": 0.29, + "grad_norm": 1.6865830261191443, + "learning_rate": 8.290087760638756e-06, + "loss": 0.3462, + "step": 10111 + }, + { + "epoch": 0.29, + "grad_norm": 1.4530276197429242, + "learning_rate": 8.28973405504376e-06, + "loss": 0.3517, + "step": 10112 + }, + { + "epoch": 0.29, + "grad_norm": 1.6291227845954008, + "learning_rate": 8.28938032041682e-06, + "loss": 0.3572, + "step": 10113 + }, + { + "epoch": 0.29, + "grad_norm": 1.3093957950774973, + "learning_rate": 8.289026556761064e-06, + "loss": 0.3267, + "step": 10114 + }, + { + "epoch": 0.29, + "grad_norm": 1.5986874309840813, + "learning_rate": 8.288672764079613e-06, + "loss": 0.3593, + "step": 10115 + }, + { + "epoch": 0.29, + "grad_norm": 1.7056126099749365, + "learning_rate": 8.288318942375588e-06, + "loss": 0.3517, + "step": 10116 + }, + { + "epoch": 0.29, + "grad_norm": 1.7055562672718385, + "learning_rate": 8.287965091652111e-06, + "loss": 0.339, + "step": 10117 + }, + { + "epoch": 0.29, + "grad_norm": 1.4268322788805354, + "learning_rate": 8.287611211912307e-06, + "loss": 0.392, + "step": 10118 + }, + { + "epoch": 0.29, + "grad_norm": 1.5393145301880424, + "learning_rate": 8.287257303159297e-06, + "loss": 0.3433, + "step": 10119 + }, + { + "epoch": 0.29, + "grad_norm": 1.6419530712213752, + "learning_rate": 8.286903365396205e-06, + "loss": 0.3542, + "step": 10120 + }, + { + "epoch": 0.29, + "grad_norm": 1.4159616304000062, + "learning_rate": 8.286549398626153e-06, + "loss": 0.4134, + "step": 10121 + }, + { + "epoch": 0.29, + "grad_norm": 1.1867017180699764, + "learning_rate": 8.286195402852268e-06, + "loss": 0.6431, + "step": 10122 + }, + { + "epoch": 0.29, + "grad_norm": 1.7138026063897334, + "learning_rate": 8.28584137807767e-06, + "loss": 0.3451, + "step": 10123 + }, + { + "epoch": 0.29, + "grad_norm": 1.314705482110517, + "learning_rate": 8.285487324305488e-06, + "loss": 0.3297, + "step": 10124 + }, + { + "epoch": 0.29, + "grad_norm": 1.4121575780252271, + "learning_rate": 8.285133241538843e-06, + "loss": 0.3791, + "step": 10125 + }, + { + "epoch": 0.29, + "grad_norm": 1.406011087879346, + "learning_rate": 8.28477912978086e-06, + "loss": 0.4019, + "step": 10126 + }, + { + "epoch": 0.29, + "grad_norm": 1.403999405089019, + "learning_rate": 8.284424989034665e-06, + "loss": 0.3434, + "step": 10127 + }, + { + "epoch": 0.29, + "grad_norm": 1.2879533270862094, + "learning_rate": 8.284070819303383e-06, + "loss": 0.3401, + "step": 10128 + }, + { + "epoch": 0.29, + "grad_norm": 1.3317388597679538, + "learning_rate": 8.28371662059014e-06, + "loss": 0.3574, + "step": 10129 + }, + { + "epoch": 0.29, + "grad_norm": 1.4542130598594227, + "learning_rate": 8.28336239289806e-06, + "loss": 0.3554, + "step": 10130 + }, + { + "epoch": 0.29, + "grad_norm": 1.4736167398081426, + "learning_rate": 8.283008136230271e-06, + "loss": 0.3474, + "step": 10131 + }, + { + "epoch": 0.29, + "grad_norm": 1.4312404649487533, + "learning_rate": 8.2826538505899e-06, + "loss": 0.3391, + "step": 10132 + }, + { + "epoch": 0.29, + "grad_norm": 1.486504153971521, + "learning_rate": 8.282299535980071e-06, + "loss": 0.3427, + "step": 10133 + }, + { + "epoch": 0.29, + "grad_norm": 1.3352801671290517, + "learning_rate": 8.281945192403914e-06, + "loss": 0.3314, + "step": 10134 + }, + { + "epoch": 0.29, + "grad_norm": 1.384904238475193, + "learning_rate": 8.281590819864553e-06, + "loss": 0.3442, + "step": 10135 + }, + { + "epoch": 0.29, + "grad_norm": 1.4029150971020463, + "learning_rate": 8.281236418365117e-06, + "loss": 0.3501, + "step": 10136 + }, + { + "epoch": 0.29, + "grad_norm": 1.447119496927366, + "learning_rate": 8.280881987908733e-06, + "loss": 0.3469, + "step": 10137 + }, + { + "epoch": 0.29, + "grad_norm": 1.5679503844822607, + "learning_rate": 8.280527528498529e-06, + "loss": 0.3638, + "step": 10138 + }, + { + "epoch": 0.29, + "grad_norm": 1.0414467419201332, + "learning_rate": 8.280173040137632e-06, + "loss": 0.6329, + "step": 10139 + }, + { + "epoch": 0.29, + "grad_norm": 1.4092017788237883, + "learning_rate": 8.279818522829173e-06, + "loss": 0.4017, + "step": 10140 + }, + { + "epoch": 0.29, + "grad_norm": 2.1733484490003905, + "learning_rate": 8.279463976576279e-06, + "loss": 0.3472, + "step": 10141 + }, + { + "epoch": 0.29, + "grad_norm": 1.4608913254653777, + "learning_rate": 8.27910940138208e-06, + "loss": 0.3393, + "step": 10142 + }, + { + "epoch": 0.29, + "grad_norm": 1.3727455967582836, + "learning_rate": 8.278754797249702e-06, + "loss": 0.3566, + "step": 10143 + }, + { + "epoch": 0.29, + "grad_norm": 1.4152396382998285, + "learning_rate": 8.278400164182277e-06, + "loss": 0.3565, + "step": 10144 + }, + { + "epoch": 0.29, + "grad_norm": 1.3249914234374405, + "learning_rate": 8.278045502182935e-06, + "loss": 0.3535, + "step": 10145 + }, + { + "epoch": 0.29, + "grad_norm": 1.4703458606352695, + "learning_rate": 8.277690811254805e-06, + "loss": 0.3416, + "step": 10146 + }, + { + "epoch": 0.29, + "grad_norm": 1.467966348275807, + "learning_rate": 8.277336091401016e-06, + "loss": 0.3386, + "step": 10147 + }, + { + "epoch": 0.29, + "grad_norm": 1.6036952519605128, + "learning_rate": 8.276981342624702e-06, + "loss": 0.3633, + "step": 10148 + }, + { + "epoch": 0.29, + "grad_norm": 1.6408860998711907, + "learning_rate": 8.276626564928991e-06, + "loss": 0.3493, + "step": 10149 + }, + { + "epoch": 0.29, + "grad_norm": 1.5302136511459148, + "learning_rate": 8.276271758317012e-06, + "loss": 0.3711, + "step": 10150 + }, + { + "epoch": 0.29, + "grad_norm": 1.3155875502467593, + "learning_rate": 8.2759169227919e-06, + "loss": 0.3368, + "step": 10151 + }, + { + "epoch": 0.29, + "grad_norm": 2.123694716096125, + "learning_rate": 8.275562058356782e-06, + "loss": 0.3462, + "step": 10152 + }, + { + "epoch": 0.29, + "grad_norm": 1.984028641147327, + "learning_rate": 8.275207165014797e-06, + "loss": 0.3382, + "step": 10153 + }, + { + "epoch": 0.29, + "grad_norm": 1.3782663527755308, + "learning_rate": 8.27485224276907e-06, + "loss": 0.3369, + "step": 10154 + }, + { + "epoch": 0.29, + "grad_norm": 1.5331777981452557, + "learning_rate": 8.274497291622736e-06, + "loss": 0.3675, + "step": 10155 + }, + { + "epoch": 0.29, + "grad_norm": 1.5395521068412088, + "learning_rate": 8.274142311578926e-06, + "loss": 0.3389, + "step": 10156 + }, + { + "epoch": 0.29, + "grad_norm": 1.839061627763447, + "learning_rate": 8.273787302640776e-06, + "loss": 0.3932, + "step": 10157 + }, + { + "epoch": 0.29, + "grad_norm": 1.9180511954292239, + "learning_rate": 8.273432264811416e-06, + "loss": 0.3688, + "step": 10158 + }, + { + "epoch": 0.29, + "grad_norm": 0.95160675993526, + "learning_rate": 8.273077198093978e-06, + "loss": 0.5684, + "step": 10159 + }, + { + "epoch": 0.29, + "grad_norm": 1.4467744920213876, + "learning_rate": 8.2727221024916e-06, + "loss": 0.3611, + "step": 10160 + }, + { + "epoch": 0.29, + "grad_norm": 1.3746572562881734, + "learning_rate": 8.272366978007412e-06, + "loss": 0.3635, + "step": 10161 + }, + { + "epoch": 0.29, + "grad_norm": 1.8281199442951892, + "learning_rate": 8.272011824644548e-06, + "loss": 0.3661, + "step": 10162 + }, + { + "epoch": 0.29, + "grad_norm": 2.7933104742802506, + "learning_rate": 8.271656642406146e-06, + "loss": 0.353, + "step": 10163 + }, + { + "epoch": 0.29, + "grad_norm": 1.3983443521859227, + "learning_rate": 8.271301431295336e-06, + "loss": 0.3685, + "step": 10164 + }, + { + "epoch": 0.29, + "grad_norm": 1.5033803855396046, + "learning_rate": 8.270946191315254e-06, + "loss": 0.3558, + "step": 10165 + }, + { + "epoch": 0.29, + "grad_norm": 1.4906453754233482, + "learning_rate": 8.270590922469037e-06, + "loss": 0.345, + "step": 10166 + }, + { + "epoch": 0.29, + "grad_norm": 1.3753156231601416, + "learning_rate": 8.27023562475982e-06, + "loss": 0.3861, + "step": 10167 + }, + { + "epoch": 0.29, + "grad_norm": 1.5404934941148674, + "learning_rate": 8.269880298190733e-06, + "loss": 0.3195, + "step": 10168 + }, + { + "epoch": 0.29, + "grad_norm": 1.3115793391204154, + "learning_rate": 8.26952494276492e-06, + "loss": 0.3341, + "step": 10169 + }, + { + "epoch": 0.29, + "grad_norm": 1.4617457674092385, + "learning_rate": 8.269169558485511e-06, + "loss": 0.347, + "step": 10170 + }, + { + "epoch": 0.3, + "grad_norm": 1.3887740188842002, + "learning_rate": 8.268814145355646e-06, + "loss": 0.3406, + "step": 10171 + }, + { + "epoch": 0.3, + "grad_norm": 1.6961407343980903, + "learning_rate": 8.268458703378459e-06, + "loss": 0.3752, + "step": 10172 + }, + { + "epoch": 0.3, + "grad_norm": 1.4181724044365913, + "learning_rate": 8.268103232557088e-06, + "loss": 0.3476, + "step": 10173 + }, + { + "epoch": 0.3, + "grad_norm": 3.0641262207653166, + "learning_rate": 8.267747732894669e-06, + "loss": 0.3649, + "step": 10174 + }, + { + "epoch": 0.3, + "grad_norm": 1.4088989674081733, + "learning_rate": 8.26739220439434e-06, + "loss": 0.3415, + "step": 10175 + }, + { + "epoch": 0.3, + "grad_norm": 1.4937996270440297, + "learning_rate": 8.26703664705924e-06, + "loss": 0.3497, + "step": 10176 + }, + { + "epoch": 0.3, + "grad_norm": 0.8644227777797313, + "learning_rate": 8.266681060892505e-06, + "loss": 0.6066, + "step": 10177 + }, + { + "epoch": 0.3, + "grad_norm": 1.656158656034856, + "learning_rate": 8.266325445897272e-06, + "loss": 0.3557, + "step": 10178 + }, + { + "epoch": 0.3, + "grad_norm": 1.5825085247986654, + "learning_rate": 8.265969802076682e-06, + "loss": 0.3589, + "step": 10179 + }, + { + "epoch": 0.3, + "grad_norm": 1.833084789081252, + "learning_rate": 8.265614129433872e-06, + "loss": 0.3619, + "step": 10180 + }, + { + "epoch": 0.3, + "grad_norm": 1.527754775385922, + "learning_rate": 8.26525842797198e-06, + "loss": 0.3543, + "step": 10181 + }, + { + "epoch": 0.3, + "grad_norm": 1.7739185789161953, + "learning_rate": 8.264902697694149e-06, + "loss": 0.3382, + "step": 10182 + }, + { + "epoch": 0.3, + "grad_norm": 1.8521757739386107, + "learning_rate": 8.264546938603512e-06, + "loss": 0.3571, + "step": 10183 + }, + { + "epoch": 0.3, + "grad_norm": 1.4618611677639688, + "learning_rate": 8.264191150703214e-06, + "loss": 0.3572, + "step": 10184 + }, + { + "epoch": 0.3, + "grad_norm": 1.4294424151150125, + "learning_rate": 8.263835333996393e-06, + "loss": 0.3491, + "step": 10185 + }, + { + "epoch": 0.3, + "grad_norm": 1.661832412743687, + "learning_rate": 8.263479488486188e-06, + "loss": 0.3611, + "step": 10186 + }, + { + "epoch": 0.3, + "grad_norm": 1.7775386261342134, + "learning_rate": 8.263123614175742e-06, + "loss": 0.3562, + "step": 10187 + }, + { + "epoch": 0.3, + "grad_norm": 1.3512503606451258, + "learning_rate": 8.26276771106819e-06, + "loss": 0.3506, + "step": 10188 + }, + { + "epoch": 0.3, + "grad_norm": 1.8105607276126512, + "learning_rate": 8.262411779166681e-06, + "loss": 0.333, + "step": 10189 + }, + { + "epoch": 0.3, + "grad_norm": 1.483041298529285, + "learning_rate": 8.26205581847435e-06, + "loss": 0.3433, + "step": 10190 + }, + { + "epoch": 0.3, + "grad_norm": 1.336452849076757, + "learning_rate": 8.26169982899434e-06, + "loss": 0.33, + "step": 10191 + }, + { + "epoch": 0.3, + "grad_norm": 1.3903173566377558, + "learning_rate": 8.261343810729794e-06, + "loss": 0.329, + "step": 10192 + }, + { + "epoch": 0.3, + "grad_norm": 1.5232427063918779, + "learning_rate": 8.260987763683852e-06, + "loss": 0.3324, + "step": 10193 + }, + { + "epoch": 0.3, + "grad_norm": 1.410843879835699, + "learning_rate": 8.260631687859658e-06, + "loss": 0.3535, + "step": 10194 + }, + { + "epoch": 0.3, + "grad_norm": 1.465652336404214, + "learning_rate": 8.260275583260351e-06, + "loss": 0.3292, + "step": 10195 + }, + { + "epoch": 0.3, + "grad_norm": 1.6403171973593782, + "learning_rate": 8.259919449889075e-06, + "loss": 0.3559, + "step": 10196 + }, + { + "epoch": 0.3, + "grad_norm": 1.469907730878031, + "learning_rate": 8.259563287748978e-06, + "loss": 0.3576, + "step": 10197 + }, + { + "epoch": 0.3, + "grad_norm": 1.7773354268253092, + "learning_rate": 8.259207096843194e-06, + "loss": 0.3591, + "step": 10198 + }, + { + "epoch": 0.3, + "grad_norm": 2.595063470940391, + "learning_rate": 8.258850877174875e-06, + "loss": 0.3821, + "step": 10199 + }, + { + "epoch": 0.3, + "grad_norm": 1.5432565636622915, + "learning_rate": 8.258494628747159e-06, + "loss": 0.3484, + "step": 10200 + }, + { + "epoch": 0.3, + "grad_norm": 1.9430279273050537, + "learning_rate": 8.258138351563192e-06, + "loss": 0.35, + "step": 10201 + }, + { + "epoch": 0.3, + "grad_norm": 1.5088512075090688, + "learning_rate": 8.257782045626117e-06, + "loss": 0.3281, + "step": 10202 + }, + { + "epoch": 0.3, + "grad_norm": 1.4006710228230395, + "learning_rate": 8.257425710939083e-06, + "loss": 0.3419, + "step": 10203 + }, + { + "epoch": 0.3, + "grad_norm": 1.4089675799271137, + "learning_rate": 8.257069347505227e-06, + "loss": 0.339, + "step": 10204 + }, + { + "epoch": 0.3, + "grad_norm": 1.449713106606803, + "learning_rate": 8.2567129553277e-06, + "loss": 0.3368, + "step": 10205 + }, + { + "epoch": 0.3, + "grad_norm": 1.6236084137213358, + "learning_rate": 8.256356534409645e-06, + "loss": 0.3645, + "step": 10206 + }, + { + "epoch": 0.3, + "grad_norm": 1.6091831478640521, + "learning_rate": 8.256000084754205e-06, + "loss": 0.3504, + "step": 10207 + }, + { + "epoch": 0.3, + "grad_norm": 1.316615228393381, + "learning_rate": 8.25564360636453e-06, + "loss": 0.3361, + "step": 10208 + }, + { + "epoch": 0.3, + "grad_norm": 1.3089145644414388, + "learning_rate": 8.255287099243764e-06, + "loss": 0.3586, + "step": 10209 + }, + { + "epoch": 0.3, + "grad_norm": 1.4446228803973604, + "learning_rate": 8.254930563395054e-06, + "loss": 0.3584, + "step": 10210 + }, + { + "epoch": 0.3, + "grad_norm": 1.553367599526075, + "learning_rate": 8.254573998821543e-06, + "loss": 0.3523, + "step": 10211 + }, + { + "epoch": 0.3, + "grad_norm": 4.373403623046646, + "learning_rate": 8.254217405526383e-06, + "loss": 0.3634, + "step": 10212 + }, + { + "epoch": 0.3, + "grad_norm": 1.5247986437024255, + "learning_rate": 8.253860783512718e-06, + "loss": 0.3434, + "step": 10213 + }, + { + "epoch": 0.3, + "grad_norm": 1.5351474763132937, + "learning_rate": 8.253504132783694e-06, + "loss": 0.3382, + "step": 10214 + }, + { + "epoch": 0.3, + "grad_norm": 1.4288223851900406, + "learning_rate": 8.253147453342461e-06, + "loss": 0.3382, + "step": 10215 + }, + { + "epoch": 0.3, + "grad_norm": 1.5978352569239538, + "learning_rate": 8.252790745192164e-06, + "loss": 0.3271, + "step": 10216 + }, + { + "epoch": 0.3, + "grad_norm": 1.6405267507304526, + "learning_rate": 8.252434008335955e-06, + "loss": 0.35, + "step": 10217 + }, + { + "epoch": 0.3, + "grad_norm": 1.469080680717589, + "learning_rate": 8.25207724277698e-06, + "loss": 0.4257, + "step": 10218 + }, + { + "epoch": 0.3, + "grad_norm": 1.3737045055515262, + "learning_rate": 8.251720448518385e-06, + "loss": 0.347, + "step": 10219 + }, + { + "epoch": 0.3, + "grad_norm": 1.5823914727930775, + "learning_rate": 8.251363625563323e-06, + "loss": 0.3766, + "step": 10220 + }, + { + "epoch": 0.3, + "grad_norm": 1.6451900624702003, + "learning_rate": 8.25100677391494e-06, + "loss": 0.3368, + "step": 10221 + }, + { + "epoch": 0.3, + "grad_norm": 1.6319643860971424, + "learning_rate": 8.250649893576385e-06, + "loss": 0.3527, + "step": 10222 + }, + { + "epoch": 0.3, + "grad_norm": 1.201053447849022, + "learning_rate": 8.250292984550812e-06, + "loss": 0.3314, + "step": 10223 + }, + { + "epoch": 0.3, + "grad_norm": 1.4340349107654065, + "learning_rate": 8.249936046841365e-06, + "loss": 0.377, + "step": 10224 + }, + { + "epoch": 0.3, + "grad_norm": 1.3117122304138904, + "learning_rate": 8.249579080451195e-06, + "loss": 0.3521, + "step": 10225 + }, + { + "epoch": 0.3, + "grad_norm": 1.4359438346823215, + "learning_rate": 8.249222085383456e-06, + "loss": 0.3501, + "step": 10226 + }, + { + "epoch": 0.3, + "grad_norm": 1.4428580431752391, + "learning_rate": 8.248865061641294e-06, + "loss": 0.3685, + "step": 10227 + }, + { + "epoch": 0.3, + "grad_norm": 1.3981533241034783, + "learning_rate": 8.248508009227864e-06, + "loss": 0.3962, + "step": 10228 + }, + { + "epoch": 0.3, + "grad_norm": 1.4112717052045571, + "learning_rate": 8.248150928146313e-06, + "loss": 0.3387, + "step": 10229 + }, + { + "epoch": 0.3, + "grad_norm": 1.3890892110727713, + "learning_rate": 8.247793818399795e-06, + "loss": 0.3385, + "step": 10230 + }, + { + "epoch": 0.3, + "grad_norm": 1.369440183312026, + "learning_rate": 8.24743667999146e-06, + "loss": 0.3429, + "step": 10231 + }, + { + "epoch": 0.3, + "grad_norm": 1.5558657250509325, + "learning_rate": 8.24707951292446e-06, + "loss": 0.3377, + "step": 10232 + }, + { + "epoch": 0.3, + "grad_norm": 1.4425520223117754, + "learning_rate": 8.24672231720195e-06, + "loss": 0.3627, + "step": 10233 + }, + { + "epoch": 0.3, + "grad_norm": 1.365824351892146, + "learning_rate": 8.246365092827076e-06, + "loss": 0.3509, + "step": 10234 + }, + { + "epoch": 0.3, + "grad_norm": 1.4105695065569013, + "learning_rate": 8.246007839802997e-06, + "loss": 0.3473, + "step": 10235 + }, + { + "epoch": 0.3, + "grad_norm": 1.3804626930791422, + "learning_rate": 8.24565055813286e-06, + "loss": 0.3291, + "step": 10236 + }, + { + "epoch": 0.3, + "grad_norm": 1.4545919424696347, + "learning_rate": 8.24529324781982e-06, + "loss": 0.3503, + "step": 10237 + }, + { + "epoch": 0.3, + "grad_norm": 1.3668843168927125, + "learning_rate": 8.244935908867035e-06, + "loss": 0.3556, + "step": 10238 + }, + { + "epoch": 0.3, + "grad_norm": 1.7210904534635416, + "learning_rate": 8.244578541277652e-06, + "loss": 0.3574, + "step": 10239 + }, + { + "epoch": 0.3, + "grad_norm": 1.46635478087516, + "learning_rate": 8.24422114505483e-06, + "loss": 0.3694, + "step": 10240 + }, + { + "epoch": 0.3, + "grad_norm": 1.2771415709518377, + "learning_rate": 8.243863720201716e-06, + "loss": 0.3318, + "step": 10241 + }, + { + "epoch": 0.3, + "grad_norm": 1.8344518204867843, + "learning_rate": 8.243506266721473e-06, + "loss": 0.3609, + "step": 10242 + }, + { + "epoch": 0.3, + "grad_norm": 1.6433756091521923, + "learning_rate": 8.243148784617248e-06, + "loss": 0.3301, + "step": 10243 + }, + { + "epoch": 0.3, + "grad_norm": 1.5337425145785042, + "learning_rate": 8.242791273892202e-06, + "loss": 0.3695, + "step": 10244 + }, + { + "epoch": 0.3, + "grad_norm": 1.352104975505497, + "learning_rate": 8.242433734549484e-06, + "loss": 0.3447, + "step": 10245 + }, + { + "epoch": 0.3, + "grad_norm": 1.5485109634513055, + "learning_rate": 8.242076166592254e-06, + "loss": 0.3454, + "step": 10246 + }, + { + "epoch": 0.3, + "grad_norm": 2.627821671137143, + "learning_rate": 8.241718570023667e-06, + "loss": 0.3703, + "step": 10247 + }, + { + "epoch": 0.3, + "grad_norm": 1.308504066195415, + "learning_rate": 8.241360944846876e-06, + "loss": 0.3181, + "step": 10248 + }, + { + "epoch": 0.3, + "grad_norm": 1.4989204615898322, + "learning_rate": 8.241003291065038e-06, + "loss": 0.3478, + "step": 10249 + }, + { + "epoch": 0.3, + "grad_norm": 1.4098383048741847, + "learning_rate": 8.240645608681312e-06, + "loss": 0.3508, + "step": 10250 + }, + { + "epoch": 0.3, + "grad_norm": 1.2841426123108204, + "learning_rate": 8.24028789769885e-06, + "loss": 0.3504, + "step": 10251 + }, + { + "epoch": 0.3, + "grad_norm": 1.3178267155363201, + "learning_rate": 8.239930158120813e-06, + "loss": 0.3266, + "step": 10252 + }, + { + "epoch": 0.3, + "grad_norm": 1.4001616845033207, + "learning_rate": 8.239572389950357e-06, + "loss": 0.3649, + "step": 10253 + }, + { + "epoch": 0.3, + "grad_norm": 1.3291468517455218, + "learning_rate": 8.239214593190638e-06, + "loss": 0.3476, + "step": 10254 + }, + { + "epoch": 0.3, + "grad_norm": 1.6500695537015546, + "learning_rate": 8.238856767844814e-06, + "loss": 0.3634, + "step": 10255 + }, + { + "epoch": 0.3, + "grad_norm": 1.4081126723458826, + "learning_rate": 8.238498913916045e-06, + "loss": 0.3626, + "step": 10256 + }, + { + "epoch": 0.3, + "grad_norm": 1.4151941869225202, + "learning_rate": 8.238141031407484e-06, + "loss": 0.3457, + "step": 10257 + }, + { + "epoch": 0.3, + "grad_norm": 1.3335891542504508, + "learning_rate": 8.237783120322293e-06, + "loss": 0.3407, + "step": 10258 + }, + { + "epoch": 0.3, + "grad_norm": 1.402793797156293, + "learning_rate": 8.237425180663632e-06, + "loss": 0.3405, + "step": 10259 + }, + { + "epoch": 0.3, + "grad_norm": 1.3525958211151723, + "learning_rate": 8.237067212434658e-06, + "loss": 0.3351, + "step": 10260 + }, + { + "epoch": 0.3, + "grad_norm": 1.3096164350357948, + "learning_rate": 8.236709215638529e-06, + "loss": 0.3391, + "step": 10261 + }, + { + "epoch": 0.3, + "grad_norm": 1.519302938944519, + "learning_rate": 8.236351190278405e-06, + "loss": 0.3885, + "step": 10262 + }, + { + "epoch": 0.3, + "grad_norm": 1.3171106992720192, + "learning_rate": 8.235993136357445e-06, + "loss": 0.3397, + "step": 10263 + }, + { + "epoch": 0.3, + "grad_norm": 1.4031176056399108, + "learning_rate": 8.23563505387881e-06, + "loss": 0.3351, + "step": 10264 + }, + { + "epoch": 0.3, + "grad_norm": 1.358459162699489, + "learning_rate": 8.235276942845662e-06, + "loss": 0.3376, + "step": 10265 + }, + { + "epoch": 0.3, + "grad_norm": 1.477019914626966, + "learning_rate": 8.234918803261157e-06, + "loss": 0.3458, + "step": 10266 + }, + { + "epoch": 0.3, + "grad_norm": 1.344951185698881, + "learning_rate": 8.234560635128458e-06, + "loss": 0.3616, + "step": 10267 + }, + { + "epoch": 0.3, + "grad_norm": 1.9753032902039358, + "learning_rate": 8.234202438450725e-06, + "loss": 0.3447, + "step": 10268 + }, + { + "epoch": 0.3, + "grad_norm": 1.5159359821034668, + "learning_rate": 8.233844213231121e-06, + "loss": 0.3269, + "step": 10269 + }, + { + "epoch": 0.3, + "grad_norm": 1.5425779188270832, + "learning_rate": 8.233485959472804e-06, + "loss": 0.3483, + "step": 10270 + }, + { + "epoch": 0.3, + "grad_norm": 1.4118998485508187, + "learning_rate": 8.23312767717894e-06, + "loss": 0.3343, + "step": 10271 + }, + { + "epoch": 0.3, + "grad_norm": 1.3626363589554658, + "learning_rate": 8.232769366352687e-06, + "loss": 0.339, + "step": 10272 + }, + { + "epoch": 0.3, + "grad_norm": 1.3181005872315492, + "learning_rate": 8.232411026997208e-06, + "loss": 0.3314, + "step": 10273 + }, + { + "epoch": 0.3, + "grad_norm": 1.5333463702351913, + "learning_rate": 8.232052659115668e-06, + "loss": 0.3285, + "step": 10274 + }, + { + "epoch": 0.3, + "grad_norm": 1.4595391429169051, + "learning_rate": 8.231694262711226e-06, + "loss": 0.3362, + "step": 10275 + }, + { + "epoch": 0.3, + "grad_norm": 1.4980997829753484, + "learning_rate": 8.231335837787045e-06, + "loss": 0.3657, + "step": 10276 + }, + { + "epoch": 0.3, + "grad_norm": 1.4388129113702792, + "learning_rate": 8.23097738434629e-06, + "loss": 0.3541, + "step": 10277 + }, + { + "epoch": 0.3, + "grad_norm": 1.732872482727585, + "learning_rate": 8.230618902392125e-06, + "loss": 0.3513, + "step": 10278 + }, + { + "epoch": 0.3, + "grad_norm": 1.3047658018322776, + "learning_rate": 8.230260391927712e-06, + "loss": 0.3358, + "step": 10279 + }, + { + "epoch": 0.3, + "grad_norm": 1.376116585781859, + "learning_rate": 8.229901852956215e-06, + "loss": 0.3565, + "step": 10280 + }, + { + "epoch": 0.3, + "grad_norm": 4.176811430910902, + "learning_rate": 8.229543285480797e-06, + "loss": 0.3815, + "step": 10281 + }, + { + "epoch": 0.3, + "grad_norm": 0.9168396933669516, + "learning_rate": 8.229184689504626e-06, + "loss": 0.6192, + "step": 10282 + }, + { + "epoch": 0.3, + "grad_norm": 1.4544842240453875, + "learning_rate": 8.228826065030862e-06, + "loss": 0.3582, + "step": 10283 + }, + { + "epoch": 0.3, + "grad_norm": 1.622231090329387, + "learning_rate": 8.228467412062675e-06, + "loss": 0.3679, + "step": 10284 + }, + { + "epoch": 0.3, + "grad_norm": 1.5072907760331078, + "learning_rate": 8.228108730603224e-06, + "loss": 0.3532, + "step": 10285 + }, + { + "epoch": 0.3, + "grad_norm": 1.2748515276203056, + "learning_rate": 8.227750020655679e-06, + "loss": 0.3404, + "step": 10286 + }, + { + "epoch": 0.3, + "grad_norm": 1.370660854146262, + "learning_rate": 8.227391282223205e-06, + "loss": 0.3331, + "step": 10287 + }, + { + "epoch": 0.3, + "grad_norm": 1.3380510488479744, + "learning_rate": 8.227032515308968e-06, + "loss": 0.3133, + "step": 10288 + }, + { + "epoch": 0.3, + "grad_norm": 1.5405081244435754, + "learning_rate": 8.226673719916131e-06, + "loss": 0.3602, + "step": 10289 + }, + { + "epoch": 0.3, + "grad_norm": 1.670755533273643, + "learning_rate": 8.226314896047863e-06, + "loss": 0.3838, + "step": 10290 + }, + { + "epoch": 0.3, + "grad_norm": 1.68195113008621, + "learning_rate": 8.22595604370733e-06, + "loss": 0.3439, + "step": 10291 + }, + { + "epoch": 0.3, + "grad_norm": 1.5306392123329169, + "learning_rate": 8.2255971628977e-06, + "loss": 0.3323, + "step": 10292 + }, + { + "epoch": 0.3, + "grad_norm": 0.9757869363734091, + "learning_rate": 8.225238253622142e-06, + "loss": 0.6361, + "step": 10293 + }, + { + "epoch": 0.3, + "grad_norm": 1.3681164198268962, + "learning_rate": 8.224879315883816e-06, + "loss": 0.3777, + "step": 10294 + }, + { + "epoch": 0.3, + "grad_norm": 1.4063815717464303, + "learning_rate": 8.224520349685896e-06, + "loss": 0.3858, + "step": 10295 + }, + { + "epoch": 0.3, + "grad_norm": 4.770991325529832, + "learning_rate": 8.224161355031547e-06, + "loss": 0.3279, + "step": 10296 + }, + { + "epoch": 0.3, + "grad_norm": 1.5216935082726466, + "learning_rate": 8.22380233192394e-06, + "loss": 0.3635, + "step": 10297 + }, + { + "epoch": 0.3, + "grad_norm": 1.4521578639296826, + "learning_rate": 8.223443280366241e-06, + "loss": 0.3814, + "step": 10298 + }, + { + "epoch": 0.3, + "grad_norm": 2.906373558099301, + "learning_rate": 8.223084200361619e-06, + "loss": 0.3546, + "step": 10299 + }, + { + "epoch": 0.3, + "grad_norm": 2.643755771601498, + "learning_rate": 8.222725091913244e-06, + "loss": 0.3667, + "step": 10300 + }, + { + "epoch": 0.3, + "grad_norm": 1.5565142035472268, + "learning_rate": 8.222365955024283e-06, + "loss": 0.3384, + "step": 10301 + }, + { + "epoch": 0.3, + "grad_norm": 1.536856165815387, + "learning_rate": 8.222006789697909e-06, + "loss": 0.3553, + "step": 10302 + }, + { + "epoch": 0.3, + "grad_norm": 1.8841039404611497, + "learning_rate": 8.221647595937287e-06, + "loss": 0.3711, + "step": 10303 + }, + { + "epoch": 0.3, + "grad_norm": 1.530367528325546, + "learning_rate": 8.221288373745591e-06, + "loss": 0.3363, + "step": 10304 + }, + { + "epoch": 0.3, + "grad_norm": 1.3451473785383028, + "learning_rate": 8.220929123125989e-06, + "loss": 0.3328, + "step": 10305 + }, + { + "epoch": 0.3, + "grad_norm": 1.6389198583538551, + "learning_rate": 8.220569844081651e-06, + "loss": 0.3451, + "step": 10306 + }, + { + "epoch": 0.3, + "grad_norm": 1.523293380109933, + "learning_rate": 8.220210536615749e-06, + "loss": 0.3441, + "step": 10307 + }, + { + "epoch": 0.3, + "grad_norm": 1.5967180883539935, + "learning_rate": 8.219851200731452e-06, + "loss": 0.3332, + "step": 10308 + }, + { + "epoch": 0.3, + "grad_norm": 1.365205769603551, + "learning_rate": 8.219491836431934e-06, + "loss": 0.402, + "step": 10309 + }, + { + "epoch": 0.3, + "grad_norm": 1.406800932977689, + "learning_rate": 8.219132443720364e-06, + "loss": 0.3699, + "step": 10310 + }, + { + "epoch": 0.3, + "grad_norm": 1.840909101867468, + "learning_rate": 8.218773022599916e-06, + "loss": 0.3532, + "step": 10311 + }, + { + "epoch": 0.3, + "grad_norm": 1.4220789537303524, + "learning_rate": 8.218413573073759e-06, + "loss": 0.3605, + "step": 10312 + }, + { + "epoch": 0.3, + "grad_norm": 1.6221466747225415, + "learning_rate": 8.218054095145066e-06, + "loss": 0.3413, + "step": 10313 + }, + { + "epoch": 0.3, + "grad_norm": 1.4987175198943905, + "learning_rate": 8.21769458881701e-06, + "loss": 0.3357, + "step": 10314 + }, + { + "epoch": 0.3, + "grad_norm": 2.274541755276717, + "learning_rate": 8.217335054092763e-06, + "loss": 0.3302, + "step": 10315 + }, + { + "epoch": 0.3, + "grad_norm": 1.4211255895818848, + "learning_rate": 8.2169754909755e-06, + "loss": 0.335, + "step": 10316 + }, + { + "epoch": 0.3, + "grad_norm": 1.5302552006674517, + "learning_rate": 8.216615899468393e-06, + "loss": 0.3518, + "step": 10317 + }, + { + "epoch": 0.3, + "grad_norm": 1.3046791273684752, + "learning_rate": 8.216256279574614e-06, + "loss": 0.3236, + "step": 10318 + }, + { + "epoch": 0.3, + "grad_norm": 1.6501837793857668, + "learning_rate": 8.215896631297338e-06, + "loss": 0.3477, + "step": 10319 + }, + { + "epoch": 0.3, + "grad_norm": 1.352936517323275, + "learning_rate": 8.215536954639738e-06, + "loss": 0.3592, + "step": 10320 + }, + { + "epoch": 0.3, + "grad_norm": 2.0112573855623594, + "learning_rate": 8.215177249604988e-06, + "loss": 0.3773, + "step": 10321 + }, + { + "epoch": 0.3, + "grad_norm": 1.6891883798984935, + "learning_rate": 8.214817516196265e-06, + "loss": 0.3462, + "step": 10322 + }, + { + "epoch": 0.3, + "grad_norm": 1.0524623424756345, + "learning_rate": 8.21445775441674e-06, + "loss": 0.6107, + "step": 10323 + }, + { + "epoch": 0.3, + "grad_norm": 1.609541381680804, + "learning_rate": 8.214097964269592e-06, + "loss": 0.3366, + "step": 10324 + }, + { + "epoch": 0.3, + "grad_norm": 1.5840970785708368, + "learning_rate": 8.213738145757992e-06, + "loss": 0.3872, + "step": 10325 + }, + { + "epoch": 0.3, + "grad_norm": 1.4936651603459388, + "learning_rate": 8.213378298885116e-06, + "loss": 0.3572, + "step": 10326 + }, + { + "epoch": 0.3, + "grad_norm": 1.4278075115961397, + "learning_rate": 8.213018423654144e-06, + "loss": 0.3316, + "step": 10327 + }, + { + "epoch": 0.3, + "grad_norm": 1.4650199186804151, + "learning_rate": 8.212658520068247e-06, + "loss": 0.3362, + "step": 10328 + }, + { + "epoch": 0.3, + "grad_norm": 1.6731906557069425, + "learning_rate": 8.212298588130602e-06, + "loss": 0.3503, + "step": 10329 + }, + { + "epoch": 0.3, + "grad_norm": 1.3976944947075391, + "learning_rate": 8.211938627844389e-06, + "loss": 0.3391, + "step": 10330 + }, + { + "epoch": 0.3, + "grad_norm": 1.5087349800009227, + "learning_rate": 8.21157863921278e-06, + "loss": 0.3393, + "step": 10331 + }, + { + "epoch": 0.3, + "grad_norm": 1.3492243236076187, + "learning_rate": 8.211218622238954e-06, + "loss": 0.3484, + "step": 10332 + }, + { + "epoch": 0.3, + "grad_norm": 1.6894632602086739, + "learning_rate": 8.210858576926088e-06, + "loss": 0.3295, + "step": 10333 + }, + { + "epoch": 0.3, + "grad_norm": 1.6703041410589732, + "learning_rate": 8.21049850327736e-06, + "loss": 0.3551, + "step": 10334 + }, + { + "epoch": 0.3, + "grad_norm": 1.6889370816527467, + "learning_rate": 8.210138401295947e-06, + "loss": 0.3582, + "step": 10335 + }, + { + "epoch": 0.3, + "grad_norm": 1.4950247948620816, + "learning_rate": 8.209778270985027e-06, + "loss": 0.3573, + "step": 10336 + }, + { + "epoch": 0.3, + "grad_norm": 1.461856337443593, + "learning_rate": 8.209418112347778e-06, + "loss": 0.3566, + "step": 10337 + }, + { + "epoch": 0.3, + "grad_norm": 1.357741788548424, + "learning_rate": 8.209057925387377e-06, + "loss": 0.3524, + "step": 10338 + }, + { + "epoch": 0.3, + "grad_norm": 1.2939187164737662, + "learning_rate": 8.208697710107007e-06, + "loss": 0.3388, + "step": 10339 + }, + { + "epoch": 0.3, + "grad_norm": 1.682423163869611, + "learning_rate": 8.208337466509842e-06, + "loss": 0.346, + "step": 10340 + }, + { + "epoch": 0.3, + "grad_norm": 1.4010329942233684, + "learning_rate": 8.207977194599064e-06, + "loss": 0.3788, + "step": 10341 + }, + { + "epoch": 0.3, + "grad_norm": 1.3937041074894732, + "learning_rate": 8.207616894377852e-06, + "loss": 0.3414, + "step": 10342 + }, + { + "epoch": 0.3, + "grad_norm": 1.6897815334998791, + "learning_rate": 8.207256565849386e-06, + "loss": 0.3625, + "step": 10343 + }, + { + "epoch": 0.3, + "grad_norm": 1.3564317269241954, + "learning_rate": 8.206896209016845e-06, + "loss": 0.334, + "step": 10344 + }, + { + "epoch": 0.3, + "grad_norm": 1.4333360517860192, + "learning_rate": 8.206535823883407e-06, + "loss": 0.3475, + "step": 10345 + }, + { + "epoch": 0.3, + "grad_norm": 1.6498622082462926, + "learning_rate": 8.206175410452257e-06, + "loss": 0.3583, + "step": 10346 + }, + { + "epoch": 0.3, + "grad_norm": 1.5098780888255865, + "learning_rate": 8.205814968726573e-06, + "loss": 0.3694, + "step": 10347 + }, + { + "epoch": 0.3, + "grad_norm": 1.584160239012121, + "learning_rate": 8.205454498709537e-06, + "loss": 0.3332, + "step": 10348 + }, + { + "epoch": 0.3, + "grad_norm": 1.6344683903495425, + "learning_rate": 8.205094000404328e-06, + "loss": 0.3346, + "step": 10349 + }, + { + "epoch": 0.3, + "grad_norm": 1.8901920630427573, + "learning_rate": 8.20473347381413e-06, + "loss": 0.3469, + "step": 10350 + }, + { + "epoch": 0.3, + "grad_norm": 1.3638851063835833, + "learning_rate": 8.204372918942123e-06, + "loss": 0.3538, + "step": 10351 + }, + { + "epoch": 0.3, + "grad_norm": 1.705422648556976, + "learning_rate": 8.20401233579149e-06, + "loss": 0.3428, + "step": 10352 + }, + { + "epoch": 0.3, + "grad_norm": 1.3463792077827843, + "learning_rate": 8.203651724365412e-06, + "loss": 0.3214, + "step": 10353 + }, + { + "epoch": 0.3, + "grad_norm": 1.4297805664677021, + "learning_rate": 8.203291084667072e-06, + "loss": 0.3087, + "step": 10354 + }, + { + "epoch": 0.3, + "grad_norm": 2.4673702705940284, + "learning_rate": 8.202930416699652e-06, + "loss": 0.3658, + "step": 10355 + }, + { + "epoch": 0.3, + "grad_norm": 1.4235171514397398, + "learning_rate": 8.202569720466339e-06, + "loss": 0.3474, + "step": 10356 + }, + { + "epoch": 0.3, + "grad_norm": 1.4473631589263636, + "learning_rate": 8.202208995970309e-06, + "loss": 0.3848, + "step": 10357 + }, + { + "epoch": 0.3, + "grad_norm": 1.8450907376936547, + "learning_rate": 8.20184824321475e-06, + "loss": 0.3659, + "step": 10358 + }, + { + "epoch": 0.3, + "grad_norm": 0.9965363979348209, + "learning_rate": 8.201487462202846e-06, + "loss": 0.6057, + "step": 10359 + }, + { + "epoch": 0.3, + "grad_norm": 1.539330895152144, + "learning_rate": 8.201126652937778e-06, + "loss": 0.3388, + "step": 10360 + }, + { + "epoch": 0.3, + "grad_norm": 1.742599324859984, + "learning_rate": 8.200765815422733e-06, + "loss": 0.3368, + "step": 10361 + }, + { + "epoch": 0.3, + "grad_norm": 1.5203570678358653, + "learning_rate": 8.200404949660894e-06, + "loss": 0.3406, + "step": 10362 + }, + { + "epoch": 0.3, + "grad_norm": 1.4245721806726934, + "learning_rate": 8.200044055655443e-06, + "loss": 0.3382, + "step": 10363 + }, + { + "epoch": 0.3, + "grad_norm": 1.3353006990877732, + "learning_rate": 8.19968313340957e-06, + "loss": 0.3414, + "step": 10364 + }, + { + "epoch": 0.3, + "grad_norm": 1.479907116044183, + "learning_rate": 8.19932218292646e-06, + "loss": 0.3694, + "step": 10365 + }, + { + "epoch": 0.3, + "grad_norm": 3.0951252535230926, + "learning_rate": 8.198961204209292e-06, + "loss": 0.3389, + "step": 10366 + }, + { + "epoch": 0.3, + "grad_norm": 1.5189038952212313, + "learning_rate": 8.19860019726126e-06, + "loss": 0.3871, + "step": 10367 + }, + { + "epoch": 0.3, + "grad_norm": 1.6611378945807278, + "learning_rate": 8.198239162085544e-06, + "loss": 0.3618, + "step": 10368 + }, + { + "epoch": 0.3, + "grad_norm": 1.836426838926274, + "learning_rate": 8.19787809868533e-06, + "loss": 0.3587, + "step": 10369 + }, + { + "epoch": 0.3, + "grad_norm": 1.436280982111053, + "learning_rate": 8.197517007063808e-06, + "loss": 0.3479, + "step": 10370 + }, + { + "epoch": 0.3, + "grad_norm": 1.4337152518039415, + "learning_rate": 8.197155887224165e-06, + "loss": 0.3712, + "step": 10371 + }, + { + "epoch": 0.3, + "grad_norm": 1.3668776001700937, + "learning_rate": 8.196794739169582e-06, + "loss": 0.3501, + "step": 10372 + }, + { + "epoch": 0.3, + "grad_norm": 1.46298504568498, + "learning_rate": 8.196433562903252e-06, + "loss": 0.3907, + "step": 10373 + }, + { + "epoch": 0.3, + "grad_norm": 1.5063693983333264, + "learning_rate": 8.19607235842836e-06, + "loss": 0.3502, + "step": 10374 + }, + { + "epoch": 0.3, + "grad_norm": 1.1642979544241592, + "learning_rate": 8.195711125748094e-06, + "loss": 0.3276, + "step": 10375 + }, + { + "epoch": 0.3, + "grad_norm": 2.86651645542288, + "learning_rate": 8.195349864865642e-06, + "loss": 0.3878, + "step": 10376 + }, + { + "epoch": 0.3, + "grad_norm": 1.5062904815935705, + "learning_rate": 8.194988575784192e-06, + "loss": 0.3798, + "step": 10377 + }, + { + "epoch": 0.3, + "grad_norm": 1.6880608938912676, + "learning_rate": 8.194627258506934e-06, + "loss": 0.3544, + "step": 10378 + }, + { + "epoch": 0.3, + "grad_norm": 1.4127008741212734, + "learning_rate": 8.194265913037051e-06, + "loss": 0.3434, + "step": 10379 + }, + { + "epoch": 0.3, + "grad_norm": 1.3538353275553108, + "learning_rate": 8.193904539377739e-06, + "loss": 0.3691, + "step": 10380 + }, + { + "epoch": 0.3, + "grad_norm": 1.26599199218197, + "learning_rate": 8.193543137532182e-06, + "loss": 0.3563, + "step": 10381 + }, + { + "epoch": 0.3, + "grad_norm": 1.438492817859484, + "learning_rate": 8.193181707503574e-06, + "loss": 0.3338, + "step": 10382 + }, + { + "epoch": 0.3, + "grad_norm": 1.4729053976961428, + "learning_rate": 8.1928202492951e-06, + "loss": 0.3267, + "step": 10383 + }, + { + "epoch": 0.3, + "grad_norm": 1.7680028373469785, + "learning_rate": 8.192458762909953e-06, + "loss": 0.3489, + "step": 10384 + }, + { + "epoch": 0.3, + "grad_norm": 1.8082523057764261, + "learning_rate": 8.192097248351321e-06, + "loss": 0.3522, + "step": 10385 + }, + { + "epoch": 0.3, + "grad_norm": 4.63919122379345, + "learning_rate": 8.191735705622397e-06, + "loss": 0.3413, + "step": 10386 + }, + { + "epoch": 0.3, + "grad_norm": 1.6047316085711396, + "learning_rate": 8.191374134726368e-06, + "loss": 0.3692, + "step": 10387 + }, + { + "epoch": 0.3, + "grad_norm": 1.4818690473729734, + "learning_rate": 8.191012535666429e-06, + "loss": 0.3281, + "step": 10388 + }, + { + "epoch": 0.3, + "grad_norm": 1.4082590495586749, + "learning_rate": 8.190650908445768e-06, + "loss": 0.3683, + "step": 10389 + }, + { + "epoch": 0.3, + "grad_norm": 1.605006938271863, + "learning_rate": 8.190289253067576e-06, + "loss": 0.3365, + "step": 10390 + }, + { + "epoch": 0.3, + "grad_norm": 1.5162054500928097, + "learning_rate": 8.189927569535048e-06, + "loss": 0.3414, + "step": 10391 + }, + { + "epoch": 0.3, + "grad_norm": 1.384501551902087, + "learning_rate": 8.189565857851372e-06, + "loss": 0.3338, + "step": 10392 + }, + { + "epoch": 0.3, + "grad_norm": 1.643670356534223, + "learning_rate": 8.189204118019742e-06, + "loss": 0.3403, + "step": 10393 + }, + { + "epoch": 0.3, + "grad_norm": 1.395628150819279, + "learning_rate": 8.188842350043352e-06, + "loss": 0.3482, + "step": 10394 + }, + { + "epoch": 0.3, + "grad_norm": 1.5764957983516008, + "learning_rate": 8.188480553925392e-06, + "loss": 0.3718, + "step": 10395 + }, + { + "epoch": 0.3, + "grad_norm": 1.9306264023594928, + "learning_rate": 8.188118729669054e-06, + "loss": 0.3423, + "step": 10396 + }, + { + "epoch": 0.3, + "grad_norm": 1.5511061228891578, + "learning_rate": 8.187756877277535e-06, + "loss": 0.3438, + "step": 10397 + }, + { + "epoch": 0.3, + "grad_norm": 2.925403405474725, + "learning_rate": 8.187394996754026e-06, + "loss": 0.3402, + "step": 10398 + }, + { + "epoch": 0.3, + "grad_norm": 5.828153947612111, + "learning_rate": 8.18703308810172e-06, + "loss": 0.3387, + "step": 10399 + }, + { + "epoch": 0.3, + "grad_norm": 1.3960883754871654, + "learning_rate": 8.186671151323813e-06, + "loss": 0.3569, + "step": 10400 + }, + { + "epoch": 0.3, + "grad_norm": 1.7020057136979112, + "learning_rate": 8.186309186423496e-06, + "loss": 0.3621, + "step": 10401 + }, + { + "epoch": 0.3, + "grad_norm": 1.4861446736449495, + "learning_rate": 8.185947193403967e-06, + "loss": 0.3807, + "step": 10402 + }, + { + "epoch": 0.3, + "grad_norm": 1.348235770417199, + "learning_rate": 8.185585172268418e-06, + "loss": 0.3526, + "step": 10403 + }, + { + "epoch": 0.3, + "grad_norm": 1.6259869141747587, + "learning_rate": 8.185223123020043e-06, + "loss": 0.3634, + "step": 10404 + }, + { + "epoch": 0.3, + "grad_norm": 1.3298327667196908, + "learning_rate": 8.18486104566204e-06, + "loss": 0.346, + "step": 10405 + }, + { + "epoch": 0.3, + "grad_norm": 1.6103787618466263, + "learning_rate": 8.184498940197602e-06, + "loss": 0.3284, + "step": 10406 + }, + { + "epoch": 0.3, + "grad_norm": 1.7094962040344477, + "learning_rate": 8.184136806629928e-06, + "loss": 0.4464, + "step": 10407 + }, + { + "epoch": 0.3, + "grad_norm": 1.5852073653360141, + "learning_rate": 8.183774644962208e-06, + "loss": 0.3755, + "step": 10408 + }, + { + "epoch": 0.3, + "grad_norm": 1.5736783984178988, + "learning_rate": 8.183412455197644e-06, + "loss": 0.3452, + "step": 10409 + }, + { + "epoch": 0.3, + "grad_norm": 1.3288382823992513, + "learning_rate": 8.183050237339428e-06, + "loss": 0.3322, + "step": 10410 + }, + { + "epoch": 0.3, + "grad_norm": 1.5396934103122495, + "learning_rate": 8.182687991390758e-06, + "loss": 0.3885, + "step": 10411 + }, + { + "epoch": 0.3, + "grad_norm": 3.6574616295523836, + "learning_rate": 8.182325717354834e-06, + "loss": 0.34, + "step": 10412 + }, + { + "epoch": 0.3, + "grad_norm": 1.5777016999449383, + "learning_rate": 8.181963415234849e-06, + "loss": 0.3439, + "step": 10413 + }, + { + "epoch": 0.3, + "grad_norm": 1.4287490606871853, + "learning_rate": 8.181601085034e-06, + "loss": 0.3609, + "step": 10414 + }, + { + "epoch": 0.3, + "grad_norm": 1.8728492852189271, + "learning_rate": 8.181238726755488e-06, + "loss": 0.3378, + "step": 10415 + }, + { + "epoch": 0.3, + "grad_norm": 1.506735386794936, + "learning_rate": 8.180876340402508e-06, + "loss": 0.3471, + "step": 10416 + }, + { + "epoch": 0.3, + "grad_norm": 1.4636565258340442, + "learning_rate": 8.180513925978259e-06, + "loss": 0.3606, + "step": 10417 + }, + { + "epoch": 0.3, + "grad_norm": 1.4950836815985098, + "learning_rate": 8.180151483485938e-06, + "loss": 0.3401, + "step": 10418 + }, + { + "epoch": 0.3, + "grad_norm": 1.445853780318242, + "learning_rate": 8.179789012928747e-06, + "loss": 0.3806, + "step": 10419 + }, + { + "epoch": 0.3, + "grad_norm": 1.8572892859465586, + "learning_rate": 8.179426514309882e-06, + "loss": 0.3422, + "step": 10420 + }, + { + "epoch": 0.3, + "grad_norm": 1.6183927690976911, + "learning_rate": 8.179063987632542e-06, + "loss": 0.3402, + "step": 10421 + }, + { + "epoch": 0.3, + "grad_norm": 1.4119406723731722, + "learning_rate": 8.17870143289993e-06, + "loss": 0.3359, + "step": 10422 + }, + { + "epoch": 0.3, + "grad_norm": 1.446348259899707, + "learning_rate": 8.178338850115239e-06, + "loss": 0.3562, + "step": 10423 + }, + { + "epoch": 0.3, + "grad_norm": 1.434369190067161, + "learning_rate": 8.177976239281673e-06, + "loss": 0.358, + "step": 10424 + }, + { + "epoch": 0.3, + "grad_norm": 1.3505423351724035, + "learning_rate": 8.177613600402433e-06, + "loss": 0.3574, + "step": 10425 + }, + { + "epoch": 0.3, + "grad_norm": 1.506765604885681, + "learning_rate": 8.177250933480717e-06, + "loss": 0.3913, + "step": 10426 + }, + { + "epoch": 0.3, + "grad_norm": 1.5710646797245211, + "learning_rate": 8.176888238519726e-06, + "loss": 0.3733, + "step": 10427 + }, + { + "epoch": 0.3, + "grad_norm": 1.6147307934763673, + "learning_rate": 8.176525515522662e-06, + "loss": 0.4032, + "step": 10428 + }, + { + "epoch": 0.3, + "grad_norm": 1.4277528335521783, + "learning_rate": 8.176162764492725e-06, + "loss": 0.3751, + "step": 10429 + }, + { + "epoch": 0.3, + "grad_norm": 1.8961473968527216, + "learning_rate": 8.175799985433114e-06, + "loss": 0.387, + "step": 10430 + }, + { + "epoch": 0.3, + "grad_norm": 1.3747846449052592, + "learning_rate": 8.175437178347035e-06, + "loss": 0.3559, + "step": 10431 + }, + { + "epoch": 0.3, + "grad_norm": 0.9482846068801145, + "learning_rate": 8.175074343237687e-06, + "loss": 0.6335, + "step": 10432 + }, + { + "epoch": 0.3, + "grad_norm": 0.9037120286217555, + "learning_rate": 8.174711480108275e-06, + "loss": 0.6027, + "step": 10433 + }, + { + "epoch": 0.3, + "grad_norm": 3.5461303087160876, + "learning_rate": 8.174348588961995e-06, + "loss": 0.3398, + "step": 10434 + }, + { + "epoch": 0.3, + "grad_norm": 1.4749880708543022, + "learning_rate": 8.173985669802056e-06, + "loss": 0.3685, + "step": 10435 + }, + { + "epoch": 0.3, + "grad_norm": 1.5852068915801492, + "learning_rate": 8.173622722631657e-06, + "loss": 0.3725, + "step": 10436 + }, + { + "epoch": 0.3, + "grad_norm": 2.3533614607737574, + "learning_rate": 8.173259747454004e-06, + "loss": 0.3396, + "step": 10437 + }, + { + "epoch": 0.3, + "grad_norm": 1.6520539355846078, + "learning_rate": 8.172896744272296e-06, + "loss": 0.3396, + "step": 10438 + }, + { + "epoch": 0.3, + "grad_norm": 1.561581649382507, + "learning_rate": 8.172533713089742e-06, + "loss": 0.3889, + "step": 10439 + }, + { + "epoch": 0.3, + "grad_norm": 1.4091451458469824, + "learning_rate": 8.172170653909541e-06, + "loss": 0.3543, + "step": 10440 + }, + { + "epoch": 0.3, + "grad_norm": 1.8527339701895025, + "learning_rate": 8.1718075667349e-06, + "loss": 0.3628, + "step": 10441 + }, + { + "epoch": 0.3, + "grad_norm": 1.4630984353907253, + "learning_rate": 8.171444451569019e-06, + "loss": 0.3385, + "step": 10442 + }, + { + "epoch": 0.3, + "grad_norm": 1.3744499986770942, + "learning_rate": 8.17108130841511e-06, + "loss": 0.3359, + "step": 10443 + }, + { + "epoch": 0.3, + "grad_norm": 1.5090304610821772, + "learning_rate": 8.17071813727637e-06, + "loss": 0.3392, + "step": 10444 + }, + { + "epoch": 0.3, + "grad_norm": 1.4282673209928753, + "learning_rate": 8.17035493815601e-06, + "loss": 0.3646, + "step": 10445 + }, + { + "epoch": 0.3, + "grad_norm": 1.48887871229288, + "learning_rate": 8.169991711057229e-06, + "loss": 0.3487, + "step": 10446 + }, + { + "epoch": 0.3, + "grad_norm": 1.333867801178939, + "learning_rate": 8.169628455983238e-06, + "loss": 0.3529, + "step": 10447 + }, + { + "epoch": 0.3, + "grad_norm": 1.5279217993280205, + "learning_rate": 8.16926517293724e-06, + "loss": 0.35, + "step": 10448 + }, + { + "epoch": 0.3, + "grad_norm": 1.464417491826402, + "learning_rate": 8.168901861922443e-06, + "loss": 0.3518, + "step": 10449 + }, + { + "epoch": 0.3, + "grad_norm": 1.1194451158115193, + "learning_rate": 8.168538522942052e-06, + "loss": 0.6015, + "step": 10450 + }, + { + "epoch": 0.3, + "grad_norm": 1.5019418843792645, + "learning_rate": 8.16817515599927e-06, + "loss": 0.3235, + "step": 10451 + }, + { + "epoch": 0.3, + "grad_norm": 1.3515220141053337, + "learning_rate": 8.167811761097311e-06, + "loss": 0.3419, + "step": 10452 + }, + { + "epoch": 0.3, + "grad_norm": 1.6806560298189575, + "learning_rate": 8.167448338239375e-06, + "loss": 0.3791, + "step": 10453 + }, + { + "epoch": 0.3, + "grad_norm": 1.4149962511668657, + "learning_rate": 8.167084887428675e-06, + "loss": 0.362, + "step": 10454 + }, + { + "epoch": 0.3, + "grad_norm": 1.477912813701452, + "learning_rate": 8.166721408668415e-06, + "loss": 0.3427, + "step": 10455 + }, + { + "epoch": 0.3, + "grad_norm": 1.5830652952956474, + "learning_rate": 8.166357901961802e-06, + "loss": 0.3467, + "step": 10456 + }, + { + "epoch": 0.3, + "grad_norm": 1.5440936637813434, + "learning_rate": 8.165994367312046e-06, + "loss": 0.3403, + "step": 10457 + }, + { + "epoch": 0.3, + "grad_norm": 1.492577609940901, + "learning_rate": 8.165630804722356e-06, + "loss": 0.3359, + "step": 10458 + }, + { + "epoch": 0.3, + "grad_norm": 2.234946079481526, + "learning_rate": 8.165267214195938e-06, + "loss": 0.346, + "step": 10459 + }, + { + "epoch": 0.3, + "grad_norm": 1.4383927261855978, + "learning_rate": 8.164903595736002e-06, + "loss": 0.3461, + "step": 10460 + }, + { + "epoch": 0.3, + "grad_norm": 1.4773518796929328, + "learning_rate": 8.164539949345755e-06, + "loss": 0.3704, + "step": 10461 + }, + { + "epoch": 0.3, + "grad_norm": 1.57300739092842, + "learning_rate": 8.16417627502841e-06, + "loss": 0.3458, + "step": 10462 + }, + { + "epoch": 0.3, + "grad_norm": 1.6314527594035582, + "learning_rate": 8.163812572787173e-06, + "loss": 0.3693, + "step": 10463 + }, + { + "epoch": 0.3, + "grad_norm": 1.4707549535473858, + "learning_rate": 8.163448842625257e-06, + "loss": 0.3701, + "step": 10464 + }, + { + "epoch": 0.3, + "grad_norm": 1.3801770965696147, + "learning_rate": 8.163085084545867e-06, + "loss": 0.3663, + "step": 10465 + }, + { + "epoch": 0.3, + "grad_norm": 1.7118423004568637, + "learning_rate": 8.162721298552219e-06, + "loss": 0.336, + "step": 10466 + }, + { + "epoch": 0.3, + "grad_norm": 1.1063265090508638, + "learning_rate": 8.162357484647518e-06, + "loss": 0.6472, + "step": 10467 + }, + { + "epoch": 0.3, + "grad_norm": 4.2545985848077885, + "learning_rate": 8.161993642834978e-06, + "loss": 0.3365, + "step": 10468 + }, + { + "epoch": 0.3, + "grad_norm": 1.4450169443805756, + "learning_rate": 8.161629773117809e-06, + "loss": 0.3493, + "step": 10469 + }, + { + "epoch": 0.3, + "grad_norm": 2.025346003767745, + "learning_rate": 8.161265875499223e-06, + "loss": 0.3866, + "step": 10470 + }, + { + "epoch": 0.3, + "grad_norm": 1.554941116740191, + "learning_rate": 8.160901949982429e-06, + "loss": 0.35, + "step": 10471 + }, + { + "epoch": 0.3, + "grad_norm": 1.4610649617551998, + "learning_rate": 8.160537996570641e-06, + "loss": 0.3162, + "step": 10472 + }, + { + "epoch": 0.3, + "grad_norm": 1.4038182844320184, + "learning_rate": 8.16017401526707e-06, + "loss": 0.3773, + "step": 10473 + }, + { + "epoch": 0.3, + "grad_norm": 2.3990586366307185, + "learning_rate": 8.159810006074929e-06, + "loss": 0.3619, + "step": 10474 + }, + { + "epoch": 0.3, + "grad_norm": 1.6013139072097595, + "learning_rate": 8.159445968997428e-06, + "loss": 0.3578, + "step": 10475 + }, + { + "epoch": 0.3, + "grad_norm": 1.4290257247156994, + "learning_rate": 8.159081904037783e-06, + "loss": 0.3446, + "step": 10476 + }, + { + "epoch": 0.3, + "grad_norm": 1.3201711356784647, + "learning_rate": 8.158717811199203e-06, + "loss": 0.3507, + "step": 10477 + }, + { + "epoch": 0.3, + "grad_norm": 1.5589290522349415, + "learning_rate": 8.158353690484905e-06, + "loss": 0.3655, + "step": 10478 + }, + { + "epoch": 0.3, + "grad_norm": 1.4967829505609271, + "learning_rate": 8.1579895418981e-06, + "loss": 0.3393, + "step": 10479 + }, + { + "epoch": 0.3, + "grad_norm": 1.6085921688407518, + "learning_rate": 8.157625365442004e-06, + "loss": 0.3369, + "step": 10480 + }, + { + "epoch": 0.3, + "grad_norm": 1.5634436511964547, + "learning_rate": 8.157261161119828e-06, + "loss": 0.3416, + "step": 10481 + }, + { + "epoch": 0.3, + "grad_norm": 1.5338753497411728, + "learning_rate": 8.156896928934786e-06, + "loss": 0.3498, + "step": 10482 + }, + { + "epoch": 0.3, + "grad_norm": 1.6626486935396458, + "learning_rate": 8.156532668890094e-06, + "loss": 0.3522, + "step": 10483 + }, + { + "epoch": 0.3, + "grad_norm": 1.4313582537497167, + "learning_rate": 8.156168380988968e-06, + "loss": 0.3595, + "step": 10484 + }, + { + "epoch": 0.3, + "grad_norm": 1.3616637514878112, + "learning_rate": 8.155804065234619e-06, + "loss": 0.3689, + "step": 10485 + }, + { + "epoch": 0.3, + "grad_norm": 5.4485862756139465, + "learning_rate": 8.155439721630265e-06, + "loss": 0.3374, + "step": 10486 + }, + { + "epoch": 0.3, + "grad_norm": 1.467934322476409, + "learning_rate": 8.15507535017912e-06, + "loss": 0.3524, + "step": 10487 + }, + { + "epoch": 0.3, + "grad_norm": 1.5217017401926634, + "learning_rate": 8.1547109508844e-06, + "loss": 0.3462, + "step": 10488 + }, + { + "epoch": 0.3, + "grad_norm": 1.459970578500517, + "learning_rate": 8.154346523749321e-06, + "loss": 0.3463, + "step": 10489 + }, + { + "epoch": 0.3, + "grad_norm": 1.5776047697345903, + "learning_rate": 8.1539820687771e-06, + "loss": 0.3589, + "step": 10490 + }, + { + "epoch": 0.3, + "grad_norm": 1.61757202783703, + "learning_rate": 8.153617585970953e-06, + "loss": 0.3621, + "step": 10491 + }, + { + "epoch": 0.3, + "grad_norm": 1.3772988048565444, + "learning_rate": 8.153253075334092e-06, + "loss": 0.33, + "step": 10492 + }, + { + "epoch": 0.3, + "grad_norm": 1.3358350952750504, + "learning_rate": 8.152888536869741e-06, + "loss": 0.3454, + "step": 10493 + }, + { + "epoch": 0.3, + "grad_norm": 1.4500726298893594, + "learning_rate": 8.152523970581112e-06, + "loss": 0.3429, + "step": 10494 + }, + { + "epoch": 0.3, + "grad_norm": 1.9202985173744858, + "learning_rate": 8.152159376471424e-06, + "loss": 0.3386, + "step": 10495 + }, + { + "epoch": 0.3, + "grad_norm": 1.4891084361030826, + "learning_rate": 8.151794754543896e-06, + "loss": 0.3616, + "step": 10496 + }, + { + "epoch": 0.3, + "grad_norm": 1.4096626225840838, + "learning_rate": 8.151430104801742e-06, + "loss": 0.339, + "step": 10497 + }, + { + "epoch": 0.3, + "grad_norm": 1.6767625906684722, + "learning_rate": 8.151065427248184e-06, + "loss": 0.3383, + "step": 10498 + }, + { + "epoch": 0.3, + "grad_norm": 1.4338540314124713, + "learning_rate": 8.15070072188644e-06, + "loss": 0.3203, + "step": 10499 + }, + { + "epoch": 0.3, + "grad_norm": 1.4505913695551167, + "learning_rate": 8.150335988719724e-06, + "loss": 0.3433, + "step": 10500 + }, + { + "epoch": 0.3, + "grad_norm": 2.0526948298726913, + "learning_rate": 8.14997122775126e-06, + "loss": 0.3603, + "step": 10501 + }, + { + "epoch": 0.3, + "grad_norm": 1.5244703816039697, + "learning_rate": 8.149606438984265e-06, + "loss": 0.3551, + "step": 10502 + }, + { + "epoch": 0.3, + "grad_norm": 1.3223632923598667, + "learning_rate": 8.149241622421956e-06, + "loss": 0.3414, + "step": 10503 + }, + { + "epoch": 0.3, + "grad_norm": 1.8977494179031265, + "learning_rate": 8.148876778067559e-06, + "loss": 0.3538, + "step": 10504 + }, + { + "epoch": 0.3, + "grad_norm": 1.3975693544108556, + "learning_rate": 8.148511905924285e-06, + "loss": 0.34, + "step": 10505 + }, + { + "epoch": 0.3, + "grad_norm": 2.34931749841594, + "learning_rate": 8.148147005995361e-06, + "loss": 0.371, + "step": 10506 + }, + { + "epoch": 0.3, + "grad_norm": 1.8341715290980516, + "learning_rate": 8.147782078284004e-06, + "loss": 0.3531, + "step": 10507 + }, + { + "epoch": 0.3, + "grad_norm": 1.751815306308268, + "learning_rate": 8.147417122793435e-06, + "loss": 0.3493, + "step": 10508 + }, + { + "epoch": 0.3, + "grad_norm": 2.112341839631622, + "learning_rate": 8.147052139526876e-06, + "loss": 0.3477, + "step": 10509 + }, + { + "epoch": 0.3, + "grad_norm": 1.9070580429355544, + "learning_rate": 8.146687128487546e-06, + "loss": 0.3209, + "step": 10510 + }, + { + "epoch": 0.3, + "grad_norm": 1.4258980461548054, + "learning_rate": 8.146322089678668e-06, + "loss": 0.3429, + "step": 10511 + }, + { + "epoch": 0.3, + "grad_norm": 1.3914927113899542, + "learning_rate": 8.145957023103461e-06, + "loss": 0.3281, + "step": 10512 + }, + { + "epoch": 0.3, + "grad_norm": 1.4228988441906716, + "learning_rate": 8.145591928765151e-06, + "loss": 0.349, + "step": 10513 + }, + { + "epoch": 0.3, + "grad_norm": 1.3971638280274983, + "learning_rate": 8.145226806666954e-06, + "loss": 0.38, + "step": 10514 + }, + { + "epoch": 0.3, + "grad_norm": 1.9576806075621975, + "learning_rate": 8.1448616568121e-06, + "loss": 0.3657, + "step": 10515 + }, + { + "epoch": 0.31, + "grad_norm": 1.4676975677147241, + "learning_rate": 8.144496479203803e-06, + "loss": 0.3525, + "step": 10516 + }, + { + "epoch": 0.31, + "grad_norm": 1.0154955107600074, + "learning_rate": 8.144131273845293e-06, + "loss": 0.5748, + "step": 10517 + }, + { + "epoch": 0.31, + "grad_norm": 1.587496842333723, + "learning_rate": 8.143766040739786e-06, + "loss": 0.3539, + "step": 10518 + }, + { + "epoch": 0.31, + "grad_norm": 2.104042621115758, + "learning_rate": 8.143400779890511e-06, + "loss": 0.3324, + "step": 10519 + }, + { + "epoch": 0.31, + "grad_norm": 1.3555875763042242, + "learning_rate": 8.143035491300689e-06, + "loss": 0.3506, + "step": 10520 + }, + { + "epoch": 0.31, + "grad_norm": 1.4674472395620348, + "learning_rate": 8.142670174973545e-06, + "loss": 0.3442, + "step": 10521 + }, + { + "epoch": 0.31, + "grad_norm": 1.843835195658205, + "learning_rate": 8.1423048309123e-06, + "loss": 0.3695, + "step": 10522 + }, + { + "epoch": 0.31, + "grad_norm": 1.3286750224759787, + "learning_rate": 8.141939459120181e-06, + "loss": 0.3528, + "step": 10523 + }, + { + "epoch": 0.31, + "grad_norm": 1.6134330843346658, + "learning_rate": 8.14157405960041e-06, + "loss": 0.3485, + "step": 10524 + }, + { + "epoch": 0.31, + "grad_norm": 1.393378720950132, + "learning_rate": 8.141208632356214e-06, + "loss": 0.3663, + "step": 10525 + }, + { + "epoch": 0.31, + "grad_norm": 1.4492632809752397, + "learning_rate": 8.140843177390816e-06, + "loss": 0.3566, + "step": 10526 + }, + { + "epoch": 0.31, + "grad_norm": 1.4880660932807064, + "learning_rate": 8.140477694707444e-06, + "loss": 0.3448, + "step": 10527 + }, + { + "epoch": 0.31, + "grad_norm": 2.1352463272550475, + "learning_rate": 8.14011218430932e-06, + "loss": 0.3068, + "step": 10528 + }, + { + "epoch": 0.31, + "grad_norm": 1.8092370558301205, + "learning_rate": 8.139746646199674e-06, + "loss": 0.3368, + "step": 10529 + }, + { + "epoch": 0.31, + "grad_norm": 1.406815424448341, + "learning_rate": 8.139381080381726e-06, + "loss": 0.3246, + "step": 10530 + }, + { + "epoch": 0.31, + "grad_norm": 1.4184067701190572, + "learning_rate": 8.139015486858706e-06, + "loss": 0.3536, + "step": 10531 + }, + { + "epoch": 0.31, + "grad_norm": 1.3479718037654296, + "learning_rate": 8.13864986563384e-06, + "loss": 0.3235, + "step": 10532 + }, + { + "epoch": 0.31, + "grad_norm": 1.3588603222978575, + "learning_rate": 8.138284216710353e-06, + "loss": 0.3488, + "step": 10533 + }, + { + "epoch": 0.31, + "grad_norm": 1.3438947128797682, + "learning_rate": 8.137918540091473e-06, + "loss": 0.3321, + "step": 10534 + }, + { + "epoch": 0.31, + "grad_norm": 1.873977961265388, + "learning_rate": 8.137552835780428e-06, + "loss": 0.3397, + "step": 10535 + }, + { + "epoch": 0.31, + "grad_norm": 1.3469984442057323, + "learning_rate": 8.137187103780444e-06, + "loss": 0.3317, + "step": 10536 + }, + { + "epoch": 0.31, + "grad_norm": 1.4390616886921657, + "learning_rate": 8.13682134409475e-06, + "loss": 0.3364, + "step": 10537 + }, + { + "epoch": 0.31, + "grad_norm": 1.5851464340259391, + "learning_rate": 8.13645555672657e-06, + "loss": 0.3364, + "step": 10538 + }, + { + "epoch": 0.31, + "grad_norm": 1.8354802864781372, + "learning_rate": 8.136089741679138e-06, + "loss": 0.3573, + "step": 10539 + }, + { + "epoch": 0.31, + "grad_norm": 1.5998495572101128, + "learning_rate": 8.135723898955678e-06, + "loss": 0.3686, + "step": 10540 + }, + { + "epoch": 0.31, + "grad_norm": 1.6262314806639797, + "learning_rate": 8.13535802855942e-06, + "loss": 0.3635, + "step": 10541 + }, + { + "epoch": 0.31, + "grad_norm": 2.033804842408162, + "learning_rate": 8.134992130493593e-06, + "loss": 0.3514, + "step": 10542 + }, + { + "epoch": 0.31, + "grad_norm": 1.526335487475116, + "learning_rate": 8.134626204761424e-06, + "loss": 0.3559, + "step": 10543 + }, + { + "epoch": 0.31, + "grad_norm": 1.5416193793617896, + "learning_rate": 8.134260251366146e-06, + "loss": 0.3637, + "step": 10544 + }, + { + "epoch": 0.31, + "grad_norm": 1.2452415870800537, + "learning_rate": 8.133894270310986e-06, + "loss": 0.3495, + "step": 10545 + }, + { + "epoch": 0.31, + "grad_norm": 1.3608690196620803, + "learning_rate": 8.133528261599173e-06, + "loss": 0.3623, + "step": 10546 + }, + { + "epoch": 0.31, + "grad_norm": 1.4258097175325568, + "learning_rate": 8.13316222523394e-06, + "loss": 0.3561, + "step": 10547 + }, + { + "epoch": 0.31, + "grad_norm": 1.5172814555191791, + "learning_rate": 8.132796161218515e-06, + "loss": 0.3384, + "step": 10548 + }, + { + "epoch": 0.31, + "grad_norm": 1.395188237884857, + "learning_rate": 8.132430069556129e-06, + "loss": 0.3422, + "step": 10549 + }, + { + "epoch": 0.31, + "grad_norm": 1.6689303951004362, + "learning_rate": 8.132063950250012e-06, + "loss": 0.3379, + "step": 10550 + }, + { + "epoch": 0.31, + "grad_norm": 1.9088126990862828, + "learning_rate": 8.131697803303398e-06, + "loss": 0.3765, + "step": 10551 + }, + { + "epoch": 0.31, + "grad_norm": 1.6204980035069667, + "learning_rate": 8.131331628719515e-06, + "loss": 0.3473, + "step": 10552 + }, + { + "epoch": 0.31, + "grad_norm": 1.462961928437862, + "learning_rate": 8.130965426501595e-06, + "loss": 0.3313, + "step": 10553 + }, + { + "epoch": 0.31, + "grad_norm": 1.4113711150420902, + "learning_rate": 8.130599196652872e-06, + "loss": 0.3529, + "step": 10554 + }, + { + "epoch": 0.31, + "grad_norm": 1.6888199799752013, + "learning_rate": 8.130232939176574e-06, + "loss": 0.3836, + "step": 10555 + }, + { + "epoch": 0.31, + "grad_norm": 1.2288434130285386, + "learning_rate": 8.129866654075937e-06, + "loss": 0.3362, + "step": 10556 + }, + { + "epoch": 0.31, + "grad_norm": 1.7729549363191268, + "learning_rate": 8.129500341354192e-06, + "loss": 0.365, + "step": 10557 + }, + { + "epoch": 0.31, + "grad_norm": 1.6362408954194123, + "learning_rate": 8.129134001014572e-06, + "loss": 0.3404, + "step": 10558 + }, + { + "epoch": 0.31, + "grad_norm": 1.4650229070201632, + "learning_rate": 8.12876763306031e-06, + "loss": 0.3587, + "step": 10559 + }, + { + "epoch": 0.31, + "grad_norm": 1.0418970495947937, + "learning_rate": 8.12840123749464e-06, + "loss": 0.6494, + "step": 10560 + }, + { + "epoch": 0.31, + "grad_norm": 1.3440344308376226, + "learning_rate": 8.128034814320791e-06, + "loss": 0.3483, + "step": 10561 + }, + { + "epoch": 0.31, + "grad_norm": 1.433024703996532, + "learning_rate": 8.127668363542e-06, + "loss": 0.3816, + "step": 10562 + }, + { + "epoch": 0.31, + "grad_norm": 2.3571351981595288, + "learning_rate": 8.127301885161504e-06, + "loss": 0.3645, + "step": 10563 + }, + { + "epoch": 0.31, + "grad_norm": 2.2270456982591282, + "learning_rate": 8.126935379182533e-06, + "loss": 0.4098, + "step": 10564 + }, + { + "epoch": 0.31, + "grad_norm": 1.6730485826744748, + "learning_rate": 8.126568845608323e-06, + "loss": 0.3364, + "step": 10565 + }, + { + "epoch": 0.31, + "grad_norm": 2.299479654011373, + "learning_rate": 8.126202284442108e-06, + "loss": 0.3593, + "step": 10566 + }, + { + "epoch": 0.31, + "grad_norm": 0.9138835326328363, + "learning_rate": 8.125835695687121e-06, + "loss": 0.5945, + "step": 10567 + }, + { + "epoch": 0.31, + "grad_norm": 1.5475932862038522, + "learning_rate": 8.125469079346601e-06, + "loss": 0.3553, + "step": 10568 + }, + { + "epoch": 0.31, + "grad_norm": 2.2528845139864817, + "learning_rate": 8.125102435423783e-06, + "loss": 0.3317, + "step": 10569 + }, + { + "epoch": 0.31, + "grad_norm": 1.3288816136164079, + "learning_rate": 8.124735763921901e-06, + "loss": 0.3237, + "step": 10570 + }, + { + "epoch": 0.31, + "grad_norm": 1.5057560639685657, + "learning_rate": 8.12436906484419e-06, + "loss": 0.3552, + "step": 10571 + }, + { + "epoch": 0.31, + "grad_norm": 0.9501983384740803, + "learning_rate": 8.124002338193889e-06, + "loss": 0.6038, + "step": 10572 + }, + { + "epoch": 0.31, + "grad_norm": 1.476834803447913, + "learning_rate": 8.123635583974232e-06, + "loss": 0.3311, + "step": 10573 + }, + { + "epoch": 0.31, + "grad_norm": 1.5183740212747745, + "learning_rate": 8.123268802188455e-06, + "loss": 0.3218, + "step": 10574 + }, + { + "epoch": 0.31, + "grad_norm": 1.506436485368528, + "learning_rate": 8.122901992839799e-06, + "loss": 0.3595, + "step": 10575 + }, + { + "epoch": 0.31, + "grad_norm": 1.4502540747836785, + "learning_rate": 8.122535155931494e-06, + "loss": 0.3732, + "step": 10576 + }, + { + "epoch": 0.31, + "grad_norm": 1.3846630500719055, + "learning_rate": 8.122168291466784e-06, + "loss": 0.34, + "step": 10577 + }, + { + "epoch": 0.31, + "grad_norm": 1.403578418469908, + "learning_rate": 8.121801399448905e-06, + "loss": 0.3544, + "step": 10578 + }, + { + "epoch": 0.31, + "grad_norm": 1.3507126701754204, + "learning_rate": 8.121434479881093e-06, + "loss": 0.3465, + "step": 10579 + }, + { + "epoch": 0.31, + "grad_norm": 2.108691474985248, + "learning_rate": 8.121067532766587e-06, + "loss": 0.3553, + "step": 10580 + }, + { + "epoch": 0.31, + "grad_norm": 1.526354583881882, + "learning_rate": 8.120700558108625e-06, + "loss": 0.3333, + "step": 10581 + }, + { + "epoch": 0.31, + "grad_norm": 1.3504231136614517, + "learning_rate": 8.120333555910447e-06, + "loss": 0.3581, + "step": 10582 + }, + { + "epoch": 0.31, + "grad_norm": 1.3700202802320014, + "learning_rate": 8.119966526175292e-06, + "loss": 0.3568, + "step": 10583 + }, + { + "epoch": 0.31, + "grad_norm": 1.5658096518709286, + "learning_rate": 8.119599468906396e-06, + "loss": 0.3849, + "step": 10584 + }, + { + "epoch": 0.31, + "grad_norm": 1.5472524217070587, + "learning_rate": 8.119232384107e-06, + "loss": 0.3312, + "step": 10585 + }, + { + "epoch": 0.31, + "grad_norm": 1.315090668663898, + "learning_rate": 8.118865271780344e-06, + "loss": 0.3207, + "step": 10586 + }, + { + "epoch": 0.31, + "grad_norm": 1.2729518314033672, + "learning_rate": 8.118498131929666e-06, + "loss": 0.3296, + "step": 10587 + }, + { + "epoch": 0.31, + "grad_norm": 0.922215612751887, + "learning_rate": 8.11813096455821e-06, + "loss": 0.5827, + "step": 10588 + }, + { + "epoch": 0.31, + "grad_norm": 1.3318642785874115, + "learning_rate": 8.117763769669211e-06, + "loss": 0.326, + "step": 10589 + }, + { + "epoch": 0.31, + "grad_norm": 1.5151190734690678, + "learning_rate": 8.117396547265914e-06, + "loss": 0.3503, + "step": 10590 + }, + { + "epoch": 0.31, + "grad_norm": 1.215116265724466, + "learning_rate": 8.117029297351558e-06, + "loss": 0.3293, + "step": 10591 + }, + { + "epoch": 0.31, + "grad_norm": 1.5773309087889549, + "learning_rate": 8.116662019929382e-06, + "loss": 0.3356, + "step": 10592 + }, + { + "epoch": 0.31, + "grad_norm": 2.3970725000875777, + "learning_rate": 8.116294715002631e-06, + "loss": 0.3181, + "step": 10593 + }, + { + "epoch": 0.31, + "grad_norm": 1.4701591662891837, + "learning_rate": 8.115927382574545e-06, + "loss": 0.3567, + "step": 10594 + }, + { + "epoch": 0.31, + "grad_norm": 1.5068576782057794, + "learning_rate": 8.115560022648363e-06, + "loss": 0.334, + "step": 10595 + }, + { + "epoch": 0.31, + "grad_norm": 1.441963977662421, + "learning_rate": 8.11519263522733e-06, + "loss": 0.3402, + "step": 10596 + }, + { + "epoch": 0.31, + "grad_norm": 1.3001344740991938, + "learning_rate": 8.114825220314688e-06, + "loss": 0.3308, + "step": 10597 + }, + { + "epoch": 0.31, + "grad_norm": 5.818580981960607, + "learning_rate": 8.114457777913679e-06, + "loss": 0.342, + "step": 10598 + }, + { + "epoch": 0.31, + "grad_norm": 1.5190131037338304, + "learning_rate": 8.114090308027546e-06, + "loss": 0.3595, + "step": 10599 + }, + { + "epoch": 0.31, + "grad_norm": 1.4249102520615788, + "learning_rate": 8.11372281065953e-06, + "loss": 0.3409, + "step": 10600 + }, + { + "epoch": 0.31, + "grad_norm": 0.9166805230340055, + "learning_rate": 8.113355285812877e-06, + "loss": 0.6131, + "step": 10601 + }, + { + "epoch": 0.31, + "grad_norm": 2.102995245939152, + "learning_rate": 8.112987733490828e-06, + "loss": 0.3458, + "step": 10602 + }, + { + "epoch": 0.31, + "grad_norm": 1.3160973030305976, + "learning_rate": 8.11262015369663e-06, + "loss": 0.3513, + "step": 10603 + }, + { + "epoch": 0.31, + "grad_norm": 2.0735874666312832, + "learning_rate": 8.112252546433521e-06, + "loss": 0.3443, + "step": 10604 + }, + { + "epoch": 0.31, + "grad_norm": 1.4509220665386287, + "learning_rate": 8.11188491170475e-06, + "loss": 0.3439, + "step": 10605 + }, + { + "epoch": 0.31, + "grad_norm": 1.3328298654865225, + "learning_rate": 8.111517249513562e-06, + "loss": 0.3434, + "step": 10606 + }, + { + "epoch": 0.31, + "grad_norm": 1.4100116581055657, + "learning_rate": 8.111149559863198e-06, + "loss": 0.3501, + "step": 10607 + }, + { + "epoch": 0.31, + "grad_norm": 1.7441123716371096, + "learning_rate": 8.110781842756905e-06, + "loss": 0.3551, + "step": 10608 + }, + { + "epoch": 0.31, + "grad_norm": 1.852523964141937, + "learning_rate": 8.110414098197927e-06, + "loss": 0.3547, + "step": 10609 + }, + { + "epoch": 0.31, + "grad_norm": 1.4189544676585455, + "learning_rate": 8.110046326189511e-06, + "loss": 0.3383, + "step": 10610 + }, + { + "epoch": 0.31, + "grad_norm": 1.2971583271737317, + "learning_rate": 8.109678526734902e-06, + "loss": 0.3219, + "step": 10611 + }, + { + "epoch": 0.31, + "grad_norm": 1.331924978788535, + "learning_rate": 8.109310699837345e-06, + "loss": 0.3335, + "step": 10612 + }, + { + "epoch": 0.31, + "grad_norm": 1.3285693655631483, + "learning_rate": 8.108942845500088e-06, + "loss": 0.3414, + "step": 10613 + }, + { + "epoch": 0.31, + "grad_norm": 1.4761891354803642, + "learning_rate": 8.108574963726372e-06, + "loss": 0.3311, + "step": 10614 + }, + { + "epoch": 0.31, + "grad_norm": 1.3515020689062738, + "learning_rate": 8.10820705451945e-06, + "loss": 0.3373, + "step": 10615 + }, + { + "epoch": 0.31, + "grad_norm": 1.3864487145932831, + "learning_rate": 8.107839117882569e-06, + "loss": 0.33, + "step": 10616 + }, + { + "epoch": 0.31, + "grad_norm": 2.0455462381731873, + "learning_rate": 8.107471153818968e-06, + "loss": 0.341, + "step": 10617 + }, + { + "epoch": 0.31, + "grad_norm": 1.2938013214494908, + "learning_rate": 8.107103162331903e-06, + "loss": 0.3302, + "step": 10618 + }, + { + "epoch": 0.31, + "grad_norm": 1.4858283845758944, + "learning_rate": 8.106735143424618e-06, + "loss": 0.3971, + "step": 10619 + }, + { + "epoch": 0.31, + "grad_norm": 1.3584965806202227, + "learning_rate": 8.10636709710036e-06, + "loss": 0.3442, + "step": 10620 + }, + { + "epoch": 0.31, + "grad_norm": 1.3469202621920673, + "learning_rate": 8.105999023362377e-06, + "loss": 0.3654, + "step": 10621 + }, + { + "epoch": 0.31, + "grad_norm": 1.4715339388784066, + "learning_rate": 8.105630922213919e-06, + "loss": 0.3398, + "step": 10622 + }, + { + "epoch": 0.31, + "grad_norm": 1.6513474809091315, + "learning_rate": 8.105262793658235e-06, + "loss": 0.3262, + "step": 10623 + }, + { + "epoch": 0.31, + "grad_norm": 1.2683165589122884, + "learning_rate": 8.104894637698571e-06, + "loss": 0.3671, + "step": 10624 + }, + { + "epoch": 0.31, + "grad_norm": 1.71607580251776, + "learning_rate": 8.104526454338178e-06, + "loss": 0.3574, + "step": 10625 + }, + { + "epoch": 0.31, + "grad_norm": 1.4332897460402314, + "learning_rate": 8.104158243580305e-06, + "loss": 0.3389, + "step": 10626 + }, + { + "epoch": 0.31, + "grad_norm": 1.371341465508268, + "learning_rate": 8.1037900054282e-06, + "loss": 0.3488, + "step": 10627 + }, + { + "epoch": 0.31, + "grad_norm": 1.427770101931743, + "learning_rate": 8.103421739885113e-06, + "loss": 0.3622, + "step": 10628 + }, + { + "epoch": 0.31, + "grad_norm": 1.4215847099573724, + "learning_rate": 8.103053446954297e-06, + "loss": 0.3435, + "step": 10629 + }, + { + "epoch": 0.31, + "grad_norm": 2.010691719276131, + "learning_rate": 8.102685126638998e-06, + "loss": 0.392, + "step": 10630 + }, + { + "epoch": 0.31, + "grad_norm": 1.4247660171860015, + "learning_rate": 8.10231677894247e-06, + "loss": 0.3399, + "step": 10631 + }, + { + "epoch": 0.31, + "grad_norm": 1.4221011120980052, + "learning_rate": 8.10194840386796e-06, + "loss": 0.3567, + "step": 10632 + }, + { + "epoch": 0.31, + "grad_norm": 1.3797228981852074, + "learning_rate": 8.101580001418723e-06, + "loss": 0.3468, + "step": 10633 + }, + { + "epoch": 0.31, + "grad_norm": 1.5066721763279514, + "learning_rate": 8.101211571598007e-06, + "loss": 0.3662, + "step": 10634 + }, + { + "epoch": 0.31, + "grad_norm": 1.3761215181999882, + "learning_rate": 8.100843114409063e-06, + "loss": 0.3578, + "step": 10635 + }, + { + "epoch": 0.31, + "grad_norm": 1.2985428818944629, + "learning_rate": 8.100474629855147e-06, + "loss": 0.3699, + "step": 10636 + }, + { + "epoch": 0.31, + "grad_norm": 1.4078244306184335, + "learning_rate": 8.100106117939506e-06, + "loss": 0.3523, + "step": 10637 + }, + { + "epoch": 0.31, + "grad_norm": 1.6783799745110262, + "learning_rate": 8.099737578665394e-06, + "loss": 0.3333, + "step": 10638 + }, + { + "epoch": 0.31, + "grad_norm": 1.4977040593735047, + "learning_rate": 8.099369012036065e-06, + "loss": 0.3648, + "step": 10639 + }, + { + "epoch": 0.31, + "grad_norm": 1.268485160974864, + "learning_rate": 8.09900041805477e-06, + "loss": 0.34, + "step": 10640 + }, + { + "epoch": 0.31, + "grad_norm": 1.57083950689939, + "learning_rate": 8.09863179672476e-06, + "loss": 0.3641, + "step": 10641 + }, + { + "epoch": 0.31, + "grad_norm": 1.3947331731859642, + "learning_rate": 8.098263148049293e-06, + "loss": 0.3221, + "step": 10642 + }, + { + "epoch": 0.31, + "grad_norm": 1.410762167816367, + "learning_rate": 8.097894472031618e-06, + "loss": 0.3329, + "step": 10643 + }, + { + "epoch": 0.31, + "grad_norm": 1.382016568858268, + "learning_rate": 8.097525768674988e-06, + "loss": 0.3623, + "step": 10644 + }, + { + "epoch": 0.31, + "grad_norm": 0.9793625579923836, + "learning_rate": 8.097157037982662e-06, + "loss": 0.5956, + "step": 10645 + }, + { + "epoch": 0.31, + "grad_norm": 1.2984808971074344, + "learning_rate": 8.096788279957888e-06, + "loss": 0.3265, + "step": 10646 + }, + { + "epoch": 0.31, + "grad_norm": 1.8366004829076952, + "learning_rate": 8.096419494603925e-06, + "loss": 0.3217, + "step": 10647 + }, + { + "epoch": 0.31, + "grad_norm": 1.2373090566174854, + "learning_rate": 8.096050681924024e-06, + "loss": 0.33, + "step": 10648 + }, + { + "epoch": 0.31, + "grad_norm": 1.6312453177530752, + "learning_rate": 8.095681841921441e-06, + "loss": 0.3433, + "step": 10649 + }, + { + "epoch": 0.31, + "grad_norm": 1.5755365364946399, + "learning_rate": 8.095312974599433e-06, + "loss": 0.3504, + "step": 10650 + }, + { + "epoch": 0.31, + "grad_norm": 1.426014017827436, + "learning_rate": 8.094944079961253e-06, + "loss": 0.3649, + "step": 10651 + }, + { + "epoch": 0.31, + "grad_norm": 1.8113405614377698, + "learning_rate": 8.094575158010158e-06, + "loss": 0.377, + "step": 10652 + }, + { + "epoch": 0.31, + "grad_norm": 1.2637871205602296, + "learning_rate": 8.094206208749402e-06, + "loss": 0.3992, + "step": 10653 + }, + { + "epoch": 0.31, + "grad_norm": 1.6279259021245747, + "learning_rate": 8.093837232182242e-06, + "loss": 0.3726, + "step": 10654 + }, + { + "epoch": 0.31, + "grad_norm": 1.3806991441982466, + "learning_rate": 8.093468228311936e-06, + "loss": 0.3598, + "step": 10655 + }, + { + "epoch": 0.31, + "grad_norm": 1.41440475207063, + "learning_rate": 8.093099197141736e-06, + "loss": 0.3482, + "step": 10656 + }, + { + "epoch": 0.31, + "grad_norm": 1.621703360001662, + "learning_rate": 8.092730138674903e-06, + "loss": 0.3506, + "step": 10657 + }, + { + "epoch": 0.31, + "grad_norm": 1.3258898287303638, + "learning_rate": 8.09236105291469e-06, + "loss": 0.3287, + "step": 10658 + }, + { + "epoch": 0.31, + "grad_norm": 1.3132836348924553, + "learning_rate": 8.091991939864358e-06, + "loss": 0.3354, + "step": 10659 + }, + { + "epoch": 0.31, + "grad_norm": 1.309288722468368, + "learning_rate": 8.091622799527163e-06, + "loss": 0.3328, + "step": 10660 + }, + { + "epoch": 0.31, + "grad_norm": 1.4775532651765817, + "learning_rate": 8.091253631906362e-06, + "loss": 0.3373, + "step": 10661 + }, + { + "epoch": 0.31, + "grad_norm": 1.320276916329198, + "learning_rate": 8.090884437005213e-06, + "loss": 0.3428, + "step": 10662 + }, + { + "epoch": 0.31, + "grad_norm": 1.3215911396977493, + "learning_rate": 8.090515214826975e-06, + "loss": 0.3656, + "step": 10663 + }, + { + "epoch": 0.31, + "grad_norm": 1.396415246243737, + "learning_rate": 8.090145965374906e-06, + "loss": 0.3621, + "step": 10664 + }, + { + "epoch": 0.31, + "grad_norm": 1.3377960662095965, + "learning_rate": 8.089776688652265e-06, + "loss": 0.3583, + "step": 10665 + }, + { + "epoch": 0.31, + "grad_norm": 1.4132999332982852, + "learning_rate": 8.089407384662308e-06, + "loss": 0.3398, + "step": 10666 + }, + { + "epoch": 0.31, + "grad_norm": 1.3574640559947773, + "learning_rate": 8.0890380534083e-06, + "loss": 0.329, + "step": 10667 + }, + { + "epoch": 0.31, + "grad_norm": 1.4873615825844066, + "learning_rate": 8.088668694893496e-06, + "loss": 0.3309, + "step": 10668 + }, + { + "epoch": 0.31, + "grad_norm": 1.2765203611829588, + "learning_rate": 8.088299309121156e-06, + "loss": 0.3566, + "step": 10669 + }, + { + "epoch": 0.31, + "grad_norm": 1.313924352140365, + "learning_rate": 8.08792989609454e-06, + "loss": 0.3437, + "step": 10670 + }, + { + "epoch": 0.31, + "grad_norm": 1.424008523184571, + "learning_rate": 8.087560455816909e-06, + "loss": 0.3673, + "step": 10671 + }, + { + "epoch": 0.31, + "grad_norm": 1.4008208916196248, + "learning_rate": 8.087190988291523e-06, + "loss": 0.3951, + "step": 10672 + }, + { + "epoch": 0.31, + "grad_norm": 1.343759221953751, + "learning_rate": 8.086821493521642e-06, + "loss": 0.3307, + "step": 10673 + }, + { + "epoch": 0.31, + "grad_norm": 1.4899209511748577, + "learning_rate": 8.086451971510526e-06, + "loss": 0.3488, + "step": 10674 + }, + { + "epoch": 0.31, + "grad_norm": 1.3317195791791752, + "learning_rate": 8.086082422261438e-06, + "loss": 0.3383, + "step": 10675 + }, + { + "epoch": 0.31, + "grad_norm": 1.3264573337053642, + "learning_rate": 8.085712845777638e-06, + "loss": 0.339, + "step": 10676 + }, + { + "epoch": 0.31, + "grad_norm": 1.6559261075768381, + "learning_rate": 8.085343242062388e-06, + "loss": 0.3476, + "step": 10677 + }, + { + "epoch": 0.31, + "grad_norm": 1.464564010865818, + "learning_rate": 8.08497361111895e-06, + "loss": 0.3626, + "step": 10678 + }, + { + "epoch": 0.31, + "grad_norm": 1.2408401620083944, + "learning_rate": 8.084603952950586e-06, + "loss": 0.3607, + "step": 10679 + }, + { + "epoch": 0.31, + "grad_norm": 1.6210566770198143, + "learning_rate": 8.084234267560555e-06, + "loss": 0.346, + "step": 10680 + }, + { + "epoch": 0.31, + "grad_norm": 1.3852860028196121, + "learning_rate": 8.083864554952127e-06, + "loss": 0.3286, + "step": 10681 + }, + { + "epoch": 0.31, + "grad_norm": 1.2874548395745007, + "learning_rate": 8.083494815128559e-06, + "loss": 0.3475, + "step": 10682 + }, + { + "epoch": 0.31, + "grad_norm": 1.4069773865468007, + "learning_rate": 8.083125048093112e-06, + "loss": 0.3697, + "step": 10683 + }, + { + "epoch": 0.31, + "grad_norm": 1.2149749592681962, + "learning_rate": 8.082755253849056e-06, + "loss": 0.3306, + "step": 10684 + }, + { + "epoch": 0.31, + "grad_norm": 1.2959318154505337, + "learning_rate": 8.082385432399648e-06, + "loss": 0.3444, + "step": 10685 + }, + { + "epoch": 0.31, + "grad_norm": 0.9726289471242838, + "learning_rate": 8.082015583748156e-06, + "loss": 0.6201, + "step": 10686 + }, + { + "epoch": 0.31, + "grad_norm": 1.5561814619782075, + "learning_rate": 8.08164570789784e-06, + "loss": 0.3514, + "step": 10687 + }, + { + "epoch": 0.31, + "grad_norm": 1.4037076561645079, + "learning_rate": 8.081275804851969e-06, + "loss": 0.3571, + "step": 10688 + }, + { + "epoch": 0.31, + "grad_norm": 1.3174020739971009, + "learning_rate": 8.080905874613803e-06, + "loss": 0.3548, + "step": 10689 + }, + { + "epoch": 0.31, + "grad_norm": 1.2682538969130261, + "learning_rate": 8.080535917186609e-06, + "loss": 0.3284, + "step": 10690 + }, + { + "epoch": 0.31, + "grad_norm": 1.530548207924241, + "learning_rate": 8.080165932573651e-06, + "loss": 0.3329, + "step": 10691 + }, + { + "epoch": 0.31, + "grad_norm": 1.50715049214734, + "learning_rate": 8.079795920778197e-06, + "loss": 0.3364, + "step": 10692 + }, + { + "epoch": 0.31, + "grad_norm": 1.3598431282573618, + "learning_rate": 8.079425881803507e-06, + "loss": 0.3594, + "step": 10693 + }, + { + "epoch": 0.31, + "grad_norm": 1.4727399232777723, + "learning_rate": 8.07905581565285e-06, + "loss": 0.3439, + "step": 10694 + }, + { + "epoch": 0.31, + "grad_norm": 1.3164273371028388, + "learning_rate": 8.07868572232949e-06, + "loss": 0.3489, + "step": 10695 + }, + { + "epoch": 0.31, + "grad_norm": 1.2071560812489668, + "learning_rate": 8.078315601836699e-06, + "loss": 0.3282, + "step": 10696 + }, + { + "epoch": 0.31, + "grad_norm": 2.6627193732418832, + "learning_rate": 8.077945454177736e-06, + "loss": 0.3344, + "step": 10697 + }, + { + "epoch": 0.31, + "grad_norm": 2.030161527474595, + "learning_rate": 8.07757527935587e-06, + "loss": 0.3612, + "step": 10698 + }, + { + "epoch": 0.31, + "grad_norm": 1.4817057538129883, + "learning_rate": 8.077205077374367e-06, + "loss": 0.3945, + "step": 10699 + }, + { + "epoch": 0.31, + "grad_norm": 1.3267404019507747, + "learning_rate": 8.076834848236496e-06, + "loss": 0.331, + "step": 10700 + }, + { + "epoch": 0.31, + "grad_norm": 0.9258132785719666, + "learning_rate": 8.076464591945524e-06, + "loss": 0.6327, + "step": 10701 + }, + { + "epoch": 0.31, + "grad_norm": 1.1839471956969423, + "learning_rate": 8.076094308504716e-06, + "loss": 0.3421, + "step": 10702 + }, + { + "epoch": 0.31, + "grad_norm": 1.4559416357702402, + "learning_rate": 8.075723997917341e-06, + "loss": 0.3355, + "step": 10703 + }, + { + "epoch": 0.31, + "grad_norm": 1.2674375917452203, + "learning_rate": 8.075353660186671e-06, + "loss": 0.3289, + "step": 10704 + }, + { + "epoch": 0.31, + "grad_norm": 1.4006511886093849, + "learning_rate": 8.074983295315969e-06, + "loss": 0.3773, + "step": 10705 + }, + { + "epoch": 0.31, + "grad_norm": 1.3107771240771486, + "learning_rate": 8.074612903308505e-06, + "loss": 0.362, + "step": 10706 + }, + { + "epoch": 0.31, + "grad_norm": 1.2505257409320447, + "learning_rate": 8.074242484167549e-06, + "loss": 0.3366, + "step": 10707 + }, + { + "epoch": 0.31, + "grad_norm": 1.3138399935883975, + "learning_rate": 8.073872037896368e-06, + "loss": 0.3705, + "step": 10708 + }, + { + "epoch": 0.31, + "grad_norm": 1.2828193654899385, + "learning_rate": 8.073501564498232e-06, + "loss": 0.3504, + "step": 10709 + }, + { + "epoch": 0.31, + "grad_norm": 1.3148904842983768, + "learning_rate": 8.073131063976413e-06, + "loss": 0.3388, + "step": 10710 + }, + { + "epoch": 0.31, + "grad_norm": 1.5518566884752216, + "learning_rate": 8.072760536334177e-06, + "loss": 0.325, + "step": 10711 + }, + { + "epoch": 0.31, + "grad_norm": 1.365861612322021, + "learning_rate": 8.072389981574794e-06, + "loss": 0.3389, + "step": 10712 + }, + { + "epoch": 0.31, + "grad_norm": 1.7108337024979121, + "learning_rate": 8.072019399701536e-06, + "loss": 0.3597, + "step": 10713 + }, + { + "epoch": 0.31, + "grad_norm": 2.047927895243195, + "learning_rate": 8.071648790717672e-06, + "loss": 0.3816, + "step": 10714 + }, + { + "epoch": 0.31, + "grad_norm": 1.3649280951692384, + "learning_rate": 8.071278154626474e-06, + "loss": 0.3267, + "step": 10715 + }, + { + "epoch": 0.31, + "grad_norm": 1.374838906976968, + "learning_rate": 8.070907491431213e-06, + "loss": 0.3545, + "step": 10716 + }, + { + "epoch": 0.31, + "grad_norm": 1.3843498138545813, + "learning_rate": 8.070536801135158e-06, + "loss": 0.3794, + "step": 10717 + }, + { + "epoch": 0.31, + "grad_norm": 1.6212967725825553, + "learning_rate": 8.070166083741583e-06, + "loss": 0.3658, + "step": 10718 + }, + { + "epoch": 0.31, + "grad_norm": 1.2815356474938764, + "learning_rate": 8.069795339253756e-06, + "loss": 0.3484, + "step": 10719 + }, + { + "epoch": 0.31, + "grad_norm": 3.980021075079315, + "learning_rate": 8.069424567674954e-06, + "loss": 0.3903, + "step": 10720 + }, + { + "epoch": 0.31, + "grad_norm": 2.433259511049197, + "learning_rate": 8.069053769008443e-06, + "loss": 0.3586, + "step": 10721 + }, + { + "epoch": 0.31, + "grad_norm": 1.7167703479133645, + "learning_rate": 8.068682943257501e-06, + "loss": 0.3566, + "step": 10722 + }, + { + "epoch": 0.31, + "grad_norm": 1.3155722556580165, + "learning_rate": 8.068312090425397e-06, + "loss": 0.3406, + "step": 10723 + }, + { + "epoch": 0.31, + "grad_norm": 1.2768978796957253, + "learning_rate": 8.067941210515406e-06, + "loss": 0.3432, + "step": 10724 + }, + { + "epoch": 0.31, + "grad_norm": 1.3412483404112288, + "learning_rate": 8.067570303530799e-06, + "loss": 0.3407, + "step": 10725 + }, + { + "epoch": 0.31, + "grad_norm": 1.2829671711673214, + "learning_rate": 8.06719936947485e-06, + "loss": 0.311, + "step": 10726 + }, + { + "epoch": 0.31, + "grad_norm": 1.2529744062413513, + "learning_rate": 8.066828408350832e-06, + "loss": 0.3284, + "step": 10727 + }, + { + "epoch": 0.31, + "grad_norm": 1.2365170409460478, + "learning_rate": 8.066457420162019e-06, + "loss": 0.3291, + "step": 10728 + }, + { + "epoch": 0.31, + "grad_norm": 1.5359437236944802, + "learning_rate": 8.066086404911685e-06, + "loss": 0.3474, + "step": 10729 + }, + { + "epoch": 0.31, + "grad_norm": 1.2813979373658833, + "learning_rate": 8.065715362603106e-06, + "loss": 0.3465, + "step": 10730 + }, + { + "epoch": 0.31, + "grad_norm": 1.9662353072406482, + "learning_rate": 8.065344293239555e-06, + "loss": 0.3746, + "step": 10731 + }, + { + "epoch": 0.31, + "grad_norm": 1.2381270645884461, + "learning_rate": 8.064973196824307e-06, + "loss": 0.3611, + "step": 10732 + }, + { + "epoch": 0.31, + "grad_norm": 1.6790159503628719, + "learning_rate": 8.064602073360635e-06, + "loss": 0.3636, + "step": 10733 + }, + { + "epoch": 0.31, + "grad_norm": 1.4407006892303822, + "learning_rate": 8.064230922851815e-06, + "loss": 0.3431, + "step": 10734 + }, + { + "epoch": 0.31, + "grad_norm": 1.2903834941440988, + "learning_rate": 8.063859745301124e-06, + "loss": 0.3411, + "step": 10735 + }, + { + "epoch": 0.31, + "grad_norm": 1.9475609890175682, + "learning_rate": 8.063488540711837e-06, + "loss": 0.3625, + "step": 10736 + }, + { + "epoch": 0.31, + "grad_norm": 1.3325714405060263, + "learning_rate": 8.063117309087229e-06, + "loss": 0.3468, + "step": 10737 + }, + { + "epoch": 0.31, + "grad_norm": 1.336381935220045, + "learning_rate": 8.062746050430579e-06, + "loss": 0.3383, + "step": 10738 + }, + { + "epoch": 0.31, + "grad_norm": 1.5814006930206592, + "learning_rate": 8.062374764745158e-06, + "loss": 0.3331, + "step": 10739 + }, + { + "epoch": 0.31, + "grad_norm": 1.5735147683692978, + "learning_rate": 8.062003452034248e-06, + "loss": 0.3473, + "step": 10740 + }, + { + "epoch": 0.31, + "grad_norm": 1.2431895529062231, + "learning_rate": 8.061632112301122e-06, + "loss": 0.3425, + "step": 10741 + }, + { + "epoch": 0.31, + "grad_norm": 1.501834760930911, + "learning_rate": 8.06126074554906e-06, + "loss": 0.3521, + "step": 10742 + }, + { + "epoch": 0.31, + "grad_norm": 1.4446330186124088, + "learning_rate": 8.060889351781337e-06, + "loss": 0.3481, + "step": 10743 + }, + { + "epoch": 0.31, + "grad_norm": 1.3755558140390818, + "learning_rate": 8.060517931001233e-06, + "loss": 0.37, + "step": 10744 + }, + { + "epoch": 0.31, + "grad_norm": 1.3963498413981534, + "learning_rate": 8.060146483212022e-06, + "loss": 0.3619, + "step": 10745 + }, + { + "epoch": 0.31, + "grad_norm": 1.2422541916874115, + "learning_rate": 8.059775008416985e-06, + "loss": 0.3253, + "step": 10746 + }, + { + "epoch": 0.31, + "grad_norm": 1.687542465511433, + "learning_rate": 8.0594035066194e-06, + "loss": 0.3621, + "step": 10747 + }, + { + "epoch": 0.31, + "grad_norm": 1.4573908213138045, + "learning_rate": 8.059031977822546e-06, + "loss": 0.3433, + "step": 10748 + }, + { + "epoch": 0.31, + "grad_norm": 1.5010674451593449, + "learning_rate": 8.0586604220297e-06, + "loss": 0.3796, + "step": 10749 + }, + { + "epoch": 0.31, + "grad_norm": 1.9268556140294066, + "learning_rate": 8.058288839244141e-06, + "loss": 0.3425, + "step": 10750 + }, + { + "epoch": 0.31, + "grad_norm": 1.3210363095245155, + "learning_rate": 8.057917229469151e-06, + "loss": 0.328, + "step": 10751 + }, + { + "epoch": 0.31, + "grad_norm": 1.4589657873013402, + "learning_rate": 8.057545592708007e-06, + "loss": 0.369, + "step": 10752 + }, + { + "epoch": 0.31, + "grad_norm": 1.345874639567256, + "learning_rate": 8.057173928963989e-06, + "loss": 0.3404, + "step": 10753 + }, + { + "epoch": 0.31, + "grad_norm": 1.3506824653966814, + "learning_rate": 8.056802238240377e-06, + "loss": 0.3324, + "step": 10754 + }, + { + "epoch": 0.31, + "grad_norm": 1.3457545600159448, + "learning_rate": 8.05643052054045e-06, + "loss": 0.3397, + "step": 10755 + }, + { + "epoch": 0.31, + "grad_norm": 1.2947600557669634, + "learning_rate": 8.056058775867493e-06, + "loss": 0.3224, + "step": 10756 + }, + { + "epoch": 0.31, + "grad_norm": 2.4966443979939394, + "learning_rate": 8.055687004224781e-06, + "loss": 0.3493, + "step": 10757 + }, + { + "epoch": 0.31, + "grad_norm": 1.5545482238468893, + "learning_rate": 8.055315205615596e-06, + "loss": 0.343, + "step": 10758 + }, + { + "epoch": 0.31, + "grad_norm": 1.3508269211427082, + "learning_rate": 8.054943380043223e-06, + "loss": 0.3428, + "step": 10759 + }, + { + "epoch": 0.31, + "grad_norm": 1.4432435286711338, + "learning_rate": 8.05457152751094e-06, + "loss": 0.385, + "step": 10760 + }, + { + "epoch": 0.31, + "grad_norm": 1.4802620244658693, + "learning_rate": 8.05419964802203e-06, + "loss": 0.3572, + "step": 10761 + }, + { + "epoch": 0.31, + "grad_norm": 1.3375445729445528, + "learning_rate": 8.053827741579772e-06, + "loss": 0.3527, + "step": 10762 + }, + { + "epoch": 0.31, + "grad_norm": 1.6321494551570446, + "learning_rate": 8.053455808187452e-06, + "loss": 0.3453, + "step": 10763 + }, + { + "epoch": 0.31, + "grad_norm": 1.4272123829234056, + "learning_rate": 8.053083847848351e-06, + "loss": 0.3591, + "step": 10764 + }, + { + "epoch": 0.31, + "grad_norm": 0.9728328718450383, + "learning_rate": 8.05271186056575e-06, + "loss": 0.6078, + "step": 10765 + }, + { + "epoch": 0.31, + "grad_norm": 1.4119717416923545, + "learning_rate": 8.052339846342935e-06, + "loss": 0.3554, + "step": 10766 + }, + { + "epoch": 0.31, + "grad_norm": 1.3325210519355355, + "learning_rate": 8.051967805183185e-06, + "loss": 0.3336, + "step": 10767 + }, + { + "epoch": 0.31, + "grad_norm": 1.5051176937138095, + "learning_rate": 8.051595737089786e-06, + "loss": 0.3502, + "step": 10768 + }, + { + "epoch": 0.31, + "grad_norm": 1.5461320530281342, + "learning_rate": 8.05122364206602e-06, + "loss": 0.3407, + "step": 10769 + }, + { + "epoch": 0.31, + "grad_norm": 2.497430398071713, + "learning_rate": 8.050851520115173e-06, + "loss": 0.3173, + "step": 10770 + }, + { + "epoch": 0.31, + "grad_norm": 1.315086074510462, + "learning_rate": 8.050479371240525e-06, + "loss": 0.3325, + "step": 10771 + }, + { + "epoch": 0.31, + "grad_norm": 1.4888849441815146, + "learning_rate": 8.050107195445365e-06, + "loss": 0.3557, + "step": 10772 + }, + { + "epoch": 0.31, + "grad_norm": 1.3695639091222878, + "learning_rate": 8.049734992732975e-06, + "loss": 0.3409, + "step": 10773 + }, + { + "epoch": 0.31, + "grad_norm": 0.9999841406572839, + "learning_rate": 8.04936276310664e-06, + "loss": 0.5701, + "step": 10774 + }, + { + "epoch": 0.31, + "grad_norm": 1.5807296674257456, + "learning_rate": 8.048990506569643e-06, + "loss": 0.3285, + "step": 10775 + }, + { + "epoch": 0.31, + "grad_norm": 1.9983513233756967, + "learning_rate": 8.048618223125272e-06, + "loss": 0.3502, + "step": 10776 + }, + { + "epoch": 0.31, + "grad_norm": 1.3604271114486917, + "learning_rate": 8.04824591277681e-06, + "loss": 0.35, + "step": 10777 + }, + { + "epoch": 0.31, + "grad_norm": 1.7604381893375005, + "learning_rate": 8.047873575527547e-06, + "loss": 0.3348, + "step": 10778 + }, + { + "epoch": 0.31, + "grad_norm": 1.4769871520283098, + "learning_rate": 8.047501211380763e-06, + "loss": 0.3468, + "step": 10779 + }, + { + "epoch": 0.31, + "grad_norm": 1.4803471514726558, + "learning_rate": 8.04712882033975e-06, + "loss": 0.3847, + "step": 10780 + }, + { + "epoch": 0.31, + "grad_norm": 2.346863793667124, + "learning_rate": 8.04675640240779e-06, + "loss": 0.3546, + "step": 10781 + }, + { + "epoch": 0.31, + "grad_norm": 1.4839250291154933, + "learning_rate": 8.04638395758817e-06, + "loss": 0.3559, + "step": 10782 + }, + { + "epoch": 0.31, + "grad_norm": 1.3230039397255833, + "learning_rate": 8.04601148588418e-06, + "loss": 0.3598, + "step": 10783 + }, + { + "epoch": 0.31, + "grad_norm": 1.5706274176563928, + "learning_rate": 8.045638987299103e-06, + "loss": 0.3913, + "step": 10784 + }, + { + "epoch": 0.31, + "grad_norm": 1.3494631921926274, + "learning_rate": 8.045266461836228e-06, + "loss": 0.3727, + "step": 10785 + }, + { + "epoch": 0.31, + "grad_norm": 1.287289988164358, + "learning_rate": 8.044893909498843e-06, + "loss": 0.3382, + "step": 10786 + }, + { + "epoch": 0.31, + "grad_norm": 1.2920015844199313, + "learning_rate": 8.044521330290235e-06, + "loss": 0.3296, + "step": 10787 + }, + { + "epoch": 0.31, + "grad_norm": 1.7704328179989097, + "learning_rate": 8.044148724213694e-06, + "loss": 0.365, + "step": 10788 + }, + { + "epoch": 0.31, + "grad_norm": 1.3193608410572857, + "learning_rate": 8.043776091272505e-06, + "loss": 0.3881, + "step": 10789 + }, + { + "epoch": 0.31, + "grad_norm": 1.3096991620840457, + "learning_rate": 8.043403431469959e-06, + "loss": 0.3576, + "step": 10790 + }, + { + "epoch": 0.31, + "grad_norm": 1.348786460868135, + "learning_rate": 8.043030744809343e-06, + "loss": 0.3335, + "step": 10791 + }, + { + "epoch": 0.31, + "grad_norm": 1.5879629609589385, + "learning_rate": 8.04265803129395e-06, + "loss": 0.3726, + "step": 10792 + }, + { + "epoch": 0.31, + "grad_norm": 1.3329093229951494, + "learning_rate": 8.042285290927064e-06, + "loss": 0.3392, + "step": 10793 + }, + { + "epoch": 0.31, + "grad_norm": 1.214129782425606, + "learning_rate": 8.041912523711977e-06, + "loss": 0.3295, + "step": 10794 + }, + { + "epoch": 0.31, + "grad_norm": 1.3573562959306094, + "learning_rate": 8.041539729651978e-06, + "loss": 0.3297, + "step": 10795 + }, + { + "epoch": 0.31, + "grad_norm": 1.296611206873712, + "learning_rate": 8.041166908750355e-06, + "loss": 0.319, + "step": 10796 + }, + { + "epoch": 0.31, + "grad_norm": 1.3630440741794119, + "learning_rate": 8.040794061010402e-06, + "loss": 0.3496, + "step": 10797 + }, + { + "epoch": 0.31, + "grad_norm": 1.3254833267555144, + "learning_rate": 8.040421186435409e-06, + "loss": 0.3389, + "step": 10798 + }, + { + "epoch": 0.31, + "grad_norm": 1.2134779773205133, + "learning_rate": 8.040048285028663e-06, + "loss": 0.3434, + "step": 10799 + }, + { + "epoch": 0.31, + "grad_norm": 2.469651410425285, + "learning_rate": 8.03967535679346e-06, + "loss": 0.3457, + "step": 10800 + }, + { + "epoch": 0.31, + "grad_norm": 1.291540263881308, + "learning_rate": 8.039302401733085e-06, + "loss": 0.3634, + "step": 10801 + }, + { + "epoch": 0.31, + "grad_norm": 1.3579932100600896, + "learning_rate": 8.038929419850835e-06, + "loss": 0.3543, + "step": 10802 + }, + { + "epoch": 0.31, + "grad_norm": 1.27067562602353, + "learning_rate": 8.038556411149998e-06, + "loss": 0.3345, + "step": 10803 + }, + { + "epoch": 0.31, + "grad_norm": 2.3884080176723645, + "learning_rate": 8.038183375633868e-06, + "loss": 0.4006, + "step": 10804 + }, + { + "epoch": 0.31, + "grad_norm": 1.409939445957659, + "learning_rate": 8.037810313305736e-06, + "loss": 0.3489, + "step": 10805 + }, + { + "epoch": 0.31, + "grad_norm": 1.5414684453383802, + "learning_rate": 8.037437224168892e-06, + "loss": 0.3587, + "step": 10806 + }, + { + "epoch": 0.31, + "grad_norm": 1.3121464493622863, + "learning_rate": 8.037064108226632e-06, + "loss": 0.3562, + "step": 10807 + }, + { + "epoch": 0.31, + "grad_norm": 3.5139100921809656, + "learning_rate": 8.03669096548225e-06, + "loss": 0.3457, + "step": 10808 + }, + { + "epoch": 0.31, + "grad_norm": 1.62318648712254, + "learning_rate": 8.036317795939035e-06, + "loss": 0.3319, + "step": 10809 + }, + { + "epoch": 0.31, + "grad_norm": 1.3197240014993772, + "learning_rate": 8.03594459960028e-06, + "loss": 0.3589, + "step": 10810 + }, + { + "epoch": 0.31, + "grad_norm": 1.425110911063953, + "learning_rate": 8.035571376469283e-06, + "loss": 0.3832, + "step": 10811 + }, + { + "epoch": 0.31, + "grad_norm": 1.304248230511152, + "learning_rate": 8.035198126549333e-06, + "loss": 0.3693, + "step": 10812 + }, + { + "epoch": 0.31, + "grad_norm": 1.3101280194309448, + "learning_rate": 8.034824849843725e-06, + "loss": 0.3516, + "step": 10813 + }, + { + "epoch": 0.31, + "grad_norm": 1.3530749480072357, + "learning_rate": 8.034451546355756e-06, + "loss": 0.359, + "step": 10814 + }, + { + "epoch": 0.31, + "grad_norm": 1.4349213910520486, + "learning_rate": 8.034078216088718e-06, + "loss": 0.3561, + "step": 10815 + }, + { + "epoch": 0.31, + "grad_norm": 1.2991257181000304, + "learning_rate": 8.033704859045906e-06, + "loss": 0.3448, + "step": 10816 + }, + { + "epoch": 0.31, + "grad_norm": 1.379379074515039, + "learning_rate": 8.033331475230615e-06, + "loss": 0.339, + "step": 10817 + }, + { + "epoch": 0.31, + "grad_norm": 1.3294915930249347, + "learning_rate": 8.03295806464614e-06, + "loss": 0.3477, + "step": 10818 + }, + { + "epoch": 0.31, + "grad_norm": 1.4492898891853176, + "learning_rate": 8.032584627295775e-06, + "loss": 0.3369, + "step": 10819 + }, + { + "epoch": 0.31, + "grad_norm": 1.3252082724998528, + "learning_rate": 8.03221116318282e-06, + "loss": 0.3332, + "step": 10820 + }, + { + "epoch": 0.31, + "grad_norm": 1.3410687237446204, + "learning_rate": 8.031837672310565e-06, + "loss": 0.3238, + "step": 10821 + }, + { + "epoch": 0.31, + "grad_norm": 1.5490180783843035, + "learning_rate": 8.031464154682312e-06, + "loss": 0.3498, + "step": 10822 + }, + { + "epoch": 0.31, + "grad_norm": 1.2323768440219807, + "learning_rate": 8.031090610301351e-06, + "loss": 0.3209, + "step": 10823 + }, + { + "epoch": 0.31, + "grad_norm": 1.420017821276226, + "learning_rate": 8.030717039170983e-06, + "loss": 0.3353, + "step": 10824 + }, + { + "epoch": 0.31, + "grad_norm": 1.2752833733067162, + "learning_rate": 8.030343441294504e-06, + "loss": 0.3504, + "step": 10825 + }, + { + "epoch": 0.31, + "grad_norm": 1.5054204175728922, + "learning_rate": 8.02996981667521e-06, + "loss": 0.3469, + "step": 10826 + }, + { + "epoch": 0.31, + "grad_norm": 1.6305928318995506, + "learning_rate": 8.029596165316399e-06, + "loss": 0.3259, + "step": 10827 + }, + { + "epoch": 0.31, + "grad_norm": 1.3400902028533725, + "learning_rate": 8.029222487221369e-06, + "loss": 0.3409, + "step": 10828 + }, + { + "epoch": 0.31, + "grad_norm": 1.7996525328984656, + "learning_rate": 8.028848782393415e-06, + "loss": 0.3298, + "step": 10829 + }, + { + "epoch": 0.31, + "grad_norm": 1.512017610189686, + "learning_rate": 8.028475050835837e-06, + "loss": 0.3481, + "step": 10830 + }, + { + "epoch": 0.31, + "grad_norm": 1.4637029243299016, + "learning_rate": 8.028101292551935e-06, + "loss": 0.3327, + "step": 10831 + }, + { + "epoch": 0.31, + "grad_norm": 1.4113943470121944, + "learning_rate": 8.027727507545005e-06, + "loss": 0.3436, + "step": 10832 + }, + { + "epoch": 0.31, + "grad_norm": 1.647569580830791, + "learning_rate": 8.027353695818345e-06, + "loss": 0.3463, + "step": 10833 + }, + { + "epoch": 0.31, + "grad_norm": 1.4186458572248835, + "learning_rate": 8.026979857375256e-06, + "loss": 0.3358, + "step": 10834 + }, + { + "epoch": 0.31, + "grad_norm": 1.5599845077466648, + "learning_rate": 8.026605992219035e-06, + "loss": 0.3581, + "step": 10835 + }, + { + "epoch": 0.31, + "grad_norm": 1.2974931617201606, + "learning_rate": 8.026232100352984e-06, + "loss": 0.3434, + "step": 10836 + }, + { + "epoch": 0.31, + "grad_norm": 1.336084484427155, + "learning_rate": 8.025858181780401e-06, + "loss": 0.3535, + "step": 10837 + }, + { + "epoch": 0.31, + "grad_norm": 1.4950846681359313, + "learning_rate": 8.025484236504586e-06, + "loss": 0.3304, + "step": 10838 + }, + { + "epoch": 0.31, + "grad_norm": 1.5190400322058073, + "learning_rate": 8.025110264528838e-06, + "loss": 0.3776, + "step": 10839 + }, + { + "epoch": 0.31, + "grad_norm": 1.6122299057454266, + "learning_rate": 8.024736265856459e-06, + "loss": 0.3607, + "step": 10840 + }, + { + "epoch": 0.31, + "grad_norm": 1.8733469290951805, + "learning_rate": 8.024362240490748e-06, + "loss": 0.3442, + "step": 10841 + }, + { + "epoch": 0.31, + "grad_norm": 0.9313750219190716, + "learning_rate": 8.023988188435009e-06, + "loss": 0.64, + "step": 10842 + }, + { + "epoch": 0.31, + "grad_norm": 1.561856443790402, + "learning_rate": 8.02361410969254e-06, + "loss": 0.3567, + "step": 10843 + }, + { + "epoch": 0.31, + "grad_norm": 1.3458652016654558, + "learning_rate": 8.023240004266641e-06, + "loss": 0.352, + "step": 10844 + }, + { + "epoch": 0.31, + "grad_norm": 1.5012106334524655, + "learning_rate": 8.022865872160617e-06, + "loss": 0.3763, + "step": 10845 + }, + { + "epoch": 0.31, + "grad_norm": 1.5590906312536357, + "learning_rate": 8.022491713377767e-06, + "loss": 0.3358, + "step": 10846 + }, + { + "epoch": 0.31, + "grad_norm": 1.297050057188776, + "learning_rate": 8.022117527921396e-06, + "loss": 0.3418, + "step": 10847 + }, + { + "epoch": 0.31, + "grad_norm": 1.2463775423592949, + "learning_rate": 8.021743315794802e-06, + "loss": 0.3411, + "step": 10848 + }, + { + "epoch": 0.31, + "grad_norm": 1.2513433130726428, + "learning_rate": 8.021369077001291e-06, + "loss": 0.3255, + "step": 10849 + }, + { + "epoch": 0.31, + "grad_norm": 1.4119139667980751, + "learning_rate": 8.020994811544164e-06, + "loss": 0.3309, + "step": 10850 + }, + { + "epoch": 0.31, + "grad_norm": 1.3240158464389225, + "learning_rate": 8.020620519426725e-06, + "loss": 0.3185, + "step": 10851 + }, + { + "epoch": 0.31, + "grad_norm": 1.4222933533840054, + "learning_rate": 8.020246200652275e-06, + "loss": 0.3224, + "step": 10852 + }, + { + "epoch": 0.31, + "grad_norm": 0.9226452347388112, + "learning_rate": 8.019871855224122e-06, + "loss": 0.6268, + "step": 10853 + }, + { + "epoch": 0.31, + "grad_norm": 1.805186209595774, + "learning_rate": 8.019497483145564e-06, + "loss": 0.3347, + "step": 10854 + }, + { + "epoch": 0.31, + "grad_norm": 1.259742932651502, + "learning_rate": 8.019123084419907e-06, + "loss": 0.3471, + "step": 10855 + }, + { + "epoch": 0.31, + "grad_norm": 1.5709345805498265, + "learning_rate": 8.018748659050456e-06, + "loss": 0.3809, + "step": 10856 + }, + { + "epoch": 0.31, + "grad_norm": 1.436191811680819, + "learning_rate": 8.018374207040514e-06, + "loss": 0.3371, + "step": 10857 + }, + { + "epoch": 0.31, + "grad_norm": 1.4953322490923946, + "learning_rate": 8.017999728393385e-06, + "loss": 0.3046, + "step": 10858 + }, + { + "epoch": 0.31, + "grad_norm": 1.464419441766682, + "learning_rate": 8.017625223112376e-06, + "loss": 0.3287, + "step": 10859 + }, + { + "epoch": 0.31, + "grad_norm": 1.7955721921317573, + "learning_rate": 8.017250691200791e-06, + "loss": 0.3577, + "step": 10860 + }, + { + "epoch": 0.32, + "grad_norm": 1.244958228077306, + "learning_rate": 8.016876132661936e-06, + "loss": 0.3422, + "step": 10861 + }, + { + "epoch": 0.32, + "grad_norm": 2.7332262144216948, + "learning_rate": 8.016501547499115e-06, + "loss": 0.3459, + "step": 10862 + }, + { + "epoch": 0.32, + "grad_norm": 1.3530161776303595, + "learning_rate": 8.016126935715634e-06, + "loss": 0.3469, + "step": 10863 + }, + { + "epoch": 0.32, + "grad_norm": 1.2198058882193763, + "learning_rate": 8.015752297314801e-06, + "loss": 0.34, + "step": 10864 + }, + { + "epoch": 0.32, + "grad_norm": 1.436011098496844, + "learning_rate": 8.015377632299919e-06, + "loss": 0.3535, + "step": 10865 + }, + { + "epoch": 0.32, + "grad_norm": 1.242970967517421, + "learning_rate": 8.015002940674295e-06, + "loss": 0.3207, + "step": 10866 + }, + { + "epoch": 0.32, + "grad_norm": 1.5404386643790167, + "learning_rate": 8.01462822244124e-06, + "loss": 0.3217, + "step": 10867 + }, + { + "epoch": 0.32, + "grad_norm": 1.3544108952047362, + "learning_rate": 8.014253477604054e-06, + "loss": 0.3468, + "step": 10868 + }, + { + "epoch": 0.32, + "grad_norm": 1.3020436164208589, + "learning_rate": 8.013878706166048e-06, + "loss": 0.3492, + "step": 10869 + }, + { + "epoch": 0.32, + "grad_norm": 1.3022360493814729, + "learning_rate": 8.01350390813053e-06, + "loss": 0.4134, + "step": 10870 + }, + { + "epoch": 0.32, + "grad_norm": 1.220729728434775, + "learning_rate": 8.013129083500807e-06, + "loss": 0.3282, + "step": 10871 + }, + { + "epoch": 0.32, + "grad_norm": 2.0077615946983434, + "learning_rate": 8.012754232280185e-06, + "loss": 0.3326, + "step": 10872 + }, + { + "epoch": 0.32, + "grad_norm": 1.3097921125048009, + "learning_rate": 8.012379354471976e-06, + "loss": 0.3422, + "step": 10873 + }, + { + "epoch": 0.32, + "grad_norm": 1.8511004471785446, + "learning_rate": 8.012004450079483e-06, + "loss": 0.3419, + "step": 10874 + }, + { + "epoch": 0.32, + "grad_norm": 1.6105517247382006, + "learning_rate": 8.01162951910602e-06, + "loss": 0.3387, + "step": 10875 + }, + { + "epoch": 0.32, + "grad_norm": 1.398684254794178, + "learning_rate": 8.01125456155489e-06, + "loss": 0.3508, + "step": 10876 + }, + { + "epoch": 0.32, + "grad_norm": 1.3295112999617884, + "learning_rate": 8.010879577429406e-06, + "loss": 0.3239, + "step": 10877 + }, + { + "epoch": 0.32, + "grad_norm": 1.6338238531902032, + "learning_rate": 8.010504566732879e-06, + "loss": 0.3662, + "step": 10878 + }, + { + "epoch": 0.32, + "grad_norm": 1.2916690061821734, + "learning_rate": 8.010129529468614e-06, + "loss": 0.3436, + "step": 10879 + }, + { + "epoch": 0.32, + "grad_norm": 1.3239741149400523, + "learning_rate": 8.00975446563992e-06, + "loss": 0.3162, + "step": 10880 + }, + { + "epoch": 0.32, + "grad_norm": 2.424769879607053, + "learning_rate": 8.009379375250113e-06, + "loss": 0.3507, + "step": 10881 + }, + { + "epoch": 0.32, + "grad_norm": 1.8716650615334596, + "learning_rate": 8.009004258302497e-06, + "loss": 0.3569, + "step": 10882 + }, + { + "epoch": 0.32, + "grad_norm": 1.8240464571111719, + "learning_rate": 8.008629114800389e-06, + "loss": 0.3399, + "step": 10883 + }, + { + "epoch": 0.32, + "grad_norm": 1.2591759072159536, + "learning_rate": 8.00825394474709e-06, + "loss": 0.3332, + "step": 10884 + }, + { + "epoch": 0.32, + "grad_norm": 1.3713281381154097, + "learning_rate": 8.007878748145921e-06, + "loss": 0.3499, + "step": 10885 + }, + { + "epoch": 0.32, + "grad_norm": 1.3780179067469873, + "learning_rate": 8.007503525000188e-06, + "loss": 0.3009, + "step": 10886 + }, + { + "epoch": 0.32, + "grad_norm": 1.3695272171707433, + "learning_rate": 8.0071282753132e-06, + "loss": 0.3281, + "step": 10887 + }, + { + "epoch": 0.32, + "grad_norm": 1.28047738386142, + "learning_rate": 8.006752999088275e-06, + "loss": 0.3477, + "step": 10888 + }, + { + "epoch": 0.32, + "grad_norm": 1.2745347478093643, + "learning_rate": 8.00637769632872e-06, + "loss": 0.3333, + "step": 10889 + }, + { + "epoch": 0.32, + "grad_norm": 1.41520906331898, + "learning_rate": 8.00600236703785e-06, + "loss": 0.3442, + "step": 10890 + }, + { + "epoch": 0.32, + "grad_norm": 1.2764399073478625, + "learning_rate": 8.005627011218976e-06, + "loss": 0.3524, + "step": 10891 + }, + { + "epoch": 0.32, + "grad_norm": 1.2920458923430218, + "learning_rate": 8.005251628875407e-06, + "loss": 0.3657, + "step": 10892 + }, + { + "epoch": 0.32, + "grad_norm": 1.3740116388498187, + "learning_rate": 8.004876220010462e-06, + "loss": 0.3312, + "step": 10893 + }, + { + "epoch": 0.32, + "grad_norm": 1.4053558615831039, + "learning_rate": 8.004500784627449e-06, + "loss": 0.3351, + "step": 10894 + }, + { + "epoch": 0.32, + "grad_norm": 1.345105552751702, + "learning_rate": 8.004125322729684e-06, + "loss": 0.3413, + "step": 10895 + }, + { + "epoch": 0.32, + "grad_norm": 1.4952177985421138, + "learning_rate": 8.00374983432048e-06, + "loss": 0.3556, + "step": 10896 + }, + { + "epoch": 0.32, + "grad_norm": 1.4746689650033555, + "learning_rate": 8.00337431940315e-06, + "loss": 0.364, + "step": 10897 + }, + { + "epoch": 0.32, + "grad_norm": 1.315199468614717, + "learning_rate": 8.002998777981011e-06, + "loss": 0.3326, + "step": 10898 + }, + { + "epoch": 0.32, + "grad_norm": 1.4829459316636728, + "learning_rate": 8.002623210057369e-06, + "loss": 0.342, + "step": 10899 + }, + { + "epoch": 0.32, + "grad_norm": 1.3822503349173692, + "learning_rate": 8.002247615635548e-06, + "loss": 0.3616, + "step": 10900 + }, + { + "epoch": 0.32, + "grad_norm": 2.0182794586793733, + "learning_rate": 8.001871994718857e-06, + "loss": 0.3537, + "step": 10901 + }, + { + "epoch": 0.32, + "grad_norm": 1.3292726443986609, + "learning_rate": 8.001496347310614e-06, + "loss": 0.3421, + "step": 10902 + }, + { + "epoch": 0.32, + "grad_norm": 1.5105340206830569, + "learning_rate": 8.00112067341413e-06, + "loss": 0.3605, + "step": 10903 + }, + { + "epoch": 0.32, + "grad_norm": 1.7931837611777999, + "learning_rate": 8.000744973032725e-06, + "loss": 0.3787, + "step": 10904 + }, + { + "epoch": 0.32, + "grad_norm": 1.4332563063778212, + "learning_rate": 8.00036924616971e-06, + "loss": 0.359, + "step": 10905 + }, + { + "epoch": 0.32, + "grad_norm": 1.811719033021975, + "learning_rate": 7.999993492828404e-06, + "loss": 0.3673, + "step": 10906 + }, + { + "epoch": 0.32, + "grad_norm": 1.2136915123058392, + "learning_rate": 7.999617713012125e-06, + "loss": 0.3373, + "step": 10907 + }, + { + "epoch": 0.32, + "grad_norm": 1.2842186560165711, + "learning_rate": 7.999241906724184e-06, + "loss": 0.3441, + "step": 10908 + }, + { + "epoch": 0.32, + "grad_norm": 1.9604151879705185, + "learning_rate": 7.998866073967898e-06, + "loss": 0.3766, + "step": 10909 + }, + { + "epoch": 0.32, + "grad_norm": 1.317961723117389, + "learning_rate": 7.998490214746588e-06, + "loss": 0.382, + "step": 10910 + }, + { + "epoch": 0.32, + "grad_norm": 1.5832567122814478, + "learning_rate": 7.99811432906357e-06, + "loss": 0.3461, + "step": 10911 + }, + { + "epoch": 0.32, + "grad_norm": 1.3673776752177924, + "learning_rate": 7.997738416922156e-06, + "loss": 0.3506, + "step": 10912 + }, + { + "epoch": 0.32, + "grad_norm": 4.057468776957321, + "learning_rate": 7.99736247832567e-06, + "loss": 0.3301, + "step": 10913 + }, + { + "epoch": 0.32, + "grad_norm": 1.2809922562777205, + "learning_rate": 7.996986513277426e-06, + "loss": 0.3378, + "step": 10914 + }, + { + "epoch": 0.32, + "grad_norm": 1.7424068476726449, + "learning_rate": 7.996610521780742e-06, + "loss": 0.3717, + "step": 10915 + }, + { + "epoch": 0.32, + "grad_norm": 1.497707897242206, + "learning_rate": 7.996234503838937e-06, + "loss": 0.3425, + "step": 10916 + }, + { + "epoch": 0.32, + "grad_norm": 1.4817905801849374, + "learning_rate": 7.99585845945533e-06, + "loss": 0.3648, + "step": 10917 + }, + { + "epoch": 0.32, + "grad_norm": 1.333974198540138, + "learning_rate": 7.99548238863324e-06, + "loss": 0.3397, + "step": 10918 + }, + { + "epoch": 0.32, + "grad_norm": 1.2925071757976385, + "learning_rate": 7.995106291375982e-06, + "loss": 0.3046, + "step": 10919 + }, + { + "epoch": 0.32, + "grad_norm": 1.3367486441641998, + "learning_rate": 7.99473016768688e-06, + "loss": 0.3348, + "step": 10920 + }, + { + "epoch": 0.32, + "grad_norm": 1.32955923197146, + "learning_rate": 7.99435401756925e-06, + "loss": 0.354, + "step": 10921 + }, + { + "epoch": 0.32, + "grad_norm": 1.6026269244209, + "learning_rate": 7.993977841026415e-06, + "loss": 0.3532, + "step": 10922 + }, + { + "epoch": 0.32, + "grad_norm": 1.9691165474274182, + "learning_rate": 7.99360163806169e-06, + "loss": 0.3616, + "step": 10923 + }, + { + "epoch": 0.32, + "grad_norm": 1.6456792082630904, + "learning_rate": 7.993225408678396e-06, + "loss": 0.3777, + "step": 10924 + }, + { + "epoch": 0.32, + "grad_norm": 1.3954585824804495, + "learning_rate": 7.992849152879857e-06, + "loss": 0.3344, + "step": 10925 + }, + { + "epoch": 0.32, + "grad_norm": 1.3521622549572518, + "learning_rate": 7.99247287066939e-06, + "loss": 0.363, + "step": 10926 + }, + { + "epoch": 0.32, + "grad_norm": 1.3357090289832323, + "learning_rate": 7.992096562050316e-06, + "loss": 0.3503, + "step": 10927 + }, + { + "epoch": 0.32, + "grad_norm": 1.3449723663692976, + "learning_rate": 7.991720227025958e-06, + "loss": 0.3518, + "step": 10928 + }, + { + "epoch": 0.32, + "grad_norm": 1.4438643390123538, + "learning_rate": 7.991343865599635e-06, + "loss": 0.3653, + "step": 10929 + }, + { + "epoch": 0.32, + "grad_norm": 1.4608337536040643, + "learning_rate": 7.99096747777467e-06, + "loss": 0.3477, + "step": 10930 + }, + { + "epoch": 0.32, + "grad_norm": 1.377859213541068, + "learning_rate": 7.990591063554383e-06, + "loss": 0.3535, + "step": 10931 + }, + { + "epoch": 0.32, + "grad_norm": 1.4563713115677601, + "learning_rate": 7.990214622942096e-06, + "loss": 0.3699, + "step": 10932 + }, + { + "epoch": 0.32, + "grad_norm": 1.388005962705846, + "learning_rate": 7.989838155941134e-06, + "loss": 0.352, + "step": 10933 + }, + { + "epoch": 0.32, + "grad_norm": 1.2836735817131857, + "learning_rate": 7.989461662554816e-06, + "loss": 0.3469, + "step": 10934 + }, + { + "epoch": 0.32, + "grad_norm": 1.5274370890785756, + "learning_rate": 7.989085142786465e-06, + "loss": 0.3471, + "step": 10935 + }, + { + "epoch": 0.32, + "grad_norm": 1.4497900662118701, + "learning_rate": 7.988708596639405e-06, + "loss": 0.3527, + "step": 10936 + }, + { + "epoch": 0.32, + "grad_norm": 1.2569358330023215, + "learning_rate": 7.98833202411696e-06, + "loss": 0.327, + "step": 10937 + }, + { + "epoch": 0.32, + "grad_norm": 1.2977616669963685, + "learning_rate": 7.98795542522245e-06, + "loss": 0.3304, + "step": 10938 + }, + { + "epoch": 0.32, + "grad_norm": 1.225060791008355, + "learning_rate": 7.987578799959199e-06, + "loss": 0.3372, + "step": 10939 + }, + { + "epoch": 0.32, + "grad_norm": 1.3691626928044813, + "learning_rate": 7.987202148330534e-06, + "loss": 0.3414, + "step": 10940 + }, + { + "epoch": 0.32, + "grad_norm": 1.4157620534444983, + "learning_rate": 7.986825470339777e-06, + "loss": 0.3539, + "step": 10941 + }, + { + "epoch": 0.32, + "grad_norm": 1.4039837077884971, + "learning_rate": 7.986448765990251e-06, + "loss": 0.3441, + "step": 10942 + }, + { + "epoch": 0.32, + "grad_norm": 1.4219034299481574, + "learning_rate": 7.986072035285282e-06, + "loss": 0.3412, + "step": 10943 + }, + { + "epoch": 0.32, + "grad_norm": 1.5303489292100434, + "learning_rate": 7.985695278228194e-06, + "loss": 0.3486, + "step": 10944 + }, + { + "epoch": 0.32, + "grad_norm": 1.9672517002987615, + "learning_rate": 7.985318494822311e-06, + "loss": 0.3413, + "step": 10945 + }, + { + "epoch": 0.32, + "grad_norm": 1.289438694122505, + "learning_rate": 7.984941685070962e-06, + "loss": 0.3515, + "step": 10946 + }, + { + "epoch": 0.32, + "grad_norm": 4.044185886445124, + "learning_rate": 7.984564848977467e-06, + "loss": 0.3308, + "step": 10947 + }, + { + "epoch": 0.32, + "grad_norm": 1.2919313144094102, + "learning_rate": 7.984187986545154e-06, + "loss": 0.3398, + "step": 10948 + }, + { + "epoch": 0.32, + "grad_norm": 1.2056939715663273, + "learning_rate": 7.983811097777351e-06, + "loss": 0.3308, + "step": 10949 + }, + { + "epoch": 0.32, + "grad_norm": 1.2896964382560108, + "learning_rate": 7.98343418267738e-06, + "loss": 0.3326, + "step": 10950 + }, + { + "epoch": 0.32, + "grad_norm": 1.3482734438393693, + "learning_rate": 7.98305724124857e-06, + "loss": 0.3453, + "step": 10951 + }, + { + "epoch": 0.32, + "grad_norm": 1.2992181756401795, + "learning_rate": 7.982680273494248e-06, + "loss": 0.3426, + "step": 10952 + }, + { + "epoch": 0.32, + "grad_norm": 1.4937397012284972, + "learning_rate": 7.982303279417738e-06, + "loss": 0.3387, + "step": 10953 + }, + { + "epoch": 0.32, + "grad_norm": 1.3462840290971376, + "learning_rate": 7.981926259022367e-06, + "loss": 0.3397, + "step": 10954 + }, + { + "epoch": 0.32, + "grad_norm": 1.346899903717678, + "learning_rate": 7.981549212311467e-06, + "loss": 0.3637, + "step": 10955 + }, + { + "epoch": 0.32, + "grad_norm": 1.2293791516018313, + "learning_rate": 7.98117213928836e-06, + "loss": 0.3313, + "step": 10956 + }, + { + "epoch": 0.32, + "grad_norm": 1.5568933010835038, + "learning_rate": 7.980795039956377e-06, + "loss": 0.3633, + "step": 10957 + }, + { + "epoch": 0.32, + "grad_norm": 1.3374373400288864, + "learning_rate": 7.980417914318843e-06, + "loss": 0.3527, + "step": 10958 + }, + { + "epoch": 0.32, + "grad_norm": 1.3517728815831573, + "learning_rate": 7.98004076237909e-06, + "loss": 0.377, + "step": 10959 + }, + { + "epoch": 0.32, + "grad_norm": 1.3812980689328207, + "learning_rate": 7.979663584140442e-06, + "loss": 0.3264, + "step": 10960 + }, + { + "epoch": 0.32, + "grad_norm": 1.3828995779690942, + "learning_rate": 7.979286379606231e-06, + "loss": 0.3529, + "step": 10961 + }, + { + "epoch": 0.32, + "grad_norm": 1.3336444112953345, + "learning_rate": 7.978909148779786e-06, + "loss": 0.3641, + "step": 10962 + }, + { + "epoch": 0.32, + "grad_norm": 1.3206349382496991, + "learning_rate": 7.978531891664433e-06, + "loss": 0.3289, + "step": 10963 + }, + { + "epoch": 0.32, + "grad_norm": 1.471688453185538, + "learning_rate": 7.978154608263502e-06, + "loss": 0.3479, + "step": 10964 + }, + { + "epoch": 0.32, + "grad_norm": 1.802980685483335, + "learning_rate": 7.977777298580326e-06, + "loss": 0.3474, + "step": 10965 + }, + { + "epoch": 0.32, + "grad_norm": 1.2100855280225609, + "learning_rate": 7.977399962618231e-06, + "loss": 0.3451, + "step": 10966 + }, + { + "epoch": 0.32, + "grad_norm": 1.2713117392510627, + "learning_rate": 7.977022600380548e-06, + "loss": 0.3401, + "step": 10967 + }, + { + "epoch": 0.32, + "grad_norm": 1.2858833274611399, + "learning_rate": 7.976645211870608e-06, + "loss": 0.3548, + "step": 10968 + }, + { + "epoch": 0.32, + "grad_norm": 1.3197480110392974, + "learning_rate": 7.97626779709174e-06, + "loss": 0.3228, + "step": 10969 + }, + { + "epoch": 0.32, + "grad_norm": 1.3111882903779615, + "learning_rate": 7.975890356047277e-06, + "loss": 0.3591, + "step": 10970 + }, + { + "epoch": 0.32, + "grad_norm": 1.5949991483807175, + "learning_rate": 7.97551288874055e-06, + "loss": 0.3359, + "step": 10971 + }, + { + "epoch": 0.32, + "grad_norm": 1.3210117805804462, + "learning_rate": 7.975135395174887e-06, + "loss": 0.3299, + "step": 10972 + }, + { + "epoch": 0.32, + "grad_norm": 1.224382962067058, + "learning_rate": 7.97475787535362e-06, + "loss": 0.3232, + "step": 10973 + }, + { + "epoch": 0.32, + "grad_norm": 1.3550717427773389, + "learning_rate": 7.974380329280082e-06, + "loss": 0.3848, + "step": 10974 + }, + { + "epoch": 0.32, + "grad_norm": 1.4841437151096075, + "learning_rate": 7.974002756957606e-06, + "loss": 0.3408, + "step": 10975 + }, + { + "epoch": 0.32, + "grad_norm": 1.3797773101483377, + "learning_rate": 7.973625158389523e-06, + "loss": 0.3516, + "step": 10976 + }, + { + "epoch": 0.32, + "grad_norm": 1.238746528957636, + "learning_rate": 7.973247533579164e-06, + "loss": 0.3318, + "step": 10977 + }, + { + "epoch": 0.32, + "grad_norm": 1.9769194505587748, + "learning_rate": 7.972869882529863e-06, + "loss": 0.3222, + "step": 10978 + }, + { + "epoch": 0.32, + "grad_norm": 1.2473117429104914, + "learning_rate": 7.972492205244952e-06, + "loss": 0.3365, + "step": 10979 + }, + { + "epoch": 0.32, + "grad_norm": 1.3849065095064705, + "learning_rate": 7.972114501727765e-06, + "loss": 0.3723, + "step": 10980 + }, + { + "epoch": 0.32, + "grad_norm": 1.3074507985164363, + "learning_rate": 7.971736771981632e-06, + "loss": 0.3827, + "step": 10981 + }, + { + "epoch": 0.32, + "grad_norm": 2.176097201276052, + "learning_rate": 7.971359016009893e-06, + "loss": 0.3413, + "step": 10982 + }, + { + "epoch": 0.32, + "grad_norm": 1.6205310431929991, + "learning_rate": 7.970981233815874e-06, + "loss": 0.3517, + "step": 10983 + }, + { + "epoch": 0.32, + "grad_norm": 1.6263077585715442, + "learning_rate": 7.970603425402914e-06, + "loss": 0.3436, + "step": 10984 + }, + { + "epoch": 0.32, + "grad_norm": 1.6484542738655148, + "learning_rate": 7.970225590774346e-06, + "loss": 0.3421, + "step": 10985 + }, + { + "epoch": 0.32, + "grad_norm": 1.3991663942108845, + "learning_rate": 7.969847729933505e-06, + "loss": 0.3279, + "step": 10986 + }, + { + "epoch": 0.32, + "grad_norm": 1.366024111285152, + "learning_rate": 7.969469842883723e-06, + "loss": 0.3605, + "step": 10987 + }, + { + "epoch": 0.32, + "grad_norm": 1.349074932618386, + "learning_rate": 7.969091929628338e-06, + "loss": 0.3476, + "step": 10988 + }, + { + "epoch": 0.32, + "grad_norm": 1.4668901875832445, + "learning_rate": 7.968713990170685e-06, + "loss": 0.3436, + "step": 10989 + }, + { + "epoch": 0.32, + "grad_norm": 1.6330929092678077, + "learning_rate": 7.968336024514095e-06, + "loss": 0.3575, + "step": 10990 + }, + { + "epoch": 0.32, + "grad_norm": 1.3262049587183613, + "learning_rate": 7.967958032661908e-06, + "loss": 0.326, + "step": 10991 + }, + { + "epoch": 0.32, + "grad_norm": 1.298579235273806, + "learning_rate": 7.96758001461746e-06, + "loss": 0.3317, + "step": 10992 + }, + { + "epoch": 0.32, + "grad_norm": 1.4759376726910878, + "learning_rate": 7.967201970384084e-06, + "loss": 0.3283, + "step": 10993 + }, + { + "epoch": 0.32, + "grad_norm": 1.5580935132177178, + "learning_rate": 7.96682389996512e-06, + "loss": 0.33, + "step": 10994 + }, + { + "epoch": 0.32, + "grad_norm": 1.496540873628861, + "learning_rate": 7.966445803363898e-06, + "loss": 0.3377, + "step": 10995 + }, + { + "epoch": 0.32, + "grad_norm": 1.7055853875631557, + "learning_rate": 7.966067680583763e-06, + "loss": 0.3351, + "step": 10996 + }, + { + "epoch": 0.32, + "grad_norm": 1.4332792327990127, + "learning_rate": 7.965689531628044e-06, + "loss": 0.3339, + "step": 10997 + }, + { + "epoch": 0.32, + "grad_norm": 1.953496167986888, + "learning_rate": 7.965311356500084e-06, + "loss": 0.3637, + "step": 10998 + }, + { + "epoch": 0.32, + "grad_norm": 1.3295326509531626, + "learning_rate": 7.964933155203218e-06, + "loss": 0.3619, + "step": 10999 + }, + { + "epoch": 0.32, + "grad_norm": 1.825272968535574, + "learning_rate": 7.964554927740783e-06, + "loss": 0.3194, + "step": 11000 + }, + { + "epoch": 0.32, + "grad_norm": 1.3092360831003929, + "learning_rate": 7.96417667411612e-06, + "loss": 0.3808, + "step": 11001 + }, + { + "epoch": 0.32, + "grad_norm": 1.4715618315827161, + "learning_rate": 7.963798394332564e-06, + "loss": 0.4204, + "step": 11002 + }, + { + "epoch": 0.32, + "grad_norm": 1.4686410599906545, + "learning_rate": 7.963420088393453e-06, + "loss": 0.3261, + "step": 11003 + }, + { + "epoch": 0.32, + "grad_norm": 1.311851538051591, + "learning_rate": 7.96304175630213e-06, + "loss": 0.3738, + "step": 11004 + }, + { + "epoch": 0.32, + "grad_norm": 1.4934220964744405, + "learning_rate": 7.962663398061929e-06, + "loss": 0.3627, + "step": 11005 + }, + { + "epoch": 0.32, + "grad_norm": 10.16075132990031, + "learning_rate": 7.96228501367619e-06, + "loss": 0.3352, + "step": 11006 + }, + { + "epoch": 0.32, + "grad_norm": 1.6700479191754953, + "learning_rate": 7.961906603148252e-06, + "loss": 0.3578, + "step": 11007 + }, + { + "epoch": 0.32, + "grad_norm": 1.3847114187619067, + "learning_rate": 7.961528166481458e-06, + "loss": 0.3294, + "step": 11008 + }, + { + "epoch": 0.32, + "grad_norm": 1.33213109103999, + "learning_rate": 7.961149703679143e-06, + "loss": 0.3265, + "step": 11009 + }, + { + "epoch": 0.32, + "grad_norm": 1.4560834829740452, + "learning_rate": 7.960771214744652e-06, + "loss": 0.3413, + "step": 11010 + }, + { + "epoch": 0.32, + "grad_norm": 1.7948029735477835, + "learning_rate": 7.96039269968132e-06, + "loss": 0.3715, + "step": 11011 + }, + { + "epoch": 0.32, + "grad_norm": 1.5767234272277222, + "learning_rate": 7.960014158492489e-06, + "loss": 0.3742, + "step": 11012 + }, + { + "epoch": 0.32, + "grad_norm": 1.5090746189374775, + "learning_rate": 7.9596355911815e-06, + "loss": 0.3782, + "step": 11013 + }, + { + "epoch": 0.32, + "grad_norm": 1.3355574856416923, + "learning_rate": 7.959256997751694e-06, + "loss": 0.3458, + "step": 11014 + }, + { + "epoch": 0.32, + "grad_norm": 1.3925577201849608, + "learning_rate": 7.958878378206414e-06, + "loss": 0.3388, + "step": 11015 + }, + { + "epoch": 0.32, + "grad_norm": 1.3469495258515196, + "learning_rate": 7.958499732548998e-06, + "loss": 0.3243, + "step": 11016 + }, + { + "epoch": 0.32, + "grad_norm": 1.2574781290854071, + "learning_rate": 7.95812106078279e-06, + "loss": 0.3656, + "step": 11017 + }, + { + "epoch": 0.32, + "grad_norm": 1.3372129358375187, + "learning_rate": 7.957742362911129e-06, + "loss": 0.3563, + "step": 11018 + }, + { + "epoch": 0.32, + "grad_norm": 1.2602117786573896, + "learning_rate": 7.957363638937362e-06, + "loss": 0.3356, + "step": 11019 + }, + { + "epoch": 0.32, + "grad_norm": 1.2976938195265069, + "learning_rate": 7.956984888864826e-06, + "loss": 0.3476, + "step": 11020 + }, + { + "epoch": 0.32, + "grad_norm": 1.3479647241544497, + "learning_rate": 7.956606112696865e-06, + "loss": 0.3393, + "step": 11021 + }, + { + "epoch": 0.32, + "grad_norm": 1.5303707973353193, + "learning_rate": 7.956227310436824e-06, + "loss": 0.3689, + "step": 11022 + }, + { + "epoch": 0.32, + "grad_norm": 1.5310674492392593, + "learning_rate": 7.955848482088042e-06, + "loss": 0.363, + "step": 11023 + }, + { + "epoch": 0.32, + "grad_norm": 1.4821240983923245, + "learning_rate": 7.955469627653867e-06, + "loss": 0.331, + "step": 11024 + }, + { + "epoch": 0.32, + "grad_norm": 1.4702122380125267, + "learning_rate": 7.955090747137638e-06, + "loss": 0.3775, + "step": 11025 + }, + { + "epoch": 0.32, + "grad_norm": 1.2099016245740586, + "learning_rate": 7.9547118405427e-06, + "loss": 0.3236, + "step": 11026 + }, + { + "epoch": 0.32, + "grad_norm": 1.3683443486543552, + "learning_rate": 7.954332907872398e-06, + "loss": 0.3466, + "step": 11027 + }, + { + "epoch": 0.32, + "grad_norm": 1.862535181192959, + "learning_rate": 7.953953949130076e-06, + "loss": 0.3372, + "step": 11028 + }, + { + "epoch": 0.32, + "grad_norm": 1.3914521059243374, + "learning_rate": 7.953574964319077e-06, + "loss": 0.3543, + "step": 11029 + }, + { + "epoch": 0.32, + "grad_norm": 1.3109598941678875, + "learning_rate": 7.953195953442745e-06, + "loss": 0.3509, + "step": 11030 + }, + { + "epoch": 0.32, + "grad_norm": 1.3295291012965516, + "learning_rate": 7.952816916504426e-06, + "loss": 0.3437, + "step": 11031 + }, + { + "epoch": 0.32, + "grad_norm": 1.2604049083618558, + "learning_rate": 7.952437853507467e-06, + "loss": 0.3484, + "step": 11032 + }, + { + "epoch": 0.32, + "grad_norm": 1.7811444610801743, + "learning_rate": 7.952058764455212e-06, + "loss": 0.3317, + "step": 11033 + }, + { + "epoch": 0.32, + "grad_norm": 1.494055583491827, + "learning_rate": 7.951679649351003e-06, + "loss": 0.344, + "step": 11034 + }, + { + "epoch": 0.32, + "grad_norm": 1.4462734495823246, + "learning_rate": 7.95130050819819e-06, + "loss": 0.352, + "step": 11035 + }, + { + "epoch": 0.32, + "grad_norm": 1.5035083216186798, + "learning_rate": 7.950921341000116e-06, + "loss": 0.3339, + "step": 11036 + }, + { + "epoch": 0.32, + "grad_norm": 1.3419630665338609, + "learning_rate": 7.950542147760128e-06, + "loss": 0.3253, + "step": 11037 + }, + { + "epoch": 0.32, + "grad_norm": 1.335353275102605, + "learning_rate": 7.950162928481574e-06, + "loss": 0.3543, + "step": 11038 + }, + { + "epoch": 0.32, + "grad_norm": 1.3100400206543243, + "learning_rate": 7.9497836831678e-06, + "loss": 0.3322, + "step": 11039 + }, + { + "epoch": 0.32, + "grad_norm": 0.9549914993973273, + "learning_rate": 7.94940441182215e-06, + "loss": 0.6048, + "step": 11040 + }, + { + "epoch": 0.32, + "grad_norm": 1.32026851825988, + "learning_rate": 7.949025114447974e-06, + "loss": 0.3479, + "step": 11041 + }, + { + "epoch": 0.32, + "grad_norm": 1.3969756107918587, + "learning_rate": 7.948645791048622e-06, + "loss": 0.3792, + "step": 11042 + }, + { + "epoch": 0.32, + "grad_norm": 1.3490057893389442, + "learning_rate": 7.948266441627434e-06, + "loss": 0.3544, + "step": 11043 + }, + { + "epoch": 0.32, + "grad_norm": 1.8344708514086234, + "learning_rate": 7.947887066187764e-06, + "loss": 0.3546, + "step": 11044 + }, + { + "epoch": 0.32, + "grad_norm": 1.350233871889042, + "learning_rate": 7.947507664732957e-06, + "loss": 0.3441, + "step": 11045 + }, + { + "epoch": 0.32, + "grad_norm": 1.380289714992265, + "learning_rate": 7.947128237266363e-06, + "loss": 0.3288, + "step": 11046 + }, + { + "epoch": 0.32, + "grad_norm": 1.3432224211584916, + "learning_rate": 7.946748783791329e-06, + "loss": 0.3254, + "step": 11047 + }, + { + "epoch": 0.32, + "grad_norm": 1.338594212264528, + "learning_rate": 7.946369304311205e-06, + "loss": 0.3626, + "step": 11048 + }, + { + "epoch": 0.32, + "grad_norm": 1.8482789287986168, + "learning_rate": 7.945989798829339e-06, + "loss": 0.3311, + "step": 11049 + }, + { + "epoch": 0.32, + "grad_norm": 1.2549718467293502, + "learning_rate": 7.94561026734908e-06, + "loss": 0.3276, + "step": 11050 + }, + { + "epoch": 0.32, + "grad_norm": 1.925818188396775, + "learning_rate": 7.945230709873778e-06, + "loss": 0.3782, + "step": 11051 + }, + { + "epoch": 0.32, + "grad_norm": 1.5557339376557668, + "learning_rate": 7.944851126406782e-06, + "loss": 0.3539, + "step": 11052 + }, + { + "epoch": 0.32, + "grad_norm": 1.7150858058110467, + "learning_rate": 7.944471516951442e-06, + "loss": 0.3446, + "step": 11053 + }, + { + "epoch": 0.32, + "grad_norm": 1.3241379678333662, + "learning_rate": 7.94409188151111e-06, + "loss": 0.3402, + "step": 11054 + }, + { + "epoch": 0.32, + "grad_norm": 1.3441369804903551, + "learning_rate": 7.943712220089132e-06, + "loss": 0.3791, + "step": 11055 + }, + { + "epoch": 0.32, + "grad_norm": 2.5785090544756923, + "learning_rate": 7.943332532688861e-06, + "loss": 0.3715, + "step": 11056 + }, + { + "epoch": 0.32, + "grad_norm": 1.7217513826448865, + "learning_rate": 7.942952819313649e-06, + "loss": 0.3716, + "step": 11057 + }, + { + "epoch": 0.32, + "grad_norm": 1.4139192597916856, + "learning_rate": 7.942573079966846e-06, + "loss": 0.3727, + "step": 11058 + }, + { + "epoch": 0.32, + "grad_norm": 1.3971537655024826, + "learning_rate": 7.942193314651802e-06, + "loss": 0.3655, + "step": 11059 + }, + { + "epoch": 0.32, + "grad_norm": 1.6325597839931505, + "learning_rate": 7.94181352337187e-06, + "loss": 0.3522, + "step": 11060 + }, + { + "epoch": 0.32, + "grad_norm": 1.5914931244232906, + "learning_rate": 7.9414337061304e-06, + "loss": 0.3649, + "step": 11061 + }, + { + "epoch": 0.32, + "grad_norm": 1.6890460741204072, + "learning_rate": 7.941053862930745e-06, + "loss": 0.3318, + "step": 11062 + }, + { + "epoch": 0.32, + "grad_norm": 1.3762330159805833, + "learning_rate": 7.940673993776258e-06, + "loss": 0.3269, + "step": 11063 + }, + { + "epoch": 0.32, + "grad_norm": 1.308796557266379, + "learning_rate": 7.940294098670291e-06, + "loss": 0.3463, + "step": 11064 + }, + { + "epoch": 0.32, + "grad_norm": 1.4236606063229242, + "learning_rate": 7.939914177616195e-06, + "loss": 0.3388, + "step": 11065 + }, + { + "epoch": 0.32, + "grad_norm": 1.4655166321098698, + "learning_rate": 7.939534230617323e-06, + "loss": 0.3441, + "step": 11066 + }, + { + "epoch": 0.32, + "grad_norm": 1.4594190308086088, + "learning_rate": 7.93915425767703e-06, + "loss": 0.3744, + "step": 11067 + }, + { + "epoch": 0.32, + "grad_norm": 1.3474783102137489, + "learning_rate": 7.938774258798667e-06, + "loss": 0.3336, + "step": 11068 + }, + { + "epoch": 0.32, + "grad_norm": 1.4232968562894854, + "learning_rate": 7.93839423398559e-06, + "loss": 0.3219, + "step": 11069 + }, + { + "epoch": 0.32, + "grad_norm": 1.3472925186264288, + "learning_rate": 7.93801418324115e-06, + "loss": 0.3232, + "step": 11070 + }, + { + "epoch": 0.32, + "grad_norm": 1.2839259896170188, + "learning_rate": 7.937634106568703e-06, + "loss": 0.3343, + "step": 11071 + }, + { + "epoch": 0.32, + "grad_norm": 1.8258451460115481, + "learning_rate": 7.937254003971602e-06, + "loss": 0.3191, + "step": 11072 + }, + { + "epoch": 0.32, + "grad_norm": 1.331289116588754, + "learning_rate": 7.936873875453203e-06, + "loss": 0.3265, + "step": 11073 + }, + { + "epoch": 0.32, + "grad_norm": 1.405368045740166, + "learning_rate": 7.936493721016857e-06, + "loss": 0.3319, + "step": 11074 + }, + { + "epoch": 0.32, + "grad_norm": 1.5846080142387744, + "learning_rate": 7.936113540665923e-06, + "loss": 0.3661, + "step": 11075 + }, + { + "epoch": 0.32, + "grad_norm": 1.2173663915354562, + "learning_rate": 7.935733334403755e-06, + "loss": 0.3327, + "step": 11076 + }, + { + "epoch": 0.32, + "grad_norm": 1.427046402805686, + "learning_rate": 7.935353102233707e-06, + "loss": 0.3329, + "step": 11077 + }, + { + "epoch": 0.32, + "grad_norm": 1.7094237042920166, + "learning_rate": 7.934972844159135e-06, + "loss": 0.3611, + "step": 11078 + }, + { + "epoch": 0.32, + "grad_norm": 3.319075984130112, + "learning_rate": 7.934592560183397e-06, + "loss": 0.3422, + "step": 11079 + }, + { + "epoch": 0.32, + "grad_norm": 1.483992252632356, + "learning_rate": 7.934212250309844e-06, + "loss": 0.3509, + "step": 11080 + }, + { + "epoch": 0.32, + "grad_norm": 1.741783613954823, + "learning_rate": 7.933831914541837e-06, + "loss": 0.327, + "step": 11081 + }, + { + "epoch": 0.32, + "grad_norm": 1.4596894248912287, + "learning_rate": 7.93345155288273e-06, + "loss": 0.3512, + "step": 11082 + }, + { + "epoch": 0.32, + "grad_norm": 1.5543423444661328, + "learning_rate": 7.933071165335882e-06, + "loss": 0.3338, + "step": 11083 + }, + { + "epoch": 0.32, + "grad_norm": 1.3760360783630585, + "learning_rate": 7.932690751904648e-06, + "loss": 0.3552, + "step": 11084 + }, + { + "epoch": 0.32, + "grad_norm": 3.3501107927907197, + "learning_rate": 7.932310312592384e-06, + "loss": 0.3619, + "step": 11085 + }, + { + "epoch": 0.32, + "grad_norm": 1.3108011789846414, + "learning_rate": 7.93192984740245e-06, + "loss": 0.3413, + "step": 11086 + }, + { + "epoch": 0.32, + "grad_norm": 1.5126931047340728, + "learning_rate": 7.931549356338203e-06, + "loss": 0.3664, + "step": 11087 + }, + { + "epoch": 0.32, + "grad_norm": 1.3770937816745261, + "learning_rate": 7.931168839403e-06, + "loss": 0.3316, + "step": 11088 + }, + { + "epoch": 0.32, + "grad_norm": 1.3351376715495957, + "learning_rate": 7.9307882966002e-06, + "loss": 0.3498, + "step": 11089 + }, + { + "epoch": 0.32, + "grad_norm": 1.5586508376339858, + "learning_rate": 7.930407727933161e-06, + "loss": 0.3366, + "step": 11090 + }, + { + "epoch": 0.32, + "grad_norm": 1.6792314820606538, + "learning_rate": 7.93002713340524e-06, + "loss": 0.3576, + "step": 11091 + }, + { + "epoch": 0.32, + "grad_norm": 1.6608116530896364, + "learning_rate": 7.9296465130198e-06, + "loss": 0.3706, + "step": 11092 + }, + { + "epoch": 0.32, + "grad_norm": 1.3246188132304013, + "learning_rate": 7.929265866780194e-06, + "loss": 0.3279, + "step": 11093 + }, + { + "epoch": 0.32, + "grad_norm": 1.4438342255751433, + "learning_rate": 7.928885194689785e-06, + "loss": 0.3345, + "step": 11094 + }, + { + "epoch": 0.32, + "grad_norm": 1.5762615238739546, + "learning_rate": 7.92850449675193e-06, + "loss": 0.3576, + "step": 11095 + }, + { + "epoch": 0.32, + "grad_norm": 1.5154067298338108, + "learning_rate": 7.928123772969991e-06, + "loss": 0.3496, + "step": 11096 + }, + { + "epoch": 0.32, + "grad_norm": 1.4132807279983608, + "learning_rate": 7.92774302334733e-06, + "loss": 0.3537, + "step": 11097 + }, + { + "epoch": 0.32, + "grad_norm": 1.4947207101509692, + "learning_rate": 7.927362247887301e-06, + "loss": 0.3592, + "step": 11098 + }, + { + "epoch": 0.32, + "grad_norm": 1.4979217621014072, + "learning_rate": 7.92698144659327e-06, + "loss": 0.3382, + "step": 11099 + }, + { + "epoch": 0.32, + "grad_norm": 1.3031211091679795, + "learning_rate": 7.926600619468594e-06, + "loss": 0.3354, + "step": 11100 + }, + { + "epoch": 0.32, + "grad_norm": 1.4525663738330783, + "learning_rate": 7.926219766516637e-06, + "loss": 0.3372, + "step": 11101 + }, + { + "epoch": 0.32, + "grad_norm": 1.7701640404012653, + "learning_rate": 7.925838887740756e-06, + "loss": 0.3332, + "step": 11102 + }, + { + "epoch": 0.32, + "grad_norm": 1.3360887676940008, + "learning_rate": 7.925457983144314e-06, + "loss": 0.3827, + "step": 11103 + }, + { + "epoch": 0.32, + "grad_norm": 1.293411759924918, + "learning_rate": 7.925077052730674e-06, + "loss": 0.3334, + "step": 11104 + }, + { + "epoch": 0.32, + "grad_norm": 1.3454350227414003, + "learning_rate": 7.924696096503197e-06, + "loss": 0.3449, + "step": 11105 + }, + { + "epoch": 0.32, + "grad_norm": 1.4707653325018484, + "learning_rate": 7.924315114465245e-06, + "loss": 0.3495, + "step": 11106 + }, + { + "epoch": 0.32, + "grad_norm": 1.271935825313364, + "learning_rate": 7.923934106620178e-06, + "loss": 0.322, + "step": 11107 + }, + { + "epoch": 0.32, + "grad_norm": 1.9021997321052675, + "learning_rate": 7.92355307297136e-06, + "loss": 0.3369, + "step": 11108 + }, + { + "epoch": 0.32, + "grad_norm": 1.3618349780203414, + "learning_rate": 7.923172013522153e-06, + "loss": 0.3257, + "step": 11109 + }, + { + "epoch": 0.32, + "grad_norm": 1.3762865615791813, + "learning_rate": 7.922790928275923e-06, + "loss": 0.3252, + "step": 11110 + }, + { + "epoch": 0.32, + "grad_norm": 1.4753845222457835, + "learning_rate": 7.92240981723603e-06, + "loss": 0.3465, + "step": 11111 + }, + { + "epoch": 0.32, + "grad_norm": 1.608289106540632, + "learning_rate": 7.922028680405838e-06, + "loss": 0.3626, + "step": 11112 + }, + { + "epoch": 0.32, + "grad_norm": 1.4599939512219438, + "learning_rate": 7.921647517788709e-06, + "loss": 0.3872, + "step": 11113 + }, + { + "epoch": 0.32, + "grad_norm": 1.7681710078767923, + "learning_rate": 7.921266329388008e-06, + "loss": 0.3575, + "step": 11114 + }, + { + "epoch": 0.32, + "grad_norm": 1.3085148204186774, + "learning_rate": 7.920885115207102e-06, + "loss": 0.3184, + "step": 11115 + }, + { + "epoch": 0.32, + "grad_norm": 1.2717694965987296, + "learning_rate": 7.92050387524935e-06, + "loss": 0.3259, + "step": 11116 + }, + { + "epoch": 0.32, + "grad_norm": 1.481711883337526, + "learning_rate": 7.92012260951812e-06, + "loss": 0.365, + "step": 11117 + }, + { + "epoch": 0.32, + "grad_norm": 1.2904359178449283, + "learning_rate": 7.919741318016775e-06, + "loss": 0.3306, + "step": 11118 + }, + { + "epoch": 0.32, + "grad_norm": 1.3437001279953291, + "learning_rate": 7.919360000748682e-06, + "loss": 0.3488, + "step": 11119 + }, + { + "epoch": 0.32, + "grad_norm": 1.5540281125241573, + "learning_rate": 7.918978657717204e-06, + "loss": 0.3351, + "step": 11120 + }, + { + "epoch": 0.32, + "grad_norm": 1.3692834277426902, + "learning_rate": 7.918597288925704e-06, + "loss": 0.33, + "step": 11121 + }, + { + "epoch": 0.32, + "grad_norm": 1.4196131560722771, + "learning_rate": 7.918215894377555e-06, + "loss": 0.3509, + "step": 11122 + }, + { + "epoch": 0.32, + "grad_norm": 1.7320745952850944, + "learning_rate": 7.917834474076116e-06, + "loss": 0.3641, + "step": 11123 + }, + { + "epoch": 0.32, + "grad_norm": 1.4038611151024434, + "learning_rate": 7.917453028024756e-06, + "loss": 0.3705, + "step": 11124 + }, + { + "epoch": 0.32, + "grad_norm": 1.5230171101124663, + "learning_rate": 7.91707155622684e-06, + "loss": 0.3363, + "step": 11125 + }, + { + "epoch": 0.32, + "grad_norm": 1.3770601520319037, + "learning_rate": 7.916690058685735e-06, + "loss": 0.3602, + "step": 11126 + }, + { + "epoch": 0.32, + "grad_norm": 1.6516614788436499, + "learning_rate": 7.916308535404809e-06, + "loss": 0.3297, + "step": 11127 + }, + { + "epoch": 0.32, + "grad_norm": 1.6179942730714993, + "learning_rate": 7.915926986387427e-06, + "loss": 0.3302, + "step": 11128 + }, + { + "epoch": 0.32, + "grad_norm": 1.4958989448044848, + "learning_rate": 7.915545411636957e-06, + "loss": 0.3687, + "step": 11129 + }, + { + "epoch": 0.32, + "grad_norm": 1.413694185104905, + "learning_rate": 7.915163811156767e-06, + "loss": 0.3629, + "step": 11130 + }, + { + "epoch": 0.32, + "grad_norm": 1.3411325820056454, + "learning_rate": 7.914782184950223e-06, + "loss": 0.3274, + "step": 11131 + }, + { + "epoch": 0.32, + "grad_norm": 1.8118601766796, + "learning_rate": 7.914400533020695e-06, + "loss": 0.344, + "step": 11132 + }, + { + "epoch": 0.32, + "grad_norm": 1.5280656663440104, + "learning_rate": 7.91401885537155e-06, + "loss": 0.321, + "step": 11133 + }, + { + "epoch": 0.32, + "grad_norm": 1.4731034398816651, + "learning_rate": 7.913637152006155e-06, + "loss": 0.3389, + "step": 11134 + }, + { + "epoch": 0.32, + "grad_norm": 1.6006486376021791, + "learning_rate": 7.913255422927882e-06, + "loss": 0.3293, + "step": 11135 + }, + { + "epoch": 0.32, + "grad_norm": 1.6048137186274551, + "learning_rate": 7.912873668140095e-06, + "loss": 0.3456, + "step": 11136 + }, + { + "epoch": 0.32, + "grad_norm": 1.4836928247096712, + "learning_rate": 7.912491887646166e-06, + "loss": 0.3531, + "step": 11137 + }, + { + "epoch": 0.32, + "grad_norm": 1.3713643253939387, + "learning_rate": 7.912110081449466e-06, + "loss": 0.3649, + "step": 11138 + }, + { + "epoch": 0.32, + "grad_norm": 1.6131592420308243, + "learning_rate": 7.91172824955336e-06, + "loss": 0.3757, + "step": 11139 + }, + { + "epoch": 0.32, + "grad_norm": 1.7983812559359975, + "learning_rate": 7.91134639196122e-06, + "loss": 0.3449, + "step": 11140 + }, + { + "epoch": 0.32, + "grad_norm": 1.3885954995931873, + "learning_rate": 7.910964508676418e-06, + "loss": 0.3434, + "step": 11141 + }, + { + "epoch": 0.32, + "grad_norm": 3.8031211307809922, + "learning_rate": 7.91058259970232e-06, + "loss": 0.3281, + "step": 11142 + }, + { + "epoch": 0.32, + "grad_norm": 1.566070052855502, + "learning_rate": 7.910200665042299e-06, + "loss": 0.3492, + "step": 11143 + }, + { + "epoch": 0.32, + "grad_norm": 1.554205638231855, + "learning_rate": 7.909818704699724e-06, + "loss": 0.3448, + "step": 11144 + }, + { + "epoch": 0.32, + "grad_norm": 1.5033841971898145, + "learning_rate": 7.909436718677968e-06, + "loss": 0.3472, + "step": 11145 + }, + { + "epoch": 0.32, + "grad_norm": 1.3341296383775272, + "learning_rate": 7.9090547069804e-06, + "loss": 0.3412, + "step": 11146 + }, + { + "epoch": 0.32, + "grad_norm": 1.2541404990575413, + "learning_rate": 7.908672669610391e-06, + "loss": 0.3224, + "step": 11147 + }, + { + "epoch": 0.32, + "grad_norm": 1.3618595124968982, + "learning_rate": 7.908290606571315e-06, + "loss": 0.3474, + "step": 11148 + }, + { + "epoch": 0.32, + "grad_norm": 0.9621176987039395, + "learning_rate": 7.90790851786654e-06, + "loss": 0.631, + "step": 11149 + }, + { + "epoch": 0.32, + "grad_norm": 1.6005536588739815, + "learning_rate": 7.907526403499441e-06, + "loss": 0.3334, + "step": 11150 + }, + { + "epoch": 0.32, + "grad_norm": 1.7634570355028525, + "learning_rate": 7.90714426347339e-06, + "loss": 0.3353, + "step": 11151 + }, + { + "epoch": 0.32, + "grad_norm": 2.01422171024753, + "learning_rate": 7.906762097791758e-06, + "loss": 0.3614, + "step": 11152 + }, + { + "epoch": 0.32, + "grad_norm": 1.209044432103428, + "learning_rate": 7.906379906457918e-06, + "loss": 0.3182, + "step": 11153 + }, + { + "epoch": 0.32, + "grad_norm": 1.432796251269698, + "learning_rate": 7.905997689475245e-06, + "loss": 0.3643, + "step": 11154 + }, + { + "epoch": 0.32, + "grad_norm": 1.5581076128288933, + "learning_rate": 7.905615446847107e-06, + "loss": 0.3491, + "step": 11155 + }, + { + "epoch": 0.32, + "grad_norm": 1.3913884592479533, + "learning_rate": 7.905233178576884e-06, + "loss": 0.3515, + "step": 11156 + }, + { + "epoch": 0.32, + "grad_norm": 1.4265939997390875, + "learning_rate": 7.904850884667944e-06, + "loss": 0.3377, + "step": 11157 + }, + { + "epoch": 0.32, + "grad_norm": 1.4374393416437192, + "learning_rate": 7.904468565123661e-06, + "loss": 0.3731, + "step": 11158 + }, + { + "epoch": 0.32, + "grad_norm": 1.8904885503582967, + "learning_rate": 7.904086219947413e-06, + "loss": 0.3453, + "step": 11159 + }, + { + "epoch": 0.32, + "grad_norm": 1.336761493258827, + "learning_rate": 7.903703849142573e-06, + "loss": 0.3485, + "step": 11160 + }, + { + "epoch": 0.32, + "grad_norm": 2.0053327412905473, + "learning_rate": 7.903321452712512e-06, + "loss": 0.3602, + "step": 11161 + }, + { + "epoch": 0.32, + "grad_norm": 1.3235808481222153, + "learning_rate": 7.902939030660607e-06, + "loss": 0.3428, + "step": 11162 + }, + { + "epoch": 0.32, + "grad_norm": 1.4989001198033947, + "learning_rate": 7.902556582990234e-06, + "loss": 0.3644, + "step": 11163 + }, + { + "epoch": 0.32, + "grad_norm": 1.6370866218587468, + "learning_rate": 7.902174109704766e-06, + "loss": 0.3635, + "step": 11164 + }, + { + "epoch": 0.32, + "grad_norm": 1.3725510311767781, + "learning_rate": 7.90179161080758e-06, + "loss": 0.3337, + "step": 11165 + }, + { + "epoch": 0.32, + "grad_norm": 1.4520018415659708, + "learning_rate": 7.90140908630205e-06, + "loss": 0.3229, + "step": 11166 + }, + { + "epoch": 0.32, + "grad_norm": 1.374122034803535, + "learning_rate": 7.901026536191555e-06, + "loss": 0.3343, + "step": 11167 + }, + { + "epoch": 0.32, + "grad_norm": 2.075579677349483, + "learning_rate": 7.900643960479466e-06, + "loss": 0.3402, + "step": 11168 + }, + { + "epoch": 0.32, + "grad_norm": 1.6201513712068123, + "learning_rate": 7.900261359169162e-06, + "loss": 0.3497, + "step": 11169 + }, + { + "epoch": 0.32, + "grad_norm": 1.348060565390155, + "learning_rate": 7.89987873226402e-06, + "loss": 0.3121, + "step": 11170 + }, + { + "epoch": 0.32, + "grad_norm": 1.293497156731868, + "learning_rate": 7.899496079767417e-06, + "loss": 0.3209, + "step": 11171 + }, + { + "epoch": 0.32, + "grad_norm": 2.1000312898398428, + "learning_rate": 7.899113401682728e-06, + "loss": 0.338, + "step": 11172 + }, + { + "epoch": 0.32, + "grad_norm": 1.4947164298000466, + "learning_rate": 7.898730698013331e-06, + "loss": 0.3497, + "step": 11173 + }, + { + "epoch": 0.32, + "grad_norm": 1.674323377207144, + "learning_rate": 7.898347968762602e-06, + "loss": 0.34, + "step": 11174 + }, + { + "epoch": 0.32, + "grad_norm": 1.3394557223736712, + "learning_rate": 7.897965213933923e-06, + "loss": 0.3281, + "step": 11175 + }, + { + "epoch": 0.32, + "grad_norm": 2.171960404303055, + "learning_rate": 7.897582433530668e-06, + "loss": 0.3587, + "step": 11176 + }, + { + "epoch": 0.32, + "grad_norm": 1.2735264046569323, + "learning_rate": 7.897199627556216e-06, + "loss": 0.3322, + "step": 11177 + }, + { + "epoch": 0.32, + "grad_norm": 2.233111421922039, + "learning_rate": 7.896816796013943e-06, + "loss": 0.3713, + "step": 11178 + }, + { + "epoch": 0.32, + "grad_norm": 1.700138318023511, + "learning_rate": 7.896433938907231e-06, + "loss": 0.3397, + "step": 11179 + }, + { + "epoch": 0.32, + "grad_norm": 1.5203195221777512, + "learning_rate": 7.896051056239457e-06, + "loss": 0.3757, + "step": 11180 + }, + { + "epoch": 0.32, + "grad_norm": 1.4672749007317833, + "learning_rate": 7.895668148014002e-06, + "loss": 0.3213, + "step": 11181 + }, + { + "epoch": 0.32, + "grad_norm": 1.7149584508409124, + "learning_rate": 7.895285214234242e-06, + "loss": 0.3569, + "step": 11182 + }, + { + "epoch": 0.32, + "grad_norm": 1.340288930249492, + "learning_rate": 7.894902254903557e-06, + "loss": 0.3445, + "step": 11183 + }, + { + "epoch": 0.32, + "grad_norm": 1.3221863192218988, + "learning_rate": 7.894519270025329e-06, + "loss": 0.3492, + "step": 11184 + }, + { + "epoch": 0.32, + "grad_norm": 1.6315423308446817, + "learning_rate": 7.894136259602937e-06, + "loss": 0.3465, + "step": 11185 + }, + { + "epoch": 0.32, + "grad_norm": 1.4705083739671747, + "learning_rate": 7.893753223639758e-06, + "loss": 0.3223, + "step": 11186 + }, + { + "epoch": 0.32, + "grad_norm": 2.631993967125824, + "learning_rate": 7.893370162139175e-06, + "loss": 0.3275, + "step": 11187 + }, + { + "epoch": 0.32, + "grad_norm": 1.3754447307612312, + "learning_rate": 7.892987075104572e-06, + "loss": 0.3134, + "step": 11188 + }, + { + "epoch": 0.32, + "grad_norm": 1.4367719527096148, + "learning_rate": 7.892603962539323e-06, + "loss": 0.4014, + "step": 11189 + }, + { + "epoch": 0.32, + "grad_norm": 1.3002376423785966, + "learning_rate": 7.89222082444681e-06, + "loss": 0.3181, + "step": 11190 + }, + { + "epoch": 0.32, + "grad_norm": 1.4470956529144732, + "learning_rate": 7.891837660830419e-06, + "loss": 0.3464, + "step": 11191 + }, + { + "epoch": 0.32, + "grad_norm": 1.5994636599755294, + "learning_rate": 7.891454471693529e-06, + "loss": 0.3326, + "step": 11192 + }, + { + "epoch": 0.32, + "grad_norm": 1.811439192601663, + "learning_rate": 7.891071257039519e-06, + "loss": 0.3316, + "step": 11193 + }, + { + "epoch": 0.32, + "grad_norm": 1.6931014822861412, + "learning_rate": 7.890688016871774e-06, + "loss": 0.3481, + "step": 11194 + }, + { + "epoch": 0.32, + "grad_norm": 2.7157851046016095, + "learning_rate": 7.890304751193675e-06, + "loss": 0.363, + "step": 11195 + }, + { + "epoch": 0.32, + "grad_norm": 1.52070126216732, + "learning_rate": 7.889921460008604e-06, + "loss": 0.3454, + "step": 11196 + }, + { + "epoch": 0.32, + "grad_norm": 1.56758108134774, + "learning_rate": 7.889538143319945e-06, + "loss": 0.3439, + "step": 11197 + }, + { + "epoch": 0.32, + "grad_norm": 1.2479802934596855, + "learning_rate": 7.88915480113108e-06, + "loss": 0.3541, + "step": 11198 + }, + { + "epoch": 0.32, + "grad_norm": 1.3069872816342791, + "learning_rate": 7.888771433445393e-06, + "loss": 0.3474, + "step": 11199 + }, + { + "epoch": 0.32, + "grad_norm": 1.4007008541283261, + "learning_rate": 7.888388040266263e-06, + "loss": 0.3107, + "step": 11200 + }, + { + "epoch": 0.32, + "grad_norm": 1.2717467549804873, + "learning_rate": 7.888004621597079e-06, + "loss": 0.3445, + "step": 11201 + }, + { + "epoch": 0.32, + "grad_norm": 3.944793675311623, + "learning_rate": 7.887621177441223e-06, + "loss": 0.3607, + "step": 11202 + }, + { + "epoch": 0.32, + "grad_norm": 2.8354576563043867, + "learning_rate": 7.887237707802075e-06, + "loss": 0.3591, + "step": 11203 + }, + { + "epoch": 0.32, + "grad_norm": 1.3710448502067019, + "learning_rate": 7.886854212683024e-06, + "loss": 0.3334, + "step": 11204 + }, + { + "epoch": 0.32, + "grad_norm": 1.237725266215224, + "learning_rate": 7.886470692087453e-06, + "loss": 0.3236, + "step": 11205 + }, + { + "epoch": 0.33, + "grad_norm": 1.3694709067245563, + "learning_rate": 7.886087146018744e-06, + "loss": 0.3311, + "step": 11206 + }, + { + "epoch": 0.33, + "grad_norm": 1.516753774482218, + "learning_rate": 7.885703574480286e-06, + "loss": 0.3416, + "step": 11207 + }, + { + "epoch": 0.33, + "grad_norm": 1.3824009866837814, + "learning_rate": 7.885319977475461e-06, + "loss": 0.3322, + "step": 11208 + }, + { + "epoch": 0.33, + "grad_norm": 1.3751294879770404, + "learning_rate": 7.884936355007656e-06, + "loss": 0.3346, + "step": 11209 + }, + { + "epoch": 0.33, + "grad_norm": 1.4008222687137746, + "learning_rate": 7.884552707080252e-06, + "loss": 0.3127, + "step": 11210 + }, + { + "epoch": 0.33, + "grad_norm": 1.278701024479458, + "learning_rate": 7.884169033696643e-06, + "loss": 0.3357, + "step": 11211 + }, + { + "epoch": 0.33, + "grad_norm": 1.5665625433146573, + "learning_rate": 7.883785334860208e-06, + "loss": 0.3381, + "step": 11212 + }, + { + "epoch": 0.33, + "grad_norm": 1.502370177068472, + "learning_rate": 7.883401610574338e-06, + "loss": 0.3274, + "step": 11213 + }, + { + "epoch": 0.33, + "grad_norm": 1.496771565163994, + "learning_rate": 7.883017860842412e-06, + "loss": 0.3359, + "step": 11214 + }, + { + "epoch": 0.33, + "grad_norm": 1.438010366035893, + "learning_rate": 7.882634085667826e-06, + "loss": 0.3446, + "step": 11215 + }, + { + "epoch": 0.33, + "grad_norm": 1.3010539887769352, + "learning_rate": 7.88225028505396e-06, + "loss": 0.3428, + "step": 11216 + }, + { + "epoch": 0.33, + "grad_norm": 3.1914046611275695, + "learning_rate": 7.881866459004203e-06, + "loss": 0.3535, + "step": 11217 + }, + { + "epoch": 0.33, + "grad_norm": 1.4310642014885702, + "learning_rate": 7.881482607521943e-06, + "loss": 0.3187, + "step": 11218 + }, + { + "epoch": 0.33, + "grad_norm": 1.60122171580658, + "learning_rate": 7.881098730610569e-06, + "loss": 0.3538, + "step": 11219 + }, + { + "epoch": 0.33, + "grad_norm": 1.4814467681007266, + "learning_rate": 7.880714828273464e-06, + "loss": 0.3572, + "step": 11220 + }, + { + "epoch": 0.33, + "grad_norm": 1.2728635798682872, + "learning_rate": 7.88033090051402e-06, + "loss": 0.3611, + "step": 11221 + }, + { + "epoch": 0.33, + "grad_norm": 1.3467430734459858, + "learning_rate": 7.879946947335624e-06, + "loss": 0.3258, + "step": 11222 + }, + { + "epoch": 0.33, + "grad_norm": 1.4235488964744971, + "learning_rate": 7.879562968741662e-06, + "loss": 0.3454, + "step": 11223 + }, + { + "epoch": 0.33, + "grad_norm": 2.3949070959490224, + "learning_rate": 7.879178964735528e-06, + "loss": 0.3762, + "step": 11224 + }, + { + "epoch": 0.33, + "grad_norm": 1.5891628419252215, + "learning_rate": 7.878794935320606e-06, + "loss": 0.3448, + "step": 11225 + }, + { + "epoch": 0.33, + "grad_norm": 1.8054418462256465, + "learning_rate": 7.878410880500288e-06, + "loss": 0.3377, + "step": 11226 + }, + { + "epoch": 0.33, + "grad_norm": 0.9622738558397279, + "learning_rate": 7.878026800277961e-06, + "loss": 0.5821, + "step": 11227 + }, + { + "epoch": 0.33, + "grad_norm": 1.3495742583056942, + "learning_rate": 7.877642694657014e-06, + "loss": 0.3318, + "step": 11228 + }, + { + "epoch": 0.33, + "grad_norm": 1.341167747096644, + "learning_rate": 7.877258563640841e-06, + "loss": 0.356, + "step": 11229 + }, + { + "epoch": 0.33, + "grad_norm": 1.4143467090368909, + "learning_rate": 7.876874407232829e-06, + "loss": 0.3406, + "step": 11230 + }, + { + "epoch": 0.33, + "grad_norm": 1.319666914073028, + "learning_rate": 7.876490225436366e-06, + "loss": 0.3629, + "step": 11231 + }, + { + "epoch": 0.33, + "grad_norm": 1.4809815991017956, + "learning_rate": 7.876106018254848e-06, + "loss": 0.3244, + "step": 11232 + }, + { + "epoch": 0.33, + "grad_norm": 1.840754311765362, + "learning_rate": 7.87572178569166e-06, + "loss": 0.3105, + "step": 11233 + }, + { + "epoch": 0.33, + "grad_norm": 1.3452197012398543, + "learning_rate": 7.875337527750197e-06, + "loss": 0.3118, + "step": 11234 + }, + { + "epoch": 0.33, + "grad_norm": 1.3740567404085955, + "learning_rate": 7.874953244433847e-06, + "loss": 0.3434, + "step": 11235 + }, + { + "epoch": 0.33, + "grad_norm": 1.2719000802910214, + "learning_rate": 7.874568935746003e-06, + "loss": 0.334, + "step": 11236 + }, + { + "epoch": 0.33, + "grad_norm": 1.3228929497066062, + "learning_rate": 7.874184601690059e-06, + "loss": 0.3396, + "step": 11237 + }, + { + "epoch": 0.33, + "grad_norm": 1.3849312087377363, + "learning_rate": 7.873800242269401e-06, + "loss": 0.3416, + "step": 11238 + }, + { + "epoch": 0.33, + "grad_norm": 1.3413216913755348, + "learning_rate": 7.873415857487424e-06, + "loss": 0.3785, + "step": 11239 + }, + { + "epoch": 0.33, + "grad_norm": 1.3116453991744408, + "learning_rate": 7.87303144734752e-06, + "loss": 0.3352, + "step": 11240 + }, + { + "epoch": 0.33, + "grad_norm": 1.3975974454559132, + "learning_rate": 7.872647011853083e-06, + "loss": 0.3501, + "step": 11241 + }, + { + "epoch": 0.33, + "grad_norm": 1.2556871905932991, + "learning_rate": 7.872262551007504e-06, + "loss": 0.3229, + "step": 11242 + }, + { + "epoch": 0.33, + "grad_norm": 1.4533954708016914, + "learning_rate": 7.871878064814177e-06, + "loss": 0.3209, + "step": 11243 + }, + { + "epoch": 0.33, + "grad_norm": 1.4535821993527671, + "learning_rate": 7.871493553276492e-06, + "loss": 0.3289, + "step": 11244 + }, + { + "epoch": 0.33, + "grad_norm": 1.663886200319414, + "learning_rate": 7.871109016397846e-06, + "loss": 0.3528, + "step": 11245 + }, + { + "epoch": 0.33, + "grad_norm": 1.3283361582719817, + "learning_rate": 7.87072445418163e-06, + "loss": 0.3368, + "step": 11246 + }, + { + "epoch": 0.33, + "grad_norm": 1.2531706174811155, + "learning_rate": 7.87033986663124e-06, + "loss": 0.3483, + "step": 11247 + }, + { + "epoch": 0.33, + "grad_norm": 1.9870335093555842, + "learning_rate": 7.869955253750069e-06, + "loss": 0.3448, + "step": 11248 + }, + { + "epoch": 0.33, + "grad_norm": 1.5580063768417876, + "learning_rate": 7.86957061554151e-06, + "loss": 0.37, + "step": 11249 + }, + { + "epoch": 0.33, + "grad_norm": 1.351365146156303, + "learning_rate": 7.869185952008959e-06, + "loss": 0.3174, + "step": 11250 + }, + { + "epoch": 0.33, + "grad_norm": 1.4599427362484656, + "learning_rate": 7.868801263155809e-06, + "loss": 0.3495, + "step": 11251 + }, + { + "epoch": 0.33, + "grad_norm": 1.342070928760409, + "learning_rate": 7.868416548985458e-06, + "loss": 0.3403, + "step": 11252 + }, + { + "epoch": 0.33, + "grad_norm": 1.2838651489440585, + "learning_rate": 7.868031809501298e-06, + "loss": 0.344, + "step": 11253 + }, + { + "epoch": 0.33, + "grad_norm": 1.3377068198819488, + "learning_rate": 7.867647044706725e-06, + "loss": 0.3394, + "step": 11254 + }, + { + "epoch": 0.33, + "grad_norm": 1.5515090330291348, + "learning_rate": 7.867262254605137e-06, + "loss": 0.3392, + "step": 11255 + }, + { + "epoch": 0.33, + "grad_norm": 1.4037177464567476, + "learning_rate": 7.866877439199927e-06, + "loss": 0.3935, + "step": 11256 + }, + { + "epoch": 0.33, + "grad_norm": 1.4861465205497137, + "learning_rate": 7.86649259849449e-06, + "loss": 0.3425, + "step": 11257 + }, + { + "epoch": 0.33, + "grad_norm": 1.3968923616044373, + "learning_rate": 7.866107732492226e-06, + "loss": 0.34, + "step": 11258 + }, + { + "epoch": 0.33, + "grad_norm": 1.3456594187554478, + "learning_rate": 7.86572284119653e-06, + "loss": 0.3523, + "step": 11259 + }, + { + "epoch": 0.33, + "grad_norm": 1.4483824840727706, + "learning_rate": 7.865337924610796e-06, + "loss": 0.3651, + "step": 11260 + }, + { + "epoch": 0.33, + "grad_norm": 2.281658947167208, + "learning_rate": 7.864952982738424e-06, + "loss": 0.3579, + "step": 11261 + }, + { + "epoch": 0.33, + "grad_norm": 1.248386950999073, + "learning_rate": 7.86456801558281e-06, + "loss": 0.3465, + "step": 11262 + }, + { + "epoch": 0.33, + "grad_norm": 1.3278252407646736, + "learning_rate": 7.86418302314735e-06, + "loss": 0.3348, + "step": 11263 + }, + { + "epoch": 0.33, + "grad_norm": 1.5096056656907562, + "learning_rate": 7.863798005435446e-06, + "loss": 0.3472, + "step": 11264 + }, + { + "epoch": 0.33, + "grad_norm": 1.4316203548408224, + "learning_rate": 7.863412962450491e-06, + "loss": 0.3626, + "step": 11265 + }, + { + "epoch": 0.33, + "grad_norm": 1.4014267263856615, + "learning_rate": 7.863027894195885e-06, + "loss": 0.3296, + "step": 11266 + }, + { + "epoch": 0.33, + "grad_norm": 1.636019136734101, + "learning_rate": 7.862642800675027e-06, + "loss": 0.3395, + "step": 11267 + }, + { + "epoch": 0.33, + "grad_norm": 1.564893809729909, + "learning_rate": 7.862257681891314e-06, + "loss": 0.3397, + "step": 11268 + }, + { + "epoch": 0.33, + "grad_norm": 1.3146996197999319, + "learning_rate": 7.861872537848146e-06, + "loss": 0.3417, + "step": 11269 + }, + { + "epoch": 0.33, + "grad_norm": 1.3887568265062864, + "learning_rate": 7.86148736854892e-06, + "loss": 0.3469, + "step": 11270 + }, + { + "epoch": 0.33, + "grad_norm": 1.2565623451816783, + "learning_rate": 7.861102173997034e-06, + "loss": 0.3574, + "step": 11271 + }, + { + "epoch": 0.33, + "grad_norm": 1.78126945946478, + "learning_rate": 7.860716954195894e-06, + "loss": 0.34, + "step": 11272 + }, + { + "epoch": 0.33, + "grad_norm": 2.009700222518084, + "learning_rate": 7.860331709148894e-06, + "loss": 0.3412, + "step": 11273 + }, + { + "epoch": 0.33, + "grad_norm": 1.2644006491360769, + "learning_rate": 7.859946438859434e-06, + "loss": 0.337, + "step": 11274 + }, + { + "epoch": 0.33, + "grad_norm": 1.296408494315332, + "learning_rate": 7.859561143330916e-06, + "loss": 0.3258, + "step": 11275 + }, + { + "epoch": 0.33, + "grad_norm": 1.5073400107707475, + "learning_rate": 7.859175822566737e-06, + "loss": 0.3306, + "step": 11276 + }, + { + "epoch": 0.33, + "grad_norm": 1.4191433502955735, + "learning_rate": 7.858790476570302e-06, + "loss": 0.3497, + "step": 11277 + }, + { + "epoch": 0.33, + "grad_norm": 1.324139534907289, + "learning_rate": 7.858405105345006e-06, + "loss": 0.3523, + "step": 11278 + }, + { + "epoch": 0.33, + "grad_norm": 1.2155038814810863, + "learning_rate": 7.858019708894257e-06, + "loss": 0.3342, + "step": 11279 + }, + { + "epoch": 0.33, + "grad_norm": 1.6345946782657057, + "learning_rate": 7.85763428722145e-06, + "loss": 0.3508, + "step": 11280 + }, + { + "epoch": 0.33, + "grad_norm": 1.6517010836851609, + "learning_rate": 7.857248840329989e-06, + "loss": 0.3553, + "step": 11281 + }, + { + "epoch": 0.33, + "grad_norm": 1.4170848721058837, + "learning_rate": 7.856863368223277e-06, + "loss": 0.3716, + "step": 11282 + }, + { + "epoch": 0.33, + "grad_norm": 1.5165838978156765, + "learning_rate": 7.856477870904711e-06, + "loss": 0.3247, + "step": 11283 + }, + { + "epoch": 0.33, + "grad_norm": 1.5956077317522777, + "learning_rate": 7.8560923483777e-06, + "loss": 0.3621, + "step": 11284 + }, + { + "epoch": 0.33, + "grad_norm": 1.2694390674763176, + "learning_rate": 7.85570680064564e-06, + "loss": 0.3345, + "step": 11285 + }, + { + "epoch": 0.33, + "grad_norm": 1.325539937428255, + "learning_rate": 7.855321227711936e-06, + "loss": 0.3239, + "step": 11286 + }, + { + "epoch": 0.33, + "grad_norm": 2.8671095124773074, + "learning_rate": 7.854935629579992e-06, + "loss": 0.3681, + "step": 11287 + }, + { + "epoch": 0.33, + "grad_norm": 1.8131503269037617, + "learning_rate": 7.854550006253208e-06, + "loss": 0.357, + "step": 11288 + }, + { + "epoch": 0.33, + "grad_norm": 1.2640767362777034, + "learning_rate": 7.854164357734989e-06, + "loss": 0.3427, + "step": 11289 + }, + { + "epoch": 0.33, + "grad_norm": 1.7220045827406607, + "learning_rate": 7.853778684028735e-06, + "loss": 0.3426, + "step": 11290 + }, + { + "epoch": 0.33, + "grad_norm": 1.3545341362083985, + "learning_rate": 7.853392985137856e-06, + "loss": 0.3339, + "step": 11291 + }, + { + "epoch": 0.33, + "grad_norm": 1.3759094390391164, + "learning_rate": 7.853007261065751e-06, + "loss": 0.346, + "step": 11292 + }, + { + "epoch": 0.33, + "grad_norm": 1.3167920965697357, + "learning_rate": 7.852621511815825e-06, + "loss": 0.3582, + "step": 11293 + }, + { + "epoch": 0.33, + "grad_norm": 3.3639528033263044, + "learning_rate": 7.852235737391483e-06, + "loss": 0.3263, + "step": 11294 + }, + { + "epoch": 0.33, + "grad_norm": 1.4573995271764297, + "learning_rate": 7.85184993779613e-06, + "loss": 0.324, + "step": 11295 + }, + { + "epoch": 0.33, + "grad_norm": 1.598306329419259, + "learning_rate": 7.851464113033166e-06, + "loss": 0.3256, + "step": 11296 + }, + { + "epoch": 0.33, + "grad_norm": 1.3316770228340746, + "learning_rate": 7.851078263106004e-06, + "loss": 0.3371, + "step": 11297 + }, + { + "epoch": 0.33, + "grad_norm": 1.2131438773807977, + "learning_rate": 7.850692388018042e-06, + "loss": 0.3444, + "step": 11298 + }, + { + "epoch": 0.33, + "grad_norm": 1.5383785197923385, + "learning_rate": 7.85030648777269e-06, + "loss": 0.3402, + "step": 11299 + }, + { + "epoch": 0.33, + "grad_norm": 1.6185834515725868, + "learning_rate": 7.849920562373352e-06, + "loss": 0.3738, + "step": 11300 + }, + { + "epoch": 0.33, + "grad_norm": 4.082005031422438, + "learning_rate": 7.84953461182343e-06, + "loss": 0.3474, + "step": 11301 + }, + { + "epoch": 0.33, + "grad_norm": 1.358215523255849, + "learning_rate": 7.849148636126336e-06, + "loss": 0.3491, + "step": 11302 + }, + { + "epoch": 0.33, + "grad_norm": 1.3219522789683438, + "learning_rate": 7.848762635285472e-06, + "loss": 0.3299, + "step": 11303 + }, + { + "epoch": 0.33, + "grad_norm": 0.9397845654711809, + "learning_rate": 7.848376609304249e-06, + "loss": 0.6321, + "step": 11304 + }, + { + "epoch": 0.33, + "grad_norm": 1.4068531816564867, + "learning_rate": 7.847990558186068e-06, + "loss": 0.3216, + "step": 11305 + }, + { + "epoch": 0.33, + "grad_norm": 1.8152678788564336, + "learning_rate": 7.847604481934339e-06, + "loss": 0.3403, + "step": 11306 + }, + { + "epoch": 0.33, + "grad_norm": 1.4191117358491254, + "learning_rate": 7.84721838055247e-06, + "loss": 0.3552, + "step": 11307 + }, + { + "epoch": 0.33, + "grad_norm": 1.8115049957562195, + "learning_rate": 7.846832254043866e-06, + "loss": 0.3449, + "step": 11308 + }, + { + "epoch": 0.33, + "grad_norm": 1.3964191666187413, + "learning_rate": 7.846446102411937e-06, + "loss": 0.342, + "step": 11309 + }, + { + "epoch": 0.33, + "grad_norm": 1.4188936856513508, + "learning_rate": 7.84605992566009e-06, + "loss": 0.3572, + "step": 11310 + }, + { + "epoch": 0.33, + "grad_norm": 1.7575129520146233, + "learning_rate": 7.845673723791732e-06, + "loss": 0.3404, + "step": 11311 + }, + { + "epoch": 0.33, + "grad_norm": 1.6959473081562302, + "learning_rate": 7.84528749681027e-06, + "loss": 0.3536, + "step": 11312 + }, + { + "epoch": 0.33, + "grad_norm": 1.2091116204305943, + "learning_rate": 7.844901244719116e-06, + "loss": 0.3625, + "step": 11313 + }, + { + "epoch": 0.33, + "grad_norm": 2.379223580739836, + "learning_rate": 7.844514967521679e-06, + "loss": 0.3427, + "step": 11314 + }, + { + "epoch": 0.33, + "grad_norm": 1.2958440606374975, + "learning_rate": 7.844128665221364e-06, + "loss": 0.3666, + "step": 11315 + }, + { + "epoch": 0.33, + "grad_norm": 1.2829659328858734, + "learning_rate": 7.84374233782158e-06, + "loss": 0.3305, + "step": 11316 + }, + { + "epoch": 0.33, + "grad_norm": 1.4099128981048117, + "learning_rate": 7.84335598532574e-06, + "loss": 0.3433, + "step": 11317 + }, + { + "epoch": 0.33, + "grad_norm": 1.3230245155170572, + "learning_rate": 7.842969607737252e-06, + "loss": 0.3229, + "step": 11318 + }, + { + "epoch": 0.33, + "grad_norm": 1.5804560960915162, + "learning_rate": 7.842583205059526e-06, + "loss": 0.3936, + "step": 11319 + }, + { + "epoch": 0.33, + "grad_norm": 1.576958578257033, + "learning_rate": 7.842196777295972e-06, + "loss": 0.3455, + "step": 11320 + }, + { + "epoch": 0.33, + "grad_norm": 1.5412886915537527, + "learning_rate": 7.84181032445e-06, + "loss": 0.3289, + "step": 11321 + }, + { + "epoch": 0.33, + "grad_norm": 1.2823474183665697, + "learning_rate": 7.841423846525018e-06, + "loss": 0.3572, + "step": 11322 + }, + { + "epoch": 0.33, + "grad_norm": 1.3391085042494282, + "learning_rate": 7.84103734352444e-06, + "loss": 0.3568, + "step": 11323 + }, + { + "epoch": 0.33, + "grad_norm": 1.5037190368411175, + "learning_rate": 7.840650815451675e-06, + "loss": 0.3333, + "step": 11324 + }, + { + "epoch": 0.33, + "grad_norm": 1.3925746286959455, + "learning_rate": 7.840264262310137e-06, + "loss": 0.3436, + "step": 11325 + }, + { + "epoch": 0.33, + "grad_norm": 1.2427243869624345, + "learning_rate": 7.839877684103237e-06, + "loss": 0.3383, + "step": 11326 + }, + { + "epoch": 0.33, + "grad_norm": 1.4301743334643633, + "learning_rate": 7.839491080834382e-06, + "loss": 0.337, + "step": 11327 + }, + { + "epoch": 0.33, + "grad_norm": 1.5603241675259694, + "learning_rate": 7.839104452506988e-06, + "loss": 0.3762, + "step": 11328 + }, + { + "epoch": 0.33, + "grad_norm": 3.884039609928281, + "learning_rate": 7.838717799124464e-06, + "loss": 0.3356, + "step": 11329 + }, + { + "epoch": 0.33, + "grad_norm": 1.6108234033054591, + "learning_rate": 7.838331120690225e-06, + "loss": 0.3554, + "step": 11330 + }, + { + "epoch": 0.33, + "grad_norm": 1.4113572757012631, + "learning_rate": 7.837944417207683e-06, + "loss": 0.3409, + "step": 11331 + }, + { + "epoch": 0.33, + "grad_norm": 1.5737473068894798, + "learning_rate": 7.837557688680251e-06, + "loss": 0.3305, + "step": 11332 + }, + { + "epoch": 0.33, + "grad_norm": 1.482550455496487, + "learning_rate": 7.837170935111338e-06, + "loss": 0.346, + "step": 11333 + }, + { + "epoch": 0.33, + "grad_norm": 1.5472587824618664, + "learning_rate": 7.836784156504362e-06, + "loss": 0.3422, + "step": 11334 + }, + { + "epoch": 0.33, + "grad_norm": 1.3912347772698748, + "learning_rate": 7.836397352862733e-06, + "loss": 0.3697, + "step": 11335 + }, + { + "epoch": 0.33, + "grad_norm": 1.4566204767050128, + "learning_rate": 7.836010524189868e-06, + "loss": 0.3314, + "step": 11336 + }, + { + "epoch": 0.33, + "grad_norm": 1.3548144766273265, + "learning_rate": 7.835623670489176e-06, + "loss": 0.3364, + "step": 11337 + }, + { + "epoch": 0.33, + "grad_norm": 1.3326422436620473, + "learning_rate": 7.835236791764076e-06, + "loss": 0.3701, + "step": 11338 + }, + { + "epoch": 0.33, + "grad_norm": 1.4724397323535101, + "learning_rate": 7.834849888017979e-06, + "loss": 0.3402, + "step": 11339 + }, + { + "epoch": 0.33, + "grad_norm": 1.8056363900796055, + "learning_rate": 7.8344629592543e-06, + "loss": 0.3293, + "step": 11340 + }, + { + "epoch": 0.33, + "grad_norm": 1.3310751136821322, + "learning_rate": 7.834076005476454e-06, + "loss": 0.3257, + "step": 11341 + }, + { + "epoch": 0.33, + "grad_norm": 1.477796045342004, + "learning_rate": 7.833689026687856e-06, + "loss": 0.3416, + "step": 11342 + }, + { + "epoch": 0.33, + "grad_norm": 1.8035831616847382, + "learning_rate": 7.833302022891921e-06, + "loss": 0.3331, + "step": 11343 + }, + { + "epoch": 0.33, + "grad_norm": 1.3962824890476468, + "learning_rate": 7.832914994092064e-06, + "loss": 0.3466, + "step": 11344 + }, + { + "epoch": 0.33, + "grad_norm": 1.4717177414843199, + "learning_rate": 7.8325279402917e-06, + "loss": 0.3183, + "step": 11345 + }, + { + "epoch": 0.33, + "grad_norm": 1.3434569287945388, + "learning_rate": 7.832140861494248e-06, + "loss": 0.3284, + "step": 11346 + }, + { + "epoch": 0.33, + "grad_norm": 1.3301522839123525, + "learning_rate": 7.83175375770312e-06, + "loss": 0.3497, + "step": 11347 + }, + { + "epoch": 0.33, + "grad_norm": 1.4107715570101123, + "learning_rate": 7.831366628921733e-06, + "loss": 0.3661, + "step": 11348 + }, + { + "epoch": 0.33, + "grad_norm": 1.3327315878547803, + "learning_rate": 7.830979475153504e-06, + "loss": 0.3411, + "step": 11349 + }, + { + "epoch": 0.33, + "grad_norm": 1.5286175064627103, + "learning_rate": 7.83059229640185e-06, + "loss": 0.3239, + "step": 11350 + }, + { + "epoch": 0.33, + "grad_norm": 1.383867287472983, + "learning_rate": 7.830205092670189e-06, + "loss": 0.3431, + "step": 11351 + }, + { + "epoch": 0.33, + "grad_norm": 1.4251199732973914, + "learning_rate": 7.829817863961935e-06, + "loss": 0.3425, + "step": 11352 + }, + { + "epoch": 0.33, + "grad_norm": 1.3502130588877541, + "learning_rate": 7.829430610280507e-06, + "loss": 0.33, + "step": 11353 + }, + { + "epoch": 0.33, + "grad_norm": 1.39861098338975, + "learning_rate": 7.829043331629323e-06, + "loss": 0.3447, + "step": 11354 + }, + { + "epoch": 0.33, + "grad_norm": 1.3760428642785323, + "learning_rate": 7.828656028011801e-06, + "loss": 0.3489, + "step": 11355 + }, + { + "epoch": 0.33, + "grad_norm": 1.7636211156274486, + "learning_rate": 7.828268699431358e-06, + "loss": 0.362, + "step": 11356 + }, + { + "epoch": 0.33, + "grad_norm": 1.3872288093605878, + "learning_rate": 7.827881345891411e-06, + "loss": 0.3456, + "step": 11357 + }, + { + "epoch": 0.33, + "grad_norm": 1.3537965853678218, + "learning_rate": 7.82749396739538e-06, + "loss": 0.3379, + "step": 11358 + }, + { + "epoch": 0.33, + "grad_norm": 1.367365338925557, + "learning_rate": 7.827106563946684e-06, + "loss": 0.352, + "step": 11359 + }, + { + "epoch": 0.33, + "grad_norm": 0.9923592041800793, + "learning_rate": 7.826719135548742e-06, + "loss": 0.5574, + "step": 11360 + }, + { + "epoch": 0.33, + "grad_norm": 1.327550622947669, + "learning_rate": 7.82633168220497e-06, + "loss": 0.3275, + "step": 11361 + }, + { + "epoch": 0.33, + "grad_norm": 2.382270805537581, + "learning_rate": 7.825944203918792e-06, + "loss": 0.3547, + "step": 11362 + }, + { + "epoch": 0.33, + "grad_norm": 1.3551866475575478, + "learning_rate": 7.825556700693624e-06, + "loss": 0.3374, + "step": 11363 + }, + { + "epoch": 0.33, + "grad_norm": 1.6180995682516182, + "learning_rate": 7.825169172532888e-06, + "loss": 0.3438, + "step": 11364 + }, + { + "epoch": 0.33, + "grad_norm": 2.4731323002617547, + "learning_rate": 7.82478161944e-06, + "loss": 0.342, + "step": 11365 + }, + { + "epoch": 0.33, + "grad_norm": 1.3702906805545834, + "learning_rate": 7.824394041418384e-06, + "loss": 0.3409, + "step": 11366 + }, + { + "epoch": 0.33, + "grad_norm": 1.5033688587126675, + "learning_rate": 7.824006438471458e-06, + "loss": 0.3538, + "step": 11367 + }, + { + "epoch": 0.33, + "grad_norm": 1.5272297645964068, + "learning_rate": 7.823618810602646e-06, + "loss": 0.3644, + "step": 11368 + }, + { + "epoch": 0.33, + "grad_norm": 1.4104416780778528, + "learning_rate": 7.823231157815367e-06, + "loss": 0.3763, + "step": 11369 + }, + { + "epoch": 0.33, + "grad_norm": 1.721652424763735, + "learning_rate": 7.82284348011304e-06, + "loss": 0.3526, + "step": 11370 + }, + { + "epoch": 0.33, + "grad_norm": 1.2913413843296686, + "learning_rate": 7.82245577749909e-06, + "loss": 0.3435, + "step": 11371 + }, + { + "epoch": 0.33, + "grad_norm": 1.5015393169676197, + "learning_rate": 7.822068049976934e-06, + "loss": 0.3251, + "step": 11372 + }, + { + "epoch": 0.33, + "grad_norm": 1.7740021013326817, + "learning_rate": 7.821680297549998e-06, + "loss": 0.3661, + "step": 11373 + }, + { + "epoch": 0.33, + "grad_norm": 1.5493070486330647, + "learning_rate": 7.821292520221702e-06, + "loss": 0.3794, + "step": 11374 + }, + { + "epoch": 0.33, + "grad_norm": 1.5431759807426857, + "learning_rate": 7.820904717995468e-06, + "loss": 0.3558, + "step": 11375 + }, + { + "epoch": 0.33, + "grad_norm": 1.9503985954197283, + "learning_rate": 7.820516890874718e-06, + "loss": 0.3409, + "step": 11376 + }, + { + "epoch": 0.33, + "grad_norm": 1.4711344408674123, + "learning_rate": 7.820129038862875e-06, + "loss": 0.3571, + "step": 11377 + }, + { + "epoch": 0.33, + "grad_norm": 1.3192822144876308, + "learning_rate": 7.819741161963361e-06, + "loss": 0.3436, + "step": 11378 + }, + { + "epoch": 0.33, + "grad_norm": 1.4835308953527806, + "learning_rate": 7.819353260179603e-06, + "loss": 0.3398, + "step": 11379 + }, + { + "epoch": 0.33, + "grad_norm": 1.4154449464126073, + "learning_rate": 7.818965333515018e-06, + "loss": 0.3295, + "step": 11380 + }, + { + "epoch": 0.33, + "grad_norm": 1.4266582391304325, + "learning_rate": 7.818577381973035e-06, + "loss": 0.3232, + "step": 11381 + }, + { + "epoch": 0.33, + "grad_norm": 1.480619330045077, + "learning_rate": 7.818189405557075e-06, + "loss": 0.3408, + "step": 11382 + }, + { + "epoch": 0.33, + "grad_norm": 1.2224006926889266, + "learning_rate": 7.817801404270561e-06, + "loss": 0.3435, + "step": 11383 + }, + { + "epoch": 0.33, + "grad_norm": 1.465330712496723, + "learning_rate": 7.817413378116918e-06, + "loss": 0.3474, + "step": 11384 + }, + { + "epoch": 0.33, + "grad_norm": 1.42233837816896, + "learning_rate": 7.817025327099574e-06, + "loss": 0.3505, + "step": 11385 + }, + { + "epoch": 0.33, + "grad_norm": 1.5685793449498484, + "learning_rate": 7.816637251221948e-06, + "loss": 0.3264, + "step": 11386 + }, + { + "epoch": 0.33, + "grad_norm": 1.411367080527796, + "learning_rate": 7.816249150487467e-06, + "loss": 0.3715, + "step": 11387 + }, + { + "epoch": 0.33, + "grad_norm": 1.422873131958085, + "learning_rate": 7.815861024899556e-06, + "loss": 0.3428, + "step": 11388 + }, + { + "epoch": 0.33, + "grad_norm": 1.3245930118110578, + "learning_rate": 7.81547287446164e-06, + "loss": 0.3427, + "step": 11389 + }, + { + "epoch": 0.33, + "grad_norm": 1.689185465410745, + "learning_rate": 7.815084699177145e-06, + "loss": 0.3408, + "step": 11390 + }, + { + "epoch": 0.33, + "grad_norm": 1.294501385676415, + "learning_rate": 7.814696499049496e-06, + "loss": 0.3497, + "step": 11391 + }, + { + "epoch": 0.33, + "grad_norm": 0.9904386830957067, + "learning_rate": 7.814308274082119e-06, + "loss": 0.5982, + "step": 11392 + }, + { + "epoch": 0.33, + "grad_norm": 1.321833300782887, + "learning_rate": 7.81392002427844e-06, + "loss": 0.3365, + "step": 11393 + }, + { + "epoch": 0.33, + "grad_norm": 1.5074324061665174, + "learning_rate": 7.813531749641885e-06, + "loss": 0.3993, + "step": 11394 + }, + { + "epoch": 0.33, + "grad_norm": 1.4563894952244913, + "learning_rate": 7.813143450175883e-06, + "loss": 0.3393, + "step": 11395 + }, + { + "epoch": 0.33, + "grad_norm": 1.604116675163695, + "learning_rate": 7.812755125883859e-06, + "loss": 0.3206, + "step": 11396 + }, + { + "epoch": 0.33, + "grad_norm": 1.3971115927124589, + "learning_rate": 7.812366776769239e-06, + "loss": 0.3553, + "step": 11397 + }, + { + "epoch": 0.33, + "grad_norm": 1.2949841111694949, + "learning_rate": 7.81197840283545e-06, + "loss": 0.3452, + "step": 11398 + }, + { + "epoch": 0.33, + "grad_norm": 1.9757086055380588, + "learning_rate": 7.811590004085923e-06, + "loss": 0.3437, + "step": 11399 + }, + { + "epoch": 0.33, + "grad_norm": 1.3012482562768444, + "learning_rate": 7.81120158052408e-06, + "loss": 0.3296, + "step": 11400 + }, + { + "epoch": 0.33, + "grad_norm": 1.504734373572095, + "learning_rate": 7.810813132153354e-06, + "loss": 0.3459, + "step": 11401 + }, + { + "epoch": 0.33, + "grad_norm": 1.3747266741712523, + "learning_rate": 7.810424658977169e-06, + "loss": 0.3416, + "step": 11402 + }, + { + "epoch": 0.33, + "grad_norm": 1.3845072937912422, + "learning_rate": 7.810036160998957e-06, + "loss": 0.3452, + "step": 11403 + }, + { + "epoch": 0.33, + "grad_norm": 1.6223787289021139, + "learning_rate": 7.809647638222144e-06, + "loss": 0.355, + "step": 11404 + }, + { + "epoch": 0.33, + "grad_norm": 1.511155520748403, + "learning_rate": 7.80925909065016e-06, + "loss": 0.3721, + "step": 11405 + }, + { + "epoch": 0.33, + "grad_norm": 1.6890023784227168, + "learning_rate": 7.808870518286432e-06, + "loss": 0.3413, + "step": 11406 + }, + { + "epoch": 0.33, + "grad_norm": 1.2771662413897922, + "learning_rate": 7.808481921134393e-06, + "loss": 0.3511, + "step": 11407 + }, + { + "epoch": 0.33, + "grad_norm": 1.933080253182895, + "learning_rate": 7.808093299197466e-06, + "loss": 0.3547, + "step": 11408 + }, + { + "epoch": 0.33, + "grad_norm": 1.2247780634434133, + "learning_rate": 7.807704652479087e-06, + "loss": 0.3346, + "step": 11409 + }, + { + "epoch": 0.33, + "grad_norm": 1.267217410235215, + "learning_rate": 7.807315980982683e-06, + "loss": 0.339, + "step": 11410 + }, + { + "epoch": 0.33, + "grad_norm": 1.6200270262652576, + "learning_rate": 7.806927284711686e-06, + "loss": 0.354, + "step": 11411 + }, + { + "epoch": 0.33, + "grad_norm": 1.833520585431327, + "learning_rate": 7.806538563669523e-06, + "loss": 0.3334, + "step": 11412 + }, + { + "epoch": 0.33, + "grad_norm": 1.4583239906548944, + "learning_rate": 7.806149817859625e-06, + "loss": 0.3346, + "step": 11413 + }, + { + "epoch": 0.33, + "grad_norm": 1.3791571908199531, + "learning_rate": 7.805761047285425e-06, + "loss": 0.3503, + "step": 11414 + }, + { + "epoch": 0.33, + "grad_norm": 1.3310244116374577, + "learning_rate": 7.805372251950352e-06, + "loss": 0.344, + "step": 11415 + }, + { + "epoch": 0.33, + "grad_norm": 4.05185108540576, + "learning_rate": 7.804983431857838e-06, + "loss": 0.3257, + "step": 11416 + }, + { + "epoch": 0.33, + "grad_norm": 1.763439552322988, + "learning_rate": 7.804594587011314e-06, + "loss": 0.3724, + "step": 11417 + }, + { + "epoch": 0.33, + "grad_norm": 1.3662417039029164, + "learning_rate": 7.804205717414212e-06, + "loss": 0.3372, + "step": 11418 + }, + { + "epoch": 0.33, + "grad_norm": 1.2731313851415331, + "learning_rate": 7.803816823069964e-06, + "loss": 0.3369, + "step": 11419 + }, + { + "epoch": 0.33, + "grad_norm": 1.4658722749305868, + "learning_rate": 7.803427903981999e-06, + "loss": 0.333, + "step": 11420 + }, + { + "epoch": 0.33, + "grad_norm": 1.327262395325209, + "learning_rate": 7.803038960153755e-06, + "loss": 0.3344, + "step": 11421 + }, + { + "epoch": 0.33, + "grad_norm": 1.2929870115717097, + "learning_rate": 7.802649991588658e-06, + "loss": 0.3106, + "step": 11422 + }, + { + "epoch": 0.33, + "grad_norm": 2.469914134772053, + "learning_rate": 7.802260998290145e-06, + "loss": 0.3519, + "step": 11423 + }, + { + "epoch": 0.33, + "grad_norm": 1.3879823899703223, + "learning_rate": 7.801871980261648e-06, + "loss": 0.3559, + "step": 11424 + }, + { + "epoch": 0.33, + "grad_norm": 1.4279522343597375, + "learning_rate": 7.801482937506599e-06, + "loss": 0.3914, + "step": 11425 + }, + { + "epoch": 0.33, + "grad_norm": 1.394477610460068, + "learning_rate": 7.801093870028433e-06, + "loss": 0.3327, + "step": 11426 + }, + { + "epoch": 0.33, + "grad_norm": 1.3934073205130737, + "learning_rate": 7.800704777830583e-06, + "loss": 0.3567, + "step": 11427 + }, + { + "epoch": 0.33, + "grad_norm": 1.5672215448912838, + "learning_rate": 7.800315660916481e-06, + "loss": 0.3365, + "step": 11428 + }, + { + "epoch": 0.33, + "grad_norm": 1.273499545652384, + "learning_rate": 7.799926519289563e-06, + "loss": 0.3388, + "step": 11429 + }, + { + "epoch": 0.33, + "grad_norm": 1.5973843999061612, + "learning_rate": 7.79953735295326e-06, + "loss": 0.3248, + "step": 11430 + }, + { + "epoch": 0.33, + "grad_norm": 1.8058398812983165, + "learning_rate": 7.799148161911013e-06, + "loss": 0.3609, + "step": 11431 + }, + { + "epoch": 0.33, + "grad_norm": 1.2580498562901863, + "learning_rate": 7.798758946166249e-06, + "loss": 0.339, + "step": 11432 + }, + { + "epoch": 0.33, + "grad_norm": 1.2748822819981402, + "learning_rate": 7.798369705722407e-06, + "loss": 0.3321, + "step": 11433 + }, + { + "epoch": 0.33, + "grad_norm": 1.5650644355272245, + "learning_rate": 7.797980440582921e-06, + "loss": 0.3498, + "step": 11434 + }, + { + "epoch": 0.33, + "grad_norm": 1.3798944442671837, + "learning_rate": 7.797591150751228e-06, + "loss": 0.3409, + "step": 11435 + }, + { + "epoch": 0.33, + "grad_norm": 1.5892758639285274, + "learning_rate": 7.797201836230761e-06, + "loss": 0.3311, + "step": 11436 + }, + { + "epoch": 0.33, + "grad_norm": 1.3247225198337338, + "learning_rate": 7.796812497024957e-06, + "loss": 0.3329, + "step": 11437 + }, + { + "epoch": 0.33, + "grad_norm": 1.4372076733646617, + "learning_rate": 7.796423133137252e-06, + "loss": 0.3567, + "step": 11438 + }, + { + "epoch": 0.33, + "grad_norm": 1.6835266838375542, + "learning_rate": 7.796033744571081e-06, + "loss": 0.3169, + "step": 11439 + }, + { + "epoch": 0.33, + "grad_norm": 1.522983995907781, + "learning_rate": 7.795644331329884e-06, + "loss": 0.3533, + "step": 11440 + }, + { + "epoch": 0.33, + "grad_norm": 1.9998526197353883, + "learning_rate": 7.795254893417092e-06, + "loss": 0.339, + "step": 11441 + }, + { + "epoch": 0.33, + "grad_norm": 1.497357575010212, + "learning_rate": 7.794865430836144e-06, + "loss": 0.358, + "step": 11442 + }, + { + "epoch": 0.33, + "grad_norm": 1.2921610665934993, + "learning_rate": 7.794475943590478e-06, + "loss": 0.3298, + "step": 11443 + }, + { + "epoch": 0.33, + "grad_norm": 1.2472187521196598, + "learning_rate": 7.794086431683532e-06, + "loss": 0.3474, + "step": 11444 + }, + { + "epoch": 0.33, + "grad_norm": 1.4128492150867227, + "learning_rate": 7.793696895118742e-06, + "loss": 0.3329, + "step": 11445 + }, + { + "epoch": 0.33, + "grad_norm": 1.0920957483121485, + "learning_rate": 7.793307333899545e-06, + "loss": 0.6546, + "step": 11446 + }, + { + "epoch": 0.33, + "grad_norm": 1.9724913602448586, + "learning_rate": 7.792917748029381e-06, + "loss": 0.3412, + "step": 11447 + }, + { + "epoch": 0.33, + "grad_norm": 0.9078508687627849, + "learning_rate": 7.792528137511686e-06, + "loss": 0.5627, + "step": 11448 + }, + { + "epoch": 0.33, + "grad_norm": 1.3465760839732863, + "learning_rate": 7.7921385023499e-06, + "loss": 0.3428, + "step": 11449 + }, + { + "epoch": 0.33, + "grad_norm": 0.8958717772147385, + "learning_rate": 7.79174884254746e-06, + "loss": 0.5831, + "step": 11450 + }, + { + "epoch": 0.33, + "grad_norm": 1.4331041138670493, + "learning_rate": 7.791359158107804e-06, + "loss": 0.3444, + "step": 11451 + }, + { + "epoch": 0.33, + "grad_norm": 1.1931236193829704, + "learning_rate": 7.790969449034375e-06, + "loss": 0.3356, + "step": 11452 + }, + { + "epoch": 0.33, + "grad_norm": 1.3919390513792658, + "learning_rate": 7.790579715330609e-06, + "loss": 0.3524, + "step": 11453 + }, + { + "epoch": 0.33, + "grad_norm": 1.2082946705655535, + "learning_rate": 7.790189956999945e-06, + "loss": 0.3588, + "step": 11454 + }, + { + "epoch": 0.33, + "grad_norm": 2.141025972508768, + "learning_rate": 7.789800174045826e-06, + "loss": 0.3568, + "step": 11455 + }, + { + "epoch": 0.33, + "grad_norm": 1.3265057130429145, + "learning_rate": 7.789410366471687e-06, + "loss": 0.3611, + "step": 11456 + }, + { + "epoch": 0.33, + "grad_norm": 1.3411066407187553, + "learning_rate": 7.789020534280974e-06, + "loss": 0.3612, + "step": 11457 + }, + { + "epoch": 0.33, + "grad_norm": 1.4958084955622597, + "learning_rate": 7.78863067747712e-06, + "loss": 0.357, + "step": 11458 + }, + { + "epoch": 0.33, + "grad_norm": 1.4563897777887482, + "learning_rate": 7.78824079606357e-06, + "loss": 0.3634, + "step": 11459 + }, + { + "epoch": 0.33, + "grad_norm": 1.7168970413062934, + "learning_rate": 7.787850890043766e-06, + "loss": 0.3436, + "step": 11460 + }, + { + "epoch": 0.33, + "grad_norm": 1.6707706552384143, + "learning_rate": 7.787460959421145e-06, + "loss": 0.3406, + "step": 11461 + }, + { + "epoch": 0.33, + "grad_norm": 1.6278569559612428, + "learning_rate": 7.787071004199152e-06, + "loss": 0.3423, + "step": 11462 + }, + { + "epoch": 0.33, + "grad_norm": 2.242075731782366, + "learning_rate": 7.786681024381224e-06, + "loss": 0.3542, + "step": 11463 + }, + { + "epoch": 0.33, + "grad_norm": 1.4719563964472506, + "learning_rate": 7.786291019970808e-06, + "loss": 0.3372, + "step": 11464 + }, + { + "epoch": 0.33, + "grad_norm": 1.2447524328263135, + "learning_rate": 7.78590099097134e-06, + "loss": 0.3423, + "step": 11465 + }, + { + "epoch": 0.33, + "grad_norm": 1.7053362342133795, + "learning_rate": 7.785510937386268e-06, + "loss": 0.3403, + "step": 11466 + }, + { + "epoch": 0.33, + "grad_norm": 1.385109449986925, + "learning_rate": 7.785120859219028e-06, + "loss": 0.3367, + "step": 11467 + }, + { + "epoch": 0.33, + "grad_norm": 1.6527462797161399, + "learning_rate": 7.784730756473067e-06, + "loss": 0.3329, + "step": 11468 + }, + { + "epoch": 0.33, + "grad_norm": 2.1005829065007036, + "learning_rate": 7.784340629151826e-06, + "loss": 0.3613, + "step": 11469 + }, + { + "epoch": 0.33, + "grad_norm": 1.5558481816155552, + "learning_rate": 7.783950477258748e-06, + "loss": 0.3403, + "step": 11470 + }, + { + "epoch": 0.33, + "grad_norm": 1.3852898546767478, + "learning_rate": 7.783560300797276e-06, + "loss": 0.3499, + "step": 11471 + }, + { + "epoch": 0.33, + "grad_norm": 1.3295228186182393, + "learning_rate": 7.783170099770854e-06, + "loss": 0.3102, + "step": 11472 + }, + { + "epoch": 0.33, + "grad_norm": 1.5046195807519074, + "learning_rate": 7.782779874182923e-06, + "loss": 0.3374, + "step": 11473 + }, + { + "epoch": 0.33, + "grad_norm": 1.2737061485477266, + "learning_rate": 7.782389624036933e-06, + "loss": 0.3065, + "step": 11474 + }, + { + "epoch": 0.33, + "grad_norm": 1.4810876543565288, + "learning_rate": 7.78199934933632e-06, + "loss": 0.3625, + "step": 11475 + }, + { + "epoch": 0.33, + "grad_norm": 1.5001415141547114, + "learning_rate": 7.781609050084533e-06, + "loss": 0.355, + "step": 11476 + }, + { + "epoch": 0.33, + "grad_norm": 1.5060765028220569, + "learning_rate": 7.781218726285014e-06, + "loss": 0.353, + "step": 11477 + }, + { + "epoch": 0.33, + "grad_norm": 1.3151702286621167, + "learning_rate": 7.78082837794121e-06, + "loss": 0.3638, + "step": 11478 + }, + { + "epoch": 0.33, + "grad_norm": 1.4707995036440984, + "learning_rate": 7.780438005056565e-06, + "loss": 0.3858, + "step": 11479 + }, + { + "epoch": 0.33, + "grad_norm": 1.5274163314645628, + "learning_rate": 7.780047607634525e-06, + "loss": 0.3436, + "step": 11480 + }, + { + "epoch": 0.33, + "grad_norm": 1.4945858128238412, + "learning_rate": 7.779657185678531e-06, + "loss": 0.3719, + "step": 11481 + }, + { + "epoch": 0.33, + "grad_norm": 1.5896525389194531, + "learning_rate": 7.779266739192033e-06, + "loss": 0.338, + "step": 11482 + }, + { + "epoch": 0.33, + "grad_norm": 1.327540982482544, + "learning_rate": 7.778876268178477e-06, + "loss": 0.3411, + "step": 11483 + }, + { + "epoch": 0.33, + "grad_norm": 1.367375368586868, + "learning_rate": 7.778485772641304e-06, + "loss": 0.3625, + "step": 11484 + }, + { + "epoch": 0.33, + "grad_norm": 1.2841839190773667, + "learning_rate": 7.778095252583965e-06, + "loss": 0.3787, + "step": 11485 + }, + { + "epoch": 0.33, + "grad_norm": 1.4479631440802676, + "learning_rate": 7.777704708009905e-06, + "loss": 0.3363, + "step": 11486 + }, + { + "epoch": 0.33, + "grad_norm": 1.0929602025625418, + "learning_rate": 7.777314138922569e-06, + "loss": 0.6147, + "step": 11487 + }, + { + "epoch": 0.33, + "grad_norm": 1.4907979707018073, + "learning_rate": 7.776923545325405e-06, + "loss": 0.3691, + "step": 11488 + }, + { + "epoch": 0.33, + "grad_norm": 1.5683407733127783, + "learning_rate": 7.77653292722186e-06, + "loss": 0.3585, + "step": 11489 + }, + { + "epoch": 0.33, + "grad_norm": 2.591107629474722, + "learning_rate": 7.776142284615383e-06, + "loss": 0.3407, + "step": 11490 + }, + { + "epoch": 0.33, + "grad_norm": 1.740546048090341, + "learning_rate": 7.775751617509416e-06, + "loss": 0.3351, + "step": 11491 + }, + { + "epoch": 0.33, + "grad_norm": 1.4837213063680725, + "learning_rate": 7.775360925907413e-06, + "loss": 0.352, + "step": 11492 + }, + { + "epoch": 0.33, + "grad_norm": 1.2590674105542823, + "learning_rate": 7.774970209812818e-06, + "loss": 0.3375, + "step": 11493 + }, + { + "epoch": 0.33, + "grad_norm": 1.8777953624348187, + "learning_rate": 7.77457946922908e-06, + "loss": 0.3544, + "step": 11494 + }, + { + "epoch": 0.33, + "grad_norm": 1.4325080078671604, + "learning_rate": 7.774188704159646e-06, + "loss": 0.3429, + "step": 11495 + }, + { + "epoch": 0.33, + "grad_norm": 17.89368572506621, + "learning_rate": 7.773797914607967e-06, + "loss": 0.3279, + "step": 11496 + }, + { + "epoch": 0.33, + "grad_norm": 1.4045838709780782, + "learning_rate": 7.773407100577492e-06, + "loss": 0.339, + "step": 11497 + }, + { + "epoch": 0.33, + "grad_norm": 1.359214278455175, + "learning_rate": 7.773016262071669e-06, + "loss": 0.3296, + "step": 11498 + }, + { + "epoch": 0.33, + "grad_norm": 1.692504321597271, + "learning_rate": 7.772625399093945e-06, + "loss": 0.3611, + "step": 11499 + }, + { + "epoch": 0.33, + "grad_norm": 1.4924460453171389, + "learning_rate": 7.772234511647771e-06, + "loss": 0.3453, + "step": 11500 + }, + { + "epoch": 0.33, + "grad_norm": 1.5503651495875785, + "learning_rate": 7.771843599736598e-06, + "loss": 0.3401, + "step": 11501 + }, + { + "epoch": 0.33, + "grad_norm": 1.4979505545190883, + "learning_rate": 7.771452663363873e-06, + "loss": 0.3455, + "step": 11502 + }, + { + "epoch": 0.33, + "grad_norm": 1.4728401761365386, + "learning_rate": 7.771061702533049e-06, + "loss": 0.3325, + "step": 11503 + }, + { + "epoch": 0.33, + "grad_norm": 1.2754586807625725, + "learning_rate": 7.770670717247572e-06, + "loss": 0.3343, + "step": 11504 + }, + { + "epoch": 0.33, + "grad_norm": 1.470519399369391, + "learning_rate": 7.7702797075109e-06, + "loss": 0.3376, + "step": 11505 + }, + { + "epoch": 0.33, + "grad_norm": 1.37897593462972, + "learning_rate": 7.769888673326475e-06, + "loss": 0.3681, + "step": 11506 + }, + { + "epoch": 0.33, + "grad_norm": 1.6013397768417197, + "learning_rate": 7.769497614697752e-06, + "loss": 0.3523, + "step": 11507 + }, + { + "epoch": 0.33, + "grad_norm": 1.220709568439323, + "learning_rate": 7.769106531628182e-06, + "loss": 0.315, + "step": 11508 + }, + { + "epoch": 0.33, + "grad_norm": 1.3938821415905593, + "learning_rate": 7.768715424121216e-06, + "loss": 0.3568, + "step": 11509 + }, + { + "epoch": 0.33, + "grad_norm": 1.7286905810747637, + "learning_rate": 7.768324292180307e-06, + "loss": 0.3296, + "step": 11510 + }, + { + "epoch": 0.33, + "grad_norm": 1.9485668120350845, + "learning_rate": 7.767933135808905e-06, + "loss": 0.3288, + "step": 11511 + }, + { + "epoch": 0.33, + "grad_norm": 1.2266435976439432, + "learning_rate": 7.767541955010463e-06, + "loss": 0.3229, + "step": 11512 + }, + { + "epoch": 0.33, + "grad_norm": 1.599266499683057, + "learning_rate": 7.767150749788431e-06, + "loss": 0.3609, + "step": 11513 + }, + { + "epoch": 0.33, + "grad_norm": 1.266692837189872, + "learning_rate": 7.766759520146263e-06, + "loss": 0.3154, + "step": 11514 + }, + { + "epoch": 0.33, + "grad_norm": 2.4480611448109015, + "learning_rate": 7.766368266087413e-06, + "loss": 0.351, + "step": 11515 + }, + { + "epoch": 0.33, + "grad_norm": 1.3241396945074928, + "learning_rate": 7.765976987615332e-06, + "loss": 0.3177, + "step": 11516 + }, + { + "epoch": 0.33, + "grad_norm": 1.8100598103130399, + "learning_rate": 7.765585684733472e-06, + "loss": 0.3546, + "step": 11517 + }, + { + "epoch": 0.33, + "grad_norm": 1.3063505786960075, + "learning_rate": 7.765194357445288e-06, + "loss": 0.3308, + "step": 11518 + }, + { + "epoch": 0.33, + "grad_norm": 1.5428023784861291, + "learning_rate": 7.764803005754234e-06, + "loss": 0.3516, + "step": 11519 + }, + { + "epoch": 0.33, + "grad_norm": 1.3915115265650204, + "learning_rate": 7.764411629663762e-06, + "loss": 0.3355, + "step": 11520 + }, + { + "epoch": 0.33, + "grad_norm": 3.3776225195467457, + "learning_rate": 7.764020229177327e-06, + "loss": 0.3319, + "step": 11521 + }, + { + "epoch": 0.33, + "grad_norm": 1.6498226270361807, + "learning_rate": 7.763628804298383e-06, + "loss": 0.3416, + "step": 11522 + }, + { + "epoch": 0.33, + "grad_norm": 2.0668619141819886, + "learning_rate": 7.763237355030384e-06, + "loss": 0.3676, + "step": 11523 + }, + { + "epoch": 0.33, + "grad_norm": 1.4151942898964491, + "learning_rate": 7.762845881376785e-06, + "loss": 0.3344, + "step": 11524 + }, + { + "epoch": 0.33, + "grad_norm": 1.9409415035004203, + "learning_rate": 7.76245438334104e-06, + "loss": 0.3186, + "step": 11525 + }, + { + "epoch": 0.33, + "grad_norm": 2.8738481035746064, + "learning_rate": 7.762062860926606e-06, + "loss": 0.3426, + "step": 11526 + }, + { + "epoch": 0.33, + "grad_norm": 1.4956790972261358, + "learning_rate": 7.761671314136935e-06, + "loss": 0.3765, + "step": 11527 + }, + { + "epoch": 0.33, + "grad_norm": 1.5494664193724539, + "learning_rate": 7.761279742975484e-06, + "loss": 0.3438, + "step": 11528 + }, + { + "epoch": 0.33, + "grad_norm": 1.188791691439459, + "learning_rate": 7.76088814744571e-06, + "loss": 0.3477, + "step": 11529 + }, + { + "epoch": 0.33, + "grad_norm": 1.578642698935575, + "learning_rate": 7.760496527551065e-06, + "loss": 0.3526, + "step": 11530 + }, + { + "epoch": 0.33, + "grad_norm": 1.7572787974154267, + "learning_rate": 7.76010488329501e-06, + "loss": 0.3373, + "step": 11531 + }, + { + "epoch": 0.33, + "grad_norm": 1.4612876660680938, + "learning_rate": 7.759713214680997e-06, + "loss": 0.3384, + "step": 11532 + }, + { + "epoch": 0.33, + "grad_norm": 1.091376286696929, + "learning_rate": 7.759321521712484e-06, + "loss": 0.6154, + "step": 11533 + }, + { + "epoch": 0.33, + "grad_norm": 1.474952238053242, + "learning_rate": 7.75892980439293e-06, + "loss": 0.3337, + "step": 11534 + }, + { + "epoch": 0.33, + "grad_norm": 1.5023824674992041, + "learning_rate": 7.75853806272579e-06, + "loss": 0.3358, + "step": 11535 + }, + { + "epoch": 0.33, + "grad_norm": 1.7995166282741366, + "learning_rate": 7.758146296714519e-06, + "loss": 0.3522, + "step": 11536 + }, + { + "epoch": 0.33, + "grad_norm": 1.563971812237109, + "learning_rate": 7.757754506362575e-06, + "loss": 0.3709, + "step": 11537 + }, + { + "epoch": 0.33, + "grad_norm": 1.4410582859187735, + "learning_rate": 7.75736269167342e-06, + "loss": 0.3519, + "step": 11538 + }, + { + "epoch": 0.33, + "grad_norm": 2.3797661494215223, + "learning_rate": 7.756970852650509e-06, + "loss": 0.3698, + "step": 11539 + }, + { + "epoch": 0.33, + "grad_norm": 1.371399242918237, + "learning_rate": 7.756578989297299e-06, + "loss": 0.3445, + "step": 11540 + }, + { + "epoch": 0.33, + "grad_norm": 1.3636741836575634, + "learning_rate": 7.756187101617248e-06, + "loss": 0.3373, + "step": 11541 + }, + { + "epoch": 0.33, + "grad_norm": 1.3462729718693964, + "learning_rate": 7.755795189613815e-06, + "loss": 0.3387, + "step": 11542 + }, + { + "epoch": 0.33, + "grad_norm": 1.552082581222324, + "learning_rate": 7.75540325329046e-06, + "loss": 0.3302, + "step": 11543 + }, + { + "epoch": 0.33, + "grad_norm": 1.35164905355156, + "learning_rate": 7.755011292650638e-06, + "loss": 0.3373, + "step": 11544 + }, + { + "epoch": 0.33, + "grad_norm": 1.415586609455866, + "learning_rate": 7.754619307697812e-06, + "loss": 0.3206, + "step": 11545 + }, + { + "epoch": 0.33, + "grad_norm": 1.377894116514725, + "learning_rate": 7.754227298435442e-06, + "loss": 0.3539, + "step": 11546 + }, + { + "epoch": 0.33, + "grad_norm": 3.1233200411578115, + "learning_rate": 7.753835264866985e-06, + "loss": 0.3523, + "step": 11547 + }, + { + "epoch": 0.33, + "grad_norm": 1.4511299684460035, + "learning_rate": 7.7534432069959e-06, + "loss": 0.3853, + "step": 11548 + }, + { + "epoch": 0.33, + "grad_norm": 1.373250443581798, + "learning_rate": 7.753051124825648e-06, + "loss": 0.3167, + "step": 11549 + }, + { + "epoch": 0.34, + "grad_norm": 2.0990717706621256, + "learning_rate": 7.75265901835969e-06, + "loss": 0.3311, + "step": 11550 + }, + { + "epoch": 0.34, + "grad_norm": 1.3543388446895965, + "learning_rate": 7.752266887601486e-06, + "loss": 0.3306, + "step": 11551 + }, + { + "epoch": 0.34, + "grad_norm": 1.7200245098143716, + "learning_rate": 7.751874732554496e-06, + "loss": 0.3288, + "step": 11552 + }, + { + "epoch": 0.34, + "grad_norm": 1.6828132336155819, + "learning_rate": 7.751482553222181e-06, + "loss": 0.3323, + "step": 11553 + }, + { + "epoch": 0.34, + "grad_norm": 0.9136466837672492, + "learning_rate": 7.751090349608003e-06, + "loss": 0.6749, + "step": 11554 + }, + { + "epoch": 0.34, + "grad_norm": 1.3397058611648722, + "learning_rate": 7.75069812171542e-06, + "loss": 0.3509, + "step": 11555 + }, + { + "epoch": 0.34, + "grad_norm": 0.9520192030255247, + "learning_rate": 7.750305869547896e-06, + "loss": 0.5933, + "step": 11556 + }, + { + "epoch": 0.34, + "grad_norm": 1.3088027497135841, + "learning_rate": 7.749913593108893e-06, + "loss": 0.3174, + "step": 11557 + }, + { + "epoch": 0.34, + "grad_norm": 1.533532195357622, + "learning_rate": 7.749521292401872e-06, + "loss": 0.3131, + "step": 11558 + }, + { + "epoch": 0.34, + "grad_norm": 1.4795939212996023, + "learning_rate": 7.749128967430295e-06, + "loss": 0.3478, + "step": 11559 + }, + { + "epoch": 0.34, + "grad_norm": 1.3224943343458975, + "learning_rate": 7.748736618197623e-06, + "loss": 0.3435, + "step": 11560 + }, + { + "epoch": 0.34, + "grad_norm": 1.5001125191292217, + "learning_rate": 7.748344244707322e-06, + "loss": 0.3469, + "step": 11561 + }, + { + "epoch": 0.34, + "grad_norm": 1.7097602718439813, + "learning_rate": 7.74795184696285e-06, + "loss": 0.3494, + "step": 11562 + }, + { + "epoch": 0.34, + "grad_norm": 1.45878476400733, + "learning_rate": 7.747559424967675e-06, + "loss": 0.3114, + "step": 11563 + }, + { + "epoch": 0.34, + "grad_norm": 1.4170567315965894, + "learning_rate": 7.747166978725255e-06, + "loss": 0.3511, + "step": 11564 + }, + { + "epoch": 0.34, + "grad_norm": 1.466221419504235, + "learning_rate": 7.74677450823906e-06, + "loss": 0.3222, + "step": 11565 + }, + { + "epoch": 0.34, + "grad_norm": 1.6373781035586095, + "learning_rate": 7.746382013512546e-06, + "loss": 0.3407, + "step": 11566 + }, + { + "epoch": 0.34, + "grad_norm": 1.479196933551821, + "learning_rate": 7.745989494549182e-06, + "loss": 0.3311, + "step": 11567 + }, + { + "epoch": 0.34, + "grad_norm": 1.415075239371081, + "learning_rate": 7.74559695135243e-06, + "loss": 0.3375, + "step": 11568 + }, + { + "epoch": 0.34, + "grad_norm": 1.3839161840692185, + "learning_rate": 7.745204383925753e-06, + "loss": 0.3426, + "step": 11569 + }, + { + "epoch": 0.34, + "grad_norm": 1.2715419711330127, + "learning_rate": 7.744811792272619e-06, + "loss": 0.3464, + "step": 11570 + }, + { + "epoch": 0.34, + "grad_norm": 1.417138950693204, + "learning_rate": 7.744419176396489e-06, + "loss": 0.353, + "step": 11571 + }, + { + "epoch": 0.34, + "grad_norm": 1.2660961071320014, + "learning_rate": 7.74402653630083e-06, + "loss": 0.3434, + "step": 11572 + }, + { + "epoch": 0.34, + "grad_norm": 1.41895667100254, + "learning_rate": 7.743633871989105e-06, + "loss": 0.3518, + "step": 11573 + }, + { + "epoch": 0.34, + "grad_norm": 1.3927045148605623, + "learning_rate": 7.743241183464783e-06, + "loss": 0.3358, + "step": 11574 + }, + { + "epoch": 0.34, + "grad_norm": 1.7003572615730456, + "learning_rate": 7.742848470731325e-06, + "loss": 0.3376, + "step": 11575 + }, + { + "epoch": 0.34, + "grad_norm": 1.4390470867842189, + "learning_rate": 7.7424557337922e-06, + "loss": 0.3683, + "step": 11576 + }, + { + "epoch": 0.34, + "grad_norm": 2.0307173945050288, + "learning_rate": 7.742062972650872e-06, + "loss": 0.3183, + "step": 11577 + }, + { + "epoch": 0.34, + "grad_norm": 0.968524392712898, + "learning_rate": 7.741670187310808e-06, + "loss": 0.619, + "step": 11578 + }, + { + "epoch": 0.34, + "grad_norm": 1.3302586619738874, + "learning_rate": 7.741277377775473e-06, + "loss": 0.3295, + "step": 11579 + }, + { + "epoch": 0.34, + "grad_norm": 1.779449089376463, + "learning_rate": 7.740884544048337e-06, + "loss": 0.3503, + "step": 11580 + }, + { + "epoch": 0.34, + "grad_norm": 1.4208003606070359, + "learning_rate": 7.740491686132864e-06, + "loss": 0.3051, + "step": 11581 + }, + { + "epoch": 0.34, + "grad_norm": 1.331798526356091, + "learning_rate": 7.74009880403252e-06, + "loss": 0.3312, + "step": 11582 + }, + { + "epoch": 0.34, + "grad_norm": 2.2456231168420286, + "learning_rate": 7.739705897750775e-06, + "loss": 0.324, + "step": 11583 + }, + { + "epoch": 0.34, + "grad_norm": 1.4044123490972764, + "learning_rate": 7.739312967291095e-06, + "loss": 0.36, + "step": 11584 + }, + { + "epoch": 0.34, + "grad_norm": 1.6480929273545333, + "learning_rate": 7.738920012656947e-06, + "loss": 0.3473, + "step": 11585 + }, + { + "epoch": 0.34, + "grad_norm": 1.4076696979007612, + "learning_rate": 7.738527033851802e-06, + "loss": 0.3514, + "step": 11586 + }, + { + "epoch": 0.34, + "grad_norm": 1.3519964536161644, + "learning_rate": 7.738134030879122e-06, + "loss": 0.3465, + "step": 11587 + }, + { + "epoch": 0.34, + "grad_norm": 1.256886662506597, + "learning_rate": 7.73774100374238e-06, + "loss": 0.3186, + "step": 11588 + }, + { + "epoch": 0.34, + "grad_norm": 1.3653551003156827, + "learning_rate": 7.737347952445044e-06, + "loss": 0.3142, + "step": 11589 + }, + { + "epoch": 0.34, + "grad_norm": 1.3720457890887088, + "learning_rate": 7.736954876990583e-06, + "loss": 0.3315, + "step": 11590 + }, + { + "epoch": 0.34, + "grad_norm": 1.3359930450439472, + "learning_rate": 7.736561777382463e-06, + "loss": 0.3159, + "step": 11591 + }, + { + "epoch": 0.34, + "grad_norm": 1.41964201206167, + "learning_rate": 7.736168653624154e-06, + "loss": 0.3606, + "step": 11592 + }, + { + "epoch": 0.34, + "grad_norm": 1.4474201543732723, + "learning_rate": 7.735775505719129e-06, + "loss": 0.3336, + "step": 11593 + }, + { + "epoch": 0.34, + "grad_norm": 0.9886730888229691, + "learning_rate": 7.735382333670853e-06, + "loss": 0.596, + "step": 11594 + }, + { + "epoch": 0.34, + "grad_norm": 1.4933626061302, + "learning_rate": 7.734989137482798e-06, + "loss": 0.3416, + "step": 11595 + }, + { + "epoch": 0.34, + "grad_norm": 1.3200813149001134, + "learning_rate": 7.734595917158434e-06, + "loss": 0.3421, + "step": 11596 + }, + { + "epoch": 0.34, + "grad_norm": 1.5046380063889164, + "learning_rate": 7.73420267270123e-06, + "loss": 0.3516, + "step": 11597 + }, + { + "epoch": 0.34, + "grad_norm": 1.4434833749882972, + "learning_rate": 7.733809404114657e-06, + "loss": 0.3536, + "step": 11598 + }, + { + "epoch": 0.34, + "grad_norm": 1.3390686652829784, + "learning_rate": 7.733416111402187e-06, + "loss": 0.3569, + "step": 11599 + }, + { + "epoch": 0.34, + "grad_norm": 2.210346337291404, + "learning_rate": 7.733022794567288e-06, + "loss": 0.3808, + "step": 11600 + }, + { + "epoch": 0.34, + "grad_norm": 1.4853234618740145, + "learning_rate": 7.732629453613433e-06, + "loss": 0.3307, + "step": 11601 + }, + { + "epoch": 0.34, + "grad_norm": 1.2911515074515139, + "learning_rate": 7.732236088544093e-06, + "loss": 0.3501, + "step": 11602 + }, + { + "epoch": 0.34, + "grad_norm": 1.3691819502661517, + "learning_rate": 7.731842699362739e-06, + "loss": 0.3346, + "step": 11603 + }, + { + "epoch": 0.34, + "grad_norm": 1.2892056653806419, + "learning_rate": 7.731449286072843e-06, + "loss": 0.3388, + "step": 11604 + }, + { + "epoch": 0.34, + "grad_norm": 1.9243852411060336, + "learning_rate": 7.731055848677875e-06, + "loss": 0.3464, + "step": 11605 + }, + { + "epoch": 0.34, + "grad_norm": 1.3931234487518016, + "learning_rate": 7.730662387181311e-06, + "loss": 0.3339, + "step": 11606 + }, + { + "epoch": 0.34, + "grad_norm": 1.2234691867473289, + "learning_rate": 7.730268901586621e-06, + "loss": 0.3386, + "step": 11607 + }, + { + "epoch": 0.34, + "grad_norm": 1.323870809198424, + "learning_rate": 7.729875391897276e-06, + "loss": 0.3263, + "step": 11608 + }, + { + "epoch": 0.34, + "grad_norm": 1.3317030834796177, + "learning_rate": 7.729481858116754e-06, + "loss": 0.3421, + "step": 11609 + }, + { + "epoch": 0.34, + "grad_norm": 1.3950041792820007, + "learning_rate": 7.72908830024852e-06, + "loss": 0.3581, + "step": 11610 + }, + { + "epoch": 0.34, + "grad_norm": 1.8703514050321766, + "learning_rate": 7.728694718296055e-06, + "loss": 0.3813, + "step": 11611 + }, + { + "epoch": 0.34, + "grad_norm": 1.6471864712513595, + "learning_rate": 7.728301112262826e-06, + "loss": 0.3596, + "step": 11612 + }, + { + "epoch": 0.34, + "grad_norm": 2.9266031380208406, + "learning_rate": 7.727907482152311e-06, + "loss": 0.3469, + "step": 11613 + }, + { + "epoch": 0.34, + "grad_norm": 1.4256259696456706, + "learning_rate": 7.727513827967981e-06, + "loss": 0.3232, + "step": 11614 + }, + { + "epoch": 0.34, + "grad_norm": 2.5453083766493783, + "learning_rate": 7.727120149713313e-06, + "loss": 0.3727, + "step": 11615 + }, + { + "epoch": 0.34, + "grad_norm": 2.1746318861476635, + "learning_rate": 7.726726447391779e-06, + "loss": 0.3244, + "step": 11616 + }, + { + "epoch": 0.34, + "grad_norm": 1.600841115269529, + "learning_rate": 7.726332721006852e-06, + "loss": 0.364, + "step": 11617 + }, + { + "epoch": 0.34, + "grad_norm": 1.08963949795417, + "learning_rate": 7.72593897056201e-06, + "loss": 0.6172, + "step": 11618 + }, + { + "epoch": 0.34, + "grad_norm": 1.2881312483225733, + "learning_rate": 7.725545196060726e-06, + "loss": 0.3433, + "step": 11619 + }, + { + "epoch": 0.34, + "grad_norm": 1.3325729189295004, + "learning_rate": 7.725151397506477e-06, + "loss": 0.351, + "step": 11620 + }, + { + "epoch": 0.34, + "grad_norm": 1.7173318134722115, + "learning_rate": 7.724757574902735e-06, + "loss": 0.3278, + "step": 11621 + }, + { + "epoch": 0.34, + "grad_norm": 1.2890597939090642, + "learning_rate": 7.724363728252977e-06, + "loss": 0.3464, + "step": 11622 + }, + { + "epoch": 0.34, + "grad_norm": 1.4855988203987631, + "learning_rate": 7.72396985756068e-06, + "loss": 0.3706, + "step": 11623 + }, + { + "epoch": 0.34, + "grad_norm": 1.2783811580069888, + "learning_rate": 7.723575962829317e-06, + "loss": 0.3234, + "step": 11624 + }, + { + "epoch": 0.34, + "grad_norm": 1.2718648324534063, + "learning_rate": 7.723182044062368e-06, + "loss": 0.3268, + "step": 11625 + }, + { + "epoch": 0.34, + "grad_norm": 1.673242038811271, + "learning_rate": 7.722788101263306e-06, + "loss": 0.3295, + "step": 11626 + }, + { + "epoch": 0.34, + "grad_norm": 1.6028551537420377, + "learning_rate": 7.722394134435609e-06, + "loss": 0.3182, + "step": 11627 + }, + { + "epoch": 0.34, + "grad_norm": 2.280524269998426, + "learning_rate": 7.722000143582754e-06, + "loss": 0.3503, + "step": 11628 + }, + { + "epoch": 0.34, + "grad_norm": 1.7555131244848303, + "learning_rate": 7.721606128708217e-06, + "loss": 0.3268, + "step": 11629 + }, + { + "epoch": 0.34, + "grad_norm": 1.3097857854073918, + "learning_rate": 7.721212089815475e-06, + "loss": 0.347, + "step": 11630 + }, + { + "epoch": 0.34, + "grad_norm": 1.3715390261257363, + "learning_rate": 7.720818026908007e-06, + "loss": 0.3329, + "step": 11631 + }, + { + "epoch": 0.34, + "grad_norm": 3.753708840823589, + "learning_rate": 7.72042393998929e-06, + "loss": 0.3448, + "step": 11632 + }, + { + "epoch": 0.34, + "grad_norm": 1.343313447199088, + "learning_rate": 7.720029829062803e-06, + "loss": 0.3313, + "step": 11633 + }, + { + "epoch": 0.34, + "grad_norm": 1.3554390439607962, + "learning_rate": 7.71963569413202e-06, + "loss": 0.3208, + "step": 11634 + }, + { + "epoch": 0.34, + "grad_norm": 1.3377173170737189, + "learning_rate": 7.719241535200423e-06, + "loss": 0.3298, + "step": 11635 + }, + { + "epoch": 0.34, + "grad_norm": 1.308711163717399, + "learning_rate": 7.718847352271487e-06, + "loss": 0.3298, + "step": 11636 + }, + { + "epoch": 0.34, + "grad_norm": 1.6311786600286078, + "learning_rate": 7.718453145348696e-06, + "loss": 0.3332, + "step": 11637 + }, + { + "epoch": 0.34, + "grad_norm": 1.2753454653296872, + "learning_rate": 7.718058914435526e-06, + "loss": 0.31, + "step": 11638 + }, + { + "epoch": 0.34, + "grad_norm": 1.4547115495590455, + "learning_rate": 7.717664659535456e-06, + "loss": 0.3734, + "step": 11639 + }, + { + "epoch": 0.34, + "grad_norm": 1.3916875547942282, + "learning_rate": 7.717270380651962e-06, + "loss": 0.3391, + "step": 11640 + }, + { + "epoch": 0.34, + "grad_norm": 1.5593827819368578, + "learning_rate": 7.71687607778853e-06, + "loss": 0.3326, + "step": 11641 + }, + { + "epoch": 0.34, + "grad_norm": 1.5449359413937076, + "learning_rate": 7.716481750948634e-06, + "loss": 0.3562, + "step": 11642 + }, + { + "epoch": 0.34, + "grad_norm": 1.5366394941449868, + "learning_rate": 7.716087400135759e-06, + "loss": 0.3076, + "step": 11643 + }, + { + "epoch": 0.34, + "grad_norm": 0.9688180958810084, + "learning_rate": 7.71569302535338e-06, + "loss": 0.6013, + "step": 11644 + }, + { + "epoch": 0.34, + "grad_norm": 1.3432473246017325, + "learning_rate": 7.715298626604981e-06, + "loss": 0.3354, + "step": 11645 + }, + { + "epoch": 0.34, + "grad_norm": 1.3791703437720215, + "learning_rate": 7.714904203894041e-06, + "loss": 0.3437, + "step": 11646 + }, + { + "epoch": 0.34, + "grad_norm": 1.4587307715505538, + "learning_rate": 7.714509757224043e-06, + "loss": 0.3353, + "step": 11647 + }, + { + "epoch": 0.34, + "grad_norm": 1.4530773064554074, + "learning_rate": 7.714115286598465e-06, + "loss": 0.3658, + "step": 11648 + }, + { + "epoch": 0.34, + "grad_norm": 1.2558131882006813, + "learning_rate": 7.713720792020788e-06, + "loss": 0.3194, + "step": 11649 + }, + { + "epoch": 0.34, + "grad_norm": 1.5211337069825626, + "learning_rate": 7.713326273494496e-06, + "loss": 0.3434, + "step": 11650 + }, + { + "epoch": 0.34, + "grad_norm": 1.4411693550116238, + "learning_rate": 7.712931731023073e-06, + "loss": 0.3263, + "step": 11651 + }, + { + "epoch": 0.34, + "grad_norm": 1.625595941388698, + "learning_rate": 7.712537164609995e-06, + "loss": 0.3435, + "step": 11652 + }, + { + "epoch": 0.34, + "grad_norm": 1.6133655469689185, + "learning_rate": 7.712142574258745e-06, + "loss": 0.3394, + "step": 11653 + }, + { + "epoch": 0.34, + "grad_norm": 1.3778577525929965, + "learning_rate": 7.711747959972807e-06, + "loss": 0.3575, + "step": 11654 + }, + { + "epoch": 0.34, + "grad_norm": 1.713416500184339, + "learning_rate": 7.711353321755663e-06, + "loss": 0.318, + "step": 11655 + }, + { + "epoch": 0.34, + "grad_norm": 1.3025733416122267, + "learning_rate": 7.710958659610798e-06, + "loss": 0.3362, + "step": 11656 + }, + { + "epoch": 0.34, + "grad_norm": 1.465374429262567, + "learning_rate": 7.710563973541691e-06, + "loss": 0.3347, + "step": 11657 + }, + { + "epoch": 0.34, + "grad_norm": 2.4200495271674263, + "learning_rate": 7.710169263551826e-06, + "loss": 0.3203, + "step": 11658 + }, + { + "epoch": 0.34, + "grad_norm": 1.4282581513475248, + "learning_rate": 7.709774529644688e-06, + "loss": 0.3443, + "step": 11659 + }, + { + "epoch": 0.34, + "grad_norm": 1.4743643533165105, + "learning_rate": 7.709379771823762e-06, + "loss": 0.3288, + "step": 11660 + }, + { + "epoch": 0.34, + "grad_norm": 1.5040003257676526, + "learning_rate": 7.708984990092528e-06, + "loss": 0.3769, + "step": 11661 + }, + { + "epoch": 0.34, + "grad_norm": 1.3416333484746334, + "learning_rate": 7.708590184454469e-06, + "loss": 0.329, + "step": 11662 + }, + { + "epoch": 0.34, + "grad_norm": 1.4089478180938728, + "learning_rate": 7.708195354913074e-06, + "loss": 0.3472, + "step": 11663 + }, + { + "epoch": 0.34, + "grad_norm": 1.3283452497957584, + "learning_rate": 7.707800501471825e-06, + "loss": 0.3213, + "step": 11664 + }, + { + "epoch": 0.34, + "grad_norm": 1.4087048570262977, + "learning_rate": 7.707405624134207e-06, + "loss": 0.3474, + "step": 11665 + }, + { + "epoch": 0.34, + "grad_norm": 2.9820832235956747, + "learning_rate": 7.707010722903703e-06, + "loss": 0.3441, + "step": 11666 + }, + { + "epoch": 0.34, + "grad_norm": 1.316008071375744, + "learning_rate": 7.7066157977838e-06, + "loss": 0.348, + "step": 11667 + }, + { + "epoch": 0.34, + "grad_norm": 1.4587856090960563, + "learning_rate": 7.706220848777981e-06, + "loss": 0.327, + "step": 11668 + }, + { + "epoch": 0.34, + "grad_norm": 1.7171600541192529, + "learning_rate": 7.705825875889735e-06, + "loss": 0.3204, + "step": 11669 + }, + { + "epoch": 0.34, + "grad_norm": 0.9987705832207702, + "learning_rate": 7.705430879122544e-06, + "loss": 0.6383, + "step": 11670 + }, + { + "epoch": 0.34, + "grad_norm": 1.78957853271295, + "learning_rate": 7.705035858479897e-06, + "loss": 0.3406, + "step": 11671 + }, + { + "epoch": 0.34, + "grad_norm": 1.5624490175122976, + "learning_rate": 7.70464081396528e-06, + "loss": 0.3532, + "step": 11672 + }, + { + "epoch": 0.34, + "grad_norm": 1.3108570431650137, + "learning_rate": 7.704245745582174e-06, + "loss": 0.3226, + "step": 11673 + }, + { + "epoch": 0.34, + "grad_norm": 1.4536994470664737, + "learning_rate": 7.70385065333407e-06, + "loss": 0.3433, + "step": 11674 + }, + { + "epoch": 0.34, + "grad_norm": 1.3995082777748793, + "learning_rate": 7.703455537224455e-06, + "loss": 0.3362, + "step": 11675 + }, + { + "epoch": 0.34, + "grad_norm": 1.3210176068071224, + "learning_rate": 7.703060397256816e-06, + "loss": 0.3363, + "step": 11676 + }, + { + "epoch": 0.34, + "grad_norm": 1.432385242641282, + "learning_rate": 7.70266523343464e-06, + "loss": 0.3522, + "step": 11677 + }, + { + "epoch": 0.34, + "grad_norm": 1.2982671236805186, + "learning_rate": 7.70227004576141e-06, + "loss": 0.3254, + "step": 11678 + }, + { + "epoch": 0.34, + "grad_norm": 1.6124374857479105, + "learning_rate": 7.701874834240617e-06, + "loss": 0.3547, + "step": 11679 + }, + { + "epoch": 0.34, + "grad_norm": 1.4395448929371137, + "learning_rate": 7.701479598875751e-06, + "loss": 0.3518, + "step": 11680 + }, + { + "epoch": 0.34, + "grad_norm": 1.7561147675759643, + "learning_rate": 7.701084339670294e-06, + "loss": 0.3125, + "step": 11681 + }, + { + "epoch": 0.34, + "grad_norm": 1.4074759229252083, + "learning_rate": 7.700689056627741e-06, + "loss": 0.3549, + "step": 11682 + }, + { + "epoch": 0.34, + "grad_norm": 1.3266712554501476, + "learning_rate": 7.700293749751578e-06, + "loss": 0.341, + "step": 11683 + }, + { + "epoch": 0.34, + "grad_norm": 1.3785048789975298, + "learning_rate": 7.69989841904529e-06, + "loss": 0.3263, + "step": 11684 + }, + { + "epoch": 0.34, + "grad_norm": 1.2751434375345578, + "learning_rate": 7.699503064512368e-06, + "loss": 0.3307, + "step": 11685 + }, + { + "epoch": 0.34, + "grad_norm": 1.6478381244086011, + "learning_rate": 7.699107686156305e-06, + "loss": 0.3619, + "step": 11686 + }, + { + "epoch": 0.34, + "grad_norm": 1.6713736415548195, + "learning_rate": 7.698712283980583e-06, + "loss": 0.3653, + "step": 11687 + }, + { + "epoch": 0.34, + "grad_norm": 1.4747998937873008, + "learning_rate": 7.698316857988698e-06, + "loss": 0.339, + "step": 11688 + }, + { + "epoch": 0.34, + "grad_norm": 1.7031280734839853, + "learning_rate": 7.697921408184134e-06, + "loss": 0.3286, + "step": 11689 + }, + { + "epoch": 0.34, + "grad_norm": 1.3341413462354885, + "learning_rate": 7.697525934570386e-06, + "loss": 0.3519, + "step": 11690 + }, + { + "epoch": 0.34, + "grad_norm": 1.7182471955995413, + "learning_rate": 7.697130437150941e-06, + "loss": 0.3845, + "step": 11691 + }, + { + "epoch": 0.34, + "grad_norm": 1.7017180370999658, + "learning_rate": 7.69673491592929e-06, + "loss": 0.3644, + "step": 11692 + }, + { + "epoch": 0.34, + "grad_norm": 1.4361537290380821, + "learning_rate": 7.696339370908924e-06, + "loss": 0.3171, + "step": 11693 + }, + { + "epoch": 0.34, + "grad_norm": 1.3959483435393527, + "learning_rate": 7.695943802093333e-06, + "loss": 0.3231, + "step": 11694 + }, + { + "epoch": 0.34, + "grad_norm": 1.777780015679685, + "learning_rate": 7.695548209486006e-06, + "loss": 0.3489, + "step": 11695 + }, + { + "epoch": 0.34, + "grad_norm": 1.2759116224119738, + "learning_rate": 7.695152593090438e-06, + "loss": 0.3197, + "step": 11696 + }, + { + "epoch": 0.34, + "grad_norm": 1.2538564977048665, + "learning_rate": 7.694756952910118e-06, + "loss": 0.3276, + "step": 11697 + }, + { + "epoch": 0.34, + "grad_norm": 1.4362967365072914, + "learning_rate": 7.694361288948538e-06, + "loss": 0.3516, + "step": 11698 + }, + { + "epoch": 0.34, + "grad_norm": 1.3901385644075719, + "learning_rate": 7.69396560120919e-06, + "loss": 0.3448, + "step": 11699 + }, + { + "epoch": 0.34, + "grad_norm": 1.3314496857471914, + "learning_rate": 7.693569889695565e-06, + "loss": 0.345, + "step": 11700 + }, + { + "epoch": 0.34, + "grad_norm": 1.2416817210785933, + "learning_rate": 7.693174154411157e-06, + "loss": 0.335, + "step": 11701 + }, + { + "epoch": 0.34, + "grad_norm": 2.3459189351677656, + "learning_rate": 7.692778395359457e-06, + "loss": 0.333, + "step": 11702 + }, + { + "epoch": 0.34, + "grad_norm": 1.6927983250427272, + "learning_rate": 7.692382612543956e-06, + "loss": 0.3015, + "step": 11703 + }, + { + "epoch": 0.34, + "grad_norm": 1.3195035582398928, + "learning_rate": 7.69198680596815e-06, + "loss": 0.3246, + "step": 11704 + }, + { + "epoch": 0.34, + "grad_norm": 1.4647044803795746, + "learning_rate": 7.691590975635532e-06, + "loss": 0.3681, + "step": 11705 + }, + { + "epoch": 0.34, + "grad_norm": 1.2812272217500926, + "learning_rate": 7.691195121549592e-06, + "loss": 0.3275, + "step": 11706 + }, + { + "epoch": 0.34, + "grad_norm": 1.550446197306871, + "learning_rate": 7.690799243713825e-06, + "loss": 0.3799, + "step": 11707 + }, + { + "epoch": 0.34, + "grad_norm": 1.3989886616726266, + "learning_rate": 7.690403342131725e-06, + "loss": 0.3578, + "step": 11708 + }, + { + "epoch": 0.34, + "grad_norm": 1.372939826907779, + "learning_rate": 7.690007416806787e-06, + "loss": 0.3586, + "step": 11709 + }, + { + "epoch": 0.34, + "grad_norm": 1.8446953808135333, + "learning_rate": 7.689611467742503e-06, + "loss": 0.3397, + "step": 11710 + }, + { + "epoch": 0.34, + "grad_norm": 1.518554935631792, + "learning_rate": 7.689215494942366e-06, + "loss": 0.4008, + "step": 11711 + }, + { + "epoch": 0.34, + "grad_norm": 1.3660699681643331, + "learning_rate": 7.688819498409875e-06, + "loss": 0.35, + "step": 11712 + }, + { + "epoch": 0.34, + "grad_norm": 1.4113516904757422, + "learning_rate": 7.688423478148522e-06, + "loss": 0.3331, + "step": 11713 + }, + { + "epoch": 0.34, + "grad_norm": 1.2695765416003453, + "learning_rate": 7.6880274341618e-06, + "loss": 0.3257, + "step": 11714 + }, + { + "epoch": 0.34, + "grad_norm": 1.547074102280158, + "learning_rate": 7.687631366453208e-06, + "loss": 0.3512, + "step": 11715 + }, + { + "epoch": 0.34, + "grad_norm": 1.519653402290216, + "learning_rate": 7.687235275026239e-06, + "loss": 0.3521, + "step": 11716 + }, + { + "epoch": 0.34, + "grad_norm": 1.5402726777983062, + "learning_rate": 7.686839159884388e-06, + "loss": 0.3498, + "step": 11717 + }, + { + "epoch": 0.34, + "grad_norm": 2.002193339470131, + "learning_rate": 7.686443021031151e-06, + "loss": 0.3581, + "step": 11718 + }, + { + "epoch": 0.34, + "grad_norm": 1.4601320744463695, + "learning_rate": 7.686046858470026e-06, + "loss": 0.3608, + "step": 11719 + }, + { + "epoch": 0.34, + "grad_norm": 1.4602824419803049, + "learning_rate": 7.685650672204508e-06, + "loss": 0.3288, + "step": 11720 + }, + { + "epoch": 0.34, + "grad_norm": 1.2467255877733687, + "learning_rate": 7.685254462238092e-06, + "loss": 0.3166, + "step": 11721 + }, + { + "epoch": 0.34, + "grad_norm": 1.482878771727343, + "learning_rate": 7.684858228574277e-06, + "loss": 0.3253, + "step": 11722 + }, + { + "epoch": 0.34, + "grad_norm": 1.5091026873872995, + "learning_rate": 7.684461971216557e-06, + "loss": 0.3441, + "step": 11723 + }, + { + "epoch": 0.34, + "grad_norm": 1.2240647452597169, + "learning_rate": 7.684065690168431e-06, + "loss": 0.3206, + "step": 11724 + }, + { + "epoch": 0.34, + "grad_norm": 1.2117593894528942, + "learning_rate": 7.683669385433394e-06, + "loss": 0.3488, + "step": 11725 + }, + { + "epoch": 0.34, + "grad_norm": 1.409863321407666, + "learning_rate": 7.683273057014946e-06, + "loss": 0.3338, + "step": 11726 + }, + { + "epoch": 0.34, + "grad_norm": 1.7579329879244712, + "learning_rate": 7.682876704916584e-06, + "loss": 0.3382, + "step": 11727 + }, + { + "epoch": 0.34, + "grad_norm": 1.3270049256060787, + "learning_rate": 7.682480329141804e-06, + "loss": 0.3251, + "step": 11728 + }, + { + "epoch": 0.34, + "grad_norm": 1.3527554786539686, + "learning_rate": 7.682083929694106e-06, + "loss": 0.36, + "step": 11729 + }, + { + "epoch": 0.34, + "grad_norm": 1.350207057681014, + "learning_rate": 7.681687506576988e-06, + "loss": 0.3823, + "step": 11730 + }, + { + "epoch": 0.34, + "grad_norm": 1.2972538632678554, + "learning_rate": 7.681291059793947e-06, + "loss": 0.332, + "step": 11731 + }, + { + "epoch": 0.34, + "grad_norm": 1.3653099506201316, + "learning_rate": 7.680894589348484e-06, + "loss": 0.3327, + "step": 11732 + }, + { + "epoch": 0.34, + "grad_norm": 1.5057722412530654, + "learning_rate": 7.680498095244096e-06, + "loss": 0.3531, + "step": 11733 + }, + { + "epoch": 0.34, + "grad_norm": 1.3143448795397532, + "learning_rate": 7.680101577484282e-06, + "loss": 0.3252, + "step": 11734 + }, + { + "epoch": 0.34, + "grad_norm": 1.4835313695936605, + "learning_rate": 7.679705036072542e-06, + "loss": 0.3354, + "step": 11735 + }, + { + "epoch": 0.34, + "grad_norm": 1.5198592796434125, + "learning_rate": 7.679308471012377e-06, + "loss": 0.3457, + "step": 11736 + }, + { + "epoch": 0.34, + "grad_norm": 1.331323232915701, + "learning_rate": 7.678911882307282e-06, + "loss": 0.3447, + "step": 11737 + }, + { + "epoch": 0.34, + "grad_norm": 1.3382464011371002, + "learning_rate": 7.678515269960763e-06, + "loss": 0.3261, + "step": 11738 + }, + { + "epoch": 0.34, + "grad_norm": 1.417873341477348, + "learning_rate": 7.678118633976315e-06, + "loss": 0.3381, + "step": 11739 + }, + { + "epoch": 0.34, + "grad_norm": 1.4276295774821428, + "learning_rate": 7.67772197435744e-06, + "loss": 0.3772, + "step": 11740 + }, + { + "epoch": 0.34, + "grad_norm": 1.4385818639423338, + "learning_rate": 7.677325291107641e-06, + "loss": 0.3646, + "step": 11741 + }, + { + "epoch": 0.34, + "grad_norm": 1.3782846354394747, + "learning_rate": 7.676928584230417e-06, + "loss": 0.3569, + "step": 11742 + }, + { + "epoch": 0.34, + "grad_norm": 1.5797693600245188, + "learning_rate": 7.676531853729266e-06, + "loss": 0.3297, + "step": 11743 + }, + { + "epoch": 0.34, + "grad_norm": 1.4426063741278512, + "learning_rate": 7.676135099607693e-06, + "loss": 0.3395, + "step": 11744 + }, + { + "epoch": 0.34, + "grad_norm": 1.3774701528855076, + "learning_rate": 7.675738321869197e-06, + "loss": 0.3349, + "step": 11745 + }, + { + "epoch": 0.34, + "grad_norm": 1.3087922231786908, + "learning_rate": 7.675341520517282e-06, + "loss": 0.3445, + "step": 11746 + }, + { + "epoch": 0.34, + "grad_norm": 1.4306682344920816, + "learning_rate": 7.674944695555447e-06, + "loss": 0.3479, + "step": 11747 + }, + { + "epoch": 0.34, + "grad_norm": 1.6166895204879124, + "learning_rate": 7.674547846987197e-06, + "loss": 0.3367, + "step": 11748 + }, + { + "epoch": 0.34, + "grad_norm": 1.3825178231700501, + "learning_rate": 7.674150974816031e-06, + "loss": 0.3328, + "step": 11749 + }, + { + "epoch": 0.34, + "grad_norm": 1.2798132443511914, + "learning_rate": 7.673754079045455e-06, + "loss": 0.3414, + "step": 11750 + }, + { + "epoch": 0.34, + "grad_norm": 1.40947533452502, + "learning_rate": 7.673357159678969e-06, + "loss": 0.3252, + "step": 11751 + }, + { + "epoch": 0.34, + "grad_norm": 1.3857920533383903, + "learning_rate": 7.672960216720076e-06, + "loss": 0.366, + "step": 11752 + }, + { + "epoch": 0.34, + "grad_norm": 1.5803169122593343, + "learning_rate": 7.672563250172278e-06, + "loss": 0.3344, + "step": 11753 + }, + { + "epoch": 0.34, + "grad_norm": 1.2308141948134954, + "learning_rate": 7.67216626003908e-06, + "loss": 0.3296, + "step": 11754 + }, + { + "epoch": 0.34, + "grad_norm": 1.395415589192456, + "learning_rate": 7.671769246323986e-06, + "loss": 0.3391, + "step": 11755 + }, + { + "epoch": 0.34, + "grad_norm": 1.429989666845395, + "learning_rate": 7.671372209030499e-06, + "loss": 0.3596, + "step": 11756 + }, + { + "epoch": 0.34, + "grad_norm": 1.578848432445936, + "learning_rate": 7.670975148162124e-06, + "loss": 0.3345, + "step": 11757 + }, + { + "epoch": 0.34, + "grad_norm": 0.9110590626810313, + "learning_rate": 7.67057806372236e-06, + "loss": 0.5836, + "step": 11758 + }, + { + "epoch": 0.34, + "grad_norm": 1.352001710678159, + "learning_rate": 7.670180955714717e-06, + "loss": 0.3658, + "step": 11759 + }, + { + "epoch": 0.34, + "grad_norm": 1.447660903001411, + "learning_rate": 7.669783824142698e-06, + "loss": 0.3722, + "step": 11760 + }, + { + "epoch": 0.34, + "grad_norm": 1.555602230715191, + "learning_rate": 7.669386669009807e-06, + "loss": 0.3453, + "step": 11761 + }, + { + "epoch": 0.34, + "grad_norm": 1.246770088247038, + "learning_rate": 7.668989490319547e-06, + "loss": 0.3322, + "step": 11762 + }, + { + "epoch": 0.34, + "grad_norm": 1.3550564798941747, + "learning_rate": 7.668592288075427e-06, + "loss": 0.3445, + "step": 11763 + }, + { + "epoch": 0.34, + "grad_norm": 1.4110820728778999, + "learning_rate": 7.66819506228095e-06, + "loss": 0.3687, + "step": 11764 + }, + { + "epoch": 0.34, + "grad_norm": 1.2801009539460149, + "learning_rate": 7.667797812939624e-06, + "loss": 0.324, + "step": 11765 + }, + { + "epoch": 0.34, + "grad_norm": 2.0320047298590986, + "learning_rate": 7.66740054005495e-06, + "loss": 0.3754, + "step": 11766 + }, + { + "epoch": 0.34, + "grad_norm": 1.2736913551074103, + "learning_rate": 7.667003243630438e-06, + "loss": 0.3376, + "step": 11767 + }, + { + "epoch": 0.34, + "grad_norm": 1.4318526192342638, + "learning_rate": 7.666605923669592e-06, + "loss": 0.3717, + "step": 11768 + }, + { + "epoch": 0.34, + "grad_norm": 1.9631474631433254, + "learning_rate": 7.66620858017592e-06, + "loss": 0.3771, + "step": 11769 + }, + { + "epoch": 0.34, + "grad_norm": 1.2664373807412774, + "learning_rate": 7.665811213152928e-06, + "loss": 0.3227, + "step": 11770 + }, + { + "epoch": 0.34, + "grad_norm": 1.5023822693611923, + "learning_rate": 7.665413822604119e-06, + "loss": 0.3371, + "step": 11771 + }, + { + "epoch": 0.34, + "grad_norm": 1.4160980447034421, + "learning_rate": 7.665016408533007e-06, + "loss": 0.3196, + "step": 11772 + }, + { + "epoch": 0.34, + "grad_norm": 1.2612825520799116, + "learning_rate": 7.664618970943094e-06, + "loss": 0.34, + "step": 11773 + }, + { + "epoch": 0.34, + "grad_norm": 1.344760306460247, + "learning_rate": 7.66422150983789e-06, + "loss": 0.3417, + "step": 11774 + }, + { + "epoch": 0.34, + "grad_norm": 1.4728811218164901, + "learning_rate": 7.663824025220902e-06, + "loss": 0.3235, + "step": 11775 + }, + { + "epoch": 0.34, + "grad_norm": 1.3849620460373568, + "learning_rate": 7.663426517095637e-06, + "loss": 0.3405, + "step": 11776 + }, + { + "epoch": 0.34, + "grad_norm": 1.3731229309294917, + "learning_rate": 7.663028985465601e-06, + "loss": 0.3474, + "step": 11777 + }, + { + "epoch": 0.34, + "grad_norm": 1.3411514901551091, + "learning_rate": 7.662631430334308e-06, + "loss": 0.3307, + "step": 11778 + }, + { + "epoch": 0.34, + "grad_norm": 1.2766261494487392, + "learning_rate": 7.662233851705264e-06, + "loss": 0.3553, + "step": 11779 + }, + { + "epoch": 0.34, + "grad_norm": 1.2780887362197455, + "learning_rate": 7.661836249581973e-06, + "loss": 0.3221, + "step": 11780 + }, + { + "epoch": 0.34, + "grad_norm": 1.2760296003690237, + "learning_rate": 7.661438623967949e-06, + "loss": 0.3082, + "step": 11781 + }, + { + "epoch": 0.34, + "grad_norm": 1.590020554996691, + "learning_rate": 7.6610409748667e-06, + "loss": 0.3555, + "step": 11782 + }, + { + "epoch": 0.34, + "grad_norm": 1.2752711791565723, + "learning_rate": 7.660643302281736e-06, + "loss": 0.3339, + "step": 11783 + }, + { + "epoch": 0.34, + "grad_norm": 1.4716314718450727, + "learning_rate": 7.660245606216564e-06, + "loss": 0.353, + "step": 11784 + }, + { + "epoch": 0.34, + "grad_norm": 1.2787243682555558, + "learning_rate": 7.659847886674693e-06, + "loss": 0.3457, + "step": 11785 + }, + { + "epoch": 0.34, + "grad_norm": 1.532813869443174, + "learning_rate": 7.659450143659637e-06, + "loss": 0.3494, + "step": 11786 + }, + { + "epoch": 0.34, + "grad_norm": 1.574245490673514, + "learning_rate": 7.659052377174906e-06, + "loss": 0.3254, + "step": 11787 + }, + { + "epoch": 0.34, + "grad_norm": 8.5516106894896, + "learning_rate": 7.658654587224006e-06, + "loss": 0.3533, + "step": 11788 + }, + { + "epoch": 0.34, + "grad_norm": 2.863953608998095, + "learning_rate": 7.65825677381045e-06, + "loss": 0.3571, + "step": 11789 + }, + { + "epoch": 0.34, + "grad_norm": 1.3512981671012199, + "learning_rate": 7.657858936937748e-06, + "loss": 0.322, + "step": 11790 + }, + { + "epoch": 0.34, + "grad_norm": 1.6445959104085945, + "learning_rate": 7.657461076609411e-06, + "loss": 0.3409, + "step": 11791 + }, + { + "epoch": 0.34, + "grad_norm": 1.3148457417598247, + "learning_rate": 7.657063192828953e-06, + "loss": 0.3482, + "step": 11792 + }, + { + "epoch": 0.34, + "grad_norm": 1.4600268593575108, + "learning_rate": 7.65666528559988e-06, + "loss": 0.3295, + "step": 11793 + }, + { + "epoch": 0.34, + "grad_norm": 1.3482683459390163, + "learning_rate": 7.656267354925709e-06, + "loss": 0.3497, + "step": 11794 + }, + { + "epoch": 0.34, + "grad_norm": 1.4044103700767392, + "learning_rate": 7.655869400809947e-06, + "loss": 0.3394, + "step": 11795 + }, + { + "epoch": 0.34, + "grad_norm": 1.5843804249146818, + "learning_rate": 7.65547142325611e-06, + "loss": 0.329, + "step": 11796 + }, + { + "epoch": 0.34, + "grad_norm": 1.0156510382226955, + "learning_rate": 7.655073422267707e-06, + "loss": 0.6396, + "step": 11797 + }, + { + "epoch": 0.34, + "grad_norm": 1.4456622274294784, + "learning_rate": 7.65467539784825e-06, + "loss": 0.3526, + "step": 11798 + }, + { + "epoch": 0.34, + "grad_norm": 1.232096369807147, + "learning_rate": 7.654277350001255e-06, + "loss": 0.3523, + "step": 11799 + }, + { + "epoch": 0.34, + "grad_norm": 1.3932190501113044, + "learning_rate": 7.653879278730232e-06, + "loss": 0.3409, + "step": 11800 + }, + { + "epoch": 0.34, + "grad_norm": 1.444926187647836, + "learning_rate": 7.653481184038698e-06, + "loss": 0.3537, + "step": 11801 + }, + { + "epoch": 0.34, + "grad_norm": 1.6503570956144493, + "learning_rate": 7.653083065930161e-06, + "loss": 0.3463, + "step": 11802 + }, + { + "epoch": 0.34, + "grad_norm": 1.2819097155027928, + "learning_rate": 7.652684924408136e-06, + "loss": 0.3496, + "step": 11803 + }, + { + "epoch": 0.34, + "grad_norm": 1.5475647262379704, + "learning_rate": 7.652286759476137e-06, + "loss": 0.3509, + "step": 11804 + }, + { + "epoch": 0.34, + "grad_norm": 1.3564998550951484, + "learning_rate": 7.651888571137678e-06, + "loss": 0.3247, + "step": 11805 + }, + { + "epoch": 0.34, + "grad_norm": 1.2999575451779963, + "learning_rate": 7.651490359396275e-06, + "loss": 0.3562, + "step": 11806 + }, + { + "epoch": 0.34, + "grad_norm": 1.334911222167986, + "learning_rate": 7.651092124255437e-06, + "loss": 0.3222, + "step": 11807 + }, + { + "epoch": 0.34, + "grad_norm": 1.1821192722025242, + "learning_rate": 7.650693865718684e-06, + "loss": 0.3454, + "step": 11808 + }, + { + "epoch": 0.34, + "grad_norm": 1.6966380720154406, + "learning_rate": 7.650295583789527e-06, + "loss": 0.3395, + "step": 11809 + }, + { + "epoch": 0.34, + "grad_norm": 1.5100918752862664, + "learning_rate": 7.649897278471481e-06, + "loss": 0.3329, + "step": 11810 + }, + { + "epoch": 0.34, + "grad_norm": 1.0025811242855687, + "learning_rate": 7.649498949768064e-06, + "loss": 0.604, + "step": 11811 + }, + { + "epoch": 0.34, + "grad_norm": 1.3233054600789194, + "learning_rate": 7.64910059768279e-06, + "loss": 0.3429, + "step": 11812 + }, + { + "epoch": 0.34, + "grad_norm": 1.6658122034403882, + "learning_rate": 7.64870222221917e-06, + "loss": 0.3642, + "step": 11813 + }, + { + "epoch": 0.34, + "grad_norm": 1.37996949712813, + "learning_rate": 7.648303823380727e-06, + "loss": 0.3442, + "step": 11814 + }, + { + "epoch": 0.34, + "grad_norm": 1.3786432348447781, + "learning_rate": 7.647905401170974e-06, + "loss": 0.3443, + "step": 11815 + }, + { + "epoch": 0.34, + "grad_norm": 1.6059382976681797, + "learning_rate": 7.647506955593425e-06, + "loss": 0.3505, + "step": 11816 + }, + { + "epoch": 0.34, + "grad_norm": 1.808363996416566, + "learning_rate": 7.647108486651598e-06, + "loss": 0.3173, + "step": 11817 + }, + { + "epoch": 0.34, + "grad_norm": 1.437108418998759, + "learning_rate": 7.64670999434901e-06, + "loss": 0.3617, + "step": 11818 + }, + { + "epoch": 0.34, + "grad_norm": 1.5405532012932772, + "learning_rate": 7.646311478689174e-06, + "loss": 0.3234, + "step": 11819 + }, + { + "epoch": 0.34, + "grad_norm": 1.3069486581656493, + "learning_rate": 7.645912939675612e-06, + "loss": 0.3491, + "step": 11820 + }, + { + "epoch": 0.34, + "grad_norm": 1.475662292752165, + "learning_rate": 7.64551437731184e-06, + "loss": 0.3431, + "step": 11821 + }, + { + "epoch": 0.34, + "grad_norm": 0.9714282698792918, + "learning_rate": 7.645115791601371e-06, + "loss": 0.5846, + "step": 11822 + }, + { + "epoch": 0.34, + "grad_norm": 1.9405018289034903, + "learning_rate": 7.64471718254773e-06, + "loss": 0.3248, + "step": 11823 + }, + { + "epoch": 0.34, + "grad_norm": 1.2724977494697818, + "learning_rate": 7.644318550154428e-06, + "loss": 0.3329, + "step": 11824 + }, + { + "epoch": 0.34, + "grad_norm": 1.2670940556600683, + "learning_rate": 7.643919894424985e-06, + "loss": 0.3534, + "step": 11825 + }, + { + "epoch": 0.34, + "grad_norm": 1.417281615617481, + "learning_rate": 7.64352121536292e-06, + "loss": 0.3812, + "step": 11826 + }, + { + "epoch": 0.34, + "grad_norm": 1.4038704739753733, + "learning_rate": 7.643122512971752e-06, + "loss": 0.3648, + "step": 11827 + }, + { + "epoch": 0.34, + "grad_norm": 1.6664633442764059, + "learning_rate": 7.642723787254998e-06, + "loss": 0.3229, + "step": 11828 + }, + { + "epoch": 0.34, + "grad_norm": 1.4358166912082693, + "learning_rate": 7.642325038216178e-06, + "loss": 0.3728, + "step": 11829 + }, + { + "epoch": 0.34, + "grad_norm": 1.6969996504863738, + "learning_rate": 7.641926265858807e-06, + "loss": 0.3278, + "step": 11830 + }, + { + "epoch": 0.34, + "grad_norm": 1.7732856868491358, + "learning_rate": 7.64152747018641e-06, + "loss": 0.3741, + "step": 11831 + }, + { + "epoch": 0.34, + "grad_norm": 1.4178829406138396, + "learning_rate": 7.641128651202505e-06, + "loss": 0.3237, + "step": 11832 + }, + { + "epoch": 0.34, + "grad_norm": 1.287832930419046, + "learning_rate": 7.640729808910609e-06, + "loss": 0.3312, + "step": 11833 + }, + { + "epoch": 0.34, + "grad_norm": 1.5313575806307855, + "learning_rate": 7.640330943314243e-06, + "loss": 0.3215, + "step": 11834 + }, + { + "epoch": 0.34, + "grad_norm": 1.8920811495005536, + "learning_rate": 7.639932054416927e-06, + "loss": 0.3277, + "step": 11835 + }, + { + "epoch": 0.34, + "grad_norm": 1.692922186938723, + "learning_rate": 7.639533142222182e-06, + "loss": 0.3337, + "step": 11836 + }, + { + "epoch": 0.34, + "grad_norm": 1.243626081620986, + "learning_rate": 7.639134206733528e-06, + "loss": 0.3562, + "step": 11837 + }, + { + "epoch": 0.34, + "grad_norm": 1.3085541062905757, + "learning_rate": 7.638735247954483e-06, + "loss": 0.3289, + "step": 11838 + }, + { + "epoch": 0.34, + "grad_norm": 1.240883699133277, + "learning_rate": 7.638336265888573e-06, + "loss": 0.3161, + "step": 11839 + }, + { + "epoch": 0.34, + "grad_norm": 1.2566501812706456, + "learning_rate": 7.637937260539315e-06, + "loss": 0.36, + "step": 11840 + }, + { + "epoch": 0.34, + "grad_norm": 1.2818731925717242, + "learning_rate": 7.63753823191023e-06, + "loss": 0.3281, + "step": 11841 + }, + { + "epoch": 0.34, + "grad_norm": 1.2601456938881566, + "learning_rate": 7.637139180004842e-06, + "loss": 0.3292, + "step": 11842 + }, + { + "epoch": 0.34, + "grad_norm": 1.414140810874942, + "learning_rate": 7.636740104826672e-06, + "loss": 0.3394, + "step": 11843 + }, + { + "epoch": 0.34, + "grad_norm": 1.6327125166596335, + "learning_rate": 7.636341006379241e-06, + "loss": 0.3349, + "step": 11844 + }, + { + "epoch": 0.34, + "grad_norm": 1.4463604647798805, + "learning_rate": 7.635941884666072e-06, + "loss": 0.3461, + "step": 11845 + }, + { + "epoch": 0.34, + "grad_norm": 0.9381510373223069, + "learning_rate": 7.635542739690684e-06, + "loss": 0.6085, + "step": 11846 + }, + { + "epoch": 0.34, + "grad_norm": 1.2468533125911903, + "learning_rate": 7.635143571456604e-06, + "loss": 0.3285, + "step": 11847 + }, + { + "epoch": 0.34, + "grad_norm": 1.3278002875351165, + "learning_rate": 7.634744379967354e-06, + "loss": 0.371, + "step": 11848 + }, + { + "epoch": 0.34, + "grad_norm": 1.3043254408154434, + "learning_rate": 7.634345165226455e-06, + "loss": 0.3174, + "step": 11849 + }, + { + "epoch": 0.34, + "grad_norm": 1.551730218268998, + "learning_rate": 7.63394592723743e-06, + "loss": 0.3478, + "step": 11850 + }, + { + "epoch": 0.34, + "grad_norm": 1.3246371977038764, + "learning_rate": 7.633546666003802e-06, + "loss": 0.3529, + "step": 11851 + }, + { + "epoch": 0.34, + "grad_norm": 1.2782858295382555, + "learning_rate": 7.633147381529098e-06, + "loss": 0.3372, + "step": 11852 + }, + { + "epoch": 0.34, + "grad_norm": 1.4992529596757456, + "learning_rate": 7.632748073816838e-06, + "loss": 0.3175, + "step": 11853 + }, + { + "epoch": 0.34, + "grad_norm": 1.3169261316745275, + "learning_rate": 7.632348742870545e-06, + "loss": 0.3196, + "step": 11854 + }, + { + "epoch": 0.34, + "grad_norm": 1.321459042151336, + "learning_rate": 7.631949388693748e-06, + "loss": 0.359, + "step": 11855 + }, + { + "epoch": 0.34, + "grad_norm": 1.3049570202715495, + "learning_rate": 7.631550011289968e-06, + "loss": 0.3387, + "step": 11856 + }, + { + "epoch": 0.34, + "grad_norm": 1.3384714397849267, + "learning_rate": 7.631150610662729e-06, + "loss": 0.3471, + "step": 11857 + }, + { + "epoch": 0.34, + "grad_norm": 1.4162322335720874, + "learning_rate": 7.630751186815557e-06, + "loss": 0.3277, + "step": 11858 + }, + { + "epoch": 0.34, + "grad_norm": 1.3291628458699725, + "learning_rate": 7.630351739751976e-06, + "loss": 0.334, + "step": 11859 + }, + { + "epoch": 0.34, + "grad_norm": 1.3175831348096108, + "learning_rate": 7.629952269475514e-06, + "loss": 0.3328, + "step": 11860 + }, + { + "epoch": 0.34, + "grad_norm": 1.6632507484289085, + "learning_rate": 7.629552775989691e-06, + "loss": 0.339, + "step": 11861 + }, + { + "epoch": 0.34, + "grad_norm": 1.2788587945596015, + "learning_rate": 7.629153259298037e-06, + "loss": 0.3421, + "step": 11862 + }, + { + "epoch": 0.34, + "grad_norm": 1.3631869756947832, + "learning_rate": 7.628753719404076e-06, + "loss": 0.3392, + "step": 11863 + }, + { + "epoch": 0.34, + "grad_norm": 1.3480935490259622, + "learning_rate": 7.628354156311335e-06, + "loss": 0.3301, + "step": 11864 + }, + { + "epoch": 0.34, + "grad_norm": 1.3090235921215114, + "learning_rate": 7.627954570023338e-06, + "loss": 0.3342, + "step": 11865 + }, + { + "epoch": 0.34, + "grad_norm": 1.3399174279516815, + "learning_rate": 7.627554960543615e-06, + "loss": 0.3571, + "step": 11866 + }, + { + "epoch": 0.34, + "grad_norm": 1.4960655996016474, + "learning_rate": 7.627155327875688e-06, + "loss": 0.3269, + "step": 11867 + }, + { + "epoch": 0.34, + "grad_norm": 1.5893002996509717, + "learning_rate": 7.626755672023087e-06, + "loss": 0.3532, + "step": 11868 + }, + { + "epoch": 0.34, + "grad_norm": 1.3495819728991871, + "learning_rate": 7.626355992989338e-06, + "loss": 0.3292, + "step": 11869 + }, + { + "epoch": 0.34, + "grad_norm": 1.3025608765242436, + "learning_rate": 7.625956290777969e-06, + "loss": 0.3242, + "step": 11870 + }, + { + "epoch": 0.34, + "grad_norm": 1.2509360460732482, + "learning_rate": 7.625556565392506e-06, + "loss": 0.3161, + "step": 11871 + }, + { + "epoch": 0.34, + "grad_norm": 1.406758162123726, + "learning_rate": 7.625156816836477e-06, + "loss": 0.321, + "step": 11872 + }, + { + "epoch": 0.34, + "grad_norm": 1.2906614836413244, + "learning_rate": 7.6247570451134086e-06, + "loss": 0.3254, + "step": 11873 + }, + { + "epoch": 0.34, + "grad_norm": 0.9971385293362875, + "learning_rate": 7.624357250226833e-06, + "loss": 0.607, + "step": 11874 + }, + { + "epoch": 0.34, + "grad_norm": 1.6047505313744657, + "learning_rate": 7.623957432180275e-06, + "loss": 0.3386, + "step": 11875 + }, + { + "epoch": 0.34, + "grad_norm": 1.2672550780780054, + "learning_rate": 7.623557590977263e-06, + "loss": 0.3918, + "step": 11876 + }, + { + "epoch": 0.34, + "grad_norm": 0.9647807565303946, + "learning_rate": 7.623157726621325e-06, + "loss": 0.6032, + "step": 11877 + }, + { + "epoch": 0.34, + "grad_norm": 1.3152904989042744, + "learning_rate": 7.622757839115993e-06, + "loss": 0.3167, + "step": 11878 + }, + { + "epoch": 0.34, + "grad_norm": 1.4975215793886745, + "learning_rate": 7.622357928464792e-06, + "loss": 0.3293, + "step": 11879 + }, + { + "epoch": 0.34, + "grad_norm": 1.2790508567134493, + "learning_rate": 7.621957994671254e-06, + "loss": 0.318, + "step": 11880 + }, + { + "epoch": 0.34, + "grad_norm": 1.475286558783772, + "learning_rate": 7.621558037738908e-06, + "loss": 0.3575, + "step": 11881 + }, + { + "epoch": 0.34, + "grad_norm": 1.325467222940143, + "learning_rate": 7.621158057671283e-06, + "loss": 0.345, + "step": 11882 + }, + { + "epoch": 0.34, + "grad_norm": 1.3834986116669414, + "learning_rate": 7.620758054471909e-06, + "loss": 0.3292, + "step": 11883 + }, + { + "epoch": 0.34, + "grad_norm": 1.3863186276024897, + "learning_rate": 7.620358028144317e-06, + "loss": 0.3301, + "step": 11884 + }, + { + "epoch": 0.34, + "grad_norm": 1.2583227987870107, + "learning_rate": 7.619957978692034e-06, + "loss": 0.3423, + "step": 11885 + }, + { + "epoch": 0.34, + "grad_norm": 2.450672808649229, + "learning_rate": 7.619557906118594e-06, + "loss": 0.3597, + "step": 11886 + }, + { + "epoch": 0.34, + "grad_norm": 1.2775767110286946, + "learning_rate": 7.619157810427526e-06, + "loss": 0.3519, + "step": 11887 + }, + { + "epoch": 0.34, + "grad_norm": 1.3331902622488496, + "learning_rate": 7.618757691622362e-06, + "loss": 0.3384, + "step": 11888 + }, + { + "epoch": 0.34, + "grad_norm": 1.3524864312404388, + "learning_rate": 7.618357549706633e-06, + "loss": 0.3348, + "step": 11889 + }, + { + "epoch": 0.34, + "grad_norm": 1.2944169237778014, + "learning_rate": 7.617957384683869e-06, + "loss": 0.3414, + "step": 11890 + }, + { + "epoch": 0.34, + "grad_norm": 1.6058771409861174, + "learning_rate": 7.617557196557601e-06, + "loss": 0.3287, + "step": 11891 + }, + { + "epoch": 0.34, + "grad_norm": 1.625953217397217, + "learning_rate": 7.617156985331363e-06, + "loss": 0.3309, + "step": 11892 + }, + { + "epoch": 0.34, + "grad_norm": 1.4133343912756233, + "learning_rate": 7.616756751008686e-06, + "loss": 0.3494, + "step": 11893 + }, + { + "epoch": 0.34, + "grad_norm": 1.2472624034627444, + "learning_rate": 7.6163564935931e-06, + "loss": 0.334, + "step": 11894 + }, + { + "epoch": 0.35, + "grad_norm": 1.2765988734686822, + "learning_rate": 7.6159562130881404e-06, + "loss": 0.3503, + "step": 11895 + }, + { + "epoch": 0.35, + "grad_norm": 1.2226046715654206, + "learning_rate": 7.615555909497337e-06, + "loss": 0.3193, + "step": 11896 + }, + { + "epoch": 0.35, + "grad_norm": 1.2797435407912563, + "learning_rate": 7.615155582824225e-06, + "loss": 0.3672, + "step": 11897 + }, + { + "epoch": 0.35, + "grad_norm": 1.6606308628312125, + "learning_rate": 7.614755233072334e-06, + "loss": 0.3658, + "step": 11898 + }, + { + "epoch": 0.35, + "grad_norm": 1.469250361065378, + "learning_rate": 7.6143548602451996e-06, + "loss": 0.317, + "step": 11899 + }, + { + "epoch": 0.35, + "grad_norm": 1.233940465502504, + "learning_rate": 7.613954464346356e-06, + "loss": 0.3329, + "step": 11900 + }, + { + "epoch": 0.35, + "grad_norm": 1.2816661023628877, + "learning_rate": 7.613554045379333e-06, + "loss": 0.3681, + "step": 11901 + }, + { + "epoch": 0.35, + "grad_norm": 1.3718402402270076, + "learning_rate": 7.6131536033476695e-06, + "loss": 0.315, + "step": 11902 + }, + { + "epoch": 0.35, + "grad_norm": 1.3466766241013162, + "learning_rate": 7.612753138254894e-06, + "loss": 0.3658, + "step": 11903 + }, + { + "epoch": 0.35, + "grad_norm": 1.3877338219119715, + "learning_rate": 7.612352650104544e-06, + "loss": 0.3154, + "step": 11904 + }, + { + "epoch": 0.35, + "grad_norm": 1.4492272459615485, + "learning_rate": 7.611952138900154e-06, + "loss": 0.352, + "step": 11905 + }, + { + "epoch": 0.35, + "grad_norm": 1.539735976038923, + "learning_rate": 7.611551604645256e-06, + "loss": 0.3177, + "step": 11906 + }, + { + "epoch": 0.35, + "grad_norm": 1.6437011369347618, + "learning_rate": 7.6111510473433855e-06, + "loss": 0.3272, + "step": 11907 + }, + { + "epoch": 0.35, + "grad_norm": 1.3568820483623922, + "learning_rate": 7.610750466998081e-06, + "loss": 0.3504, + "step": 11908 + }, + { + "epoch": 0.35, + "grad_norm": 1.256790527938456, + "learning_rate": 7.6103498636128715e-06, + "loss": 0.3353, + "step": 11909 + }, + { + "epoch": 0.35, + "grad_norm": 1.2377227830544937, + "learning_rate": 7.609949237191296e-06, + "loss": 0.3212, + "step": 11910 + }, + { + "epoch": 0.35, + "grad_norm": 1.34232323350301, + "learning_rate": 7.609548587736891e-06, + "loss": 0.3249, + "step": 11911 + }, + { + "epoch": 0.35, + "grad_norm": 1.562787931329634, + "learning_rate": 7.609147915253189e-06, + "loss": 0.3643, + "step": 11912 + }, + { + "epoch": 0.35, + "grad_norm": 1.2878519165638662, + "learning_rate": 7.608747219743729e-06, + "loss": 0.3222, + "step": 11913 + }, + { + "epoch": 0.35, + "grad_norm": 1.3311770628504973, + "learning_rate": 7.608346501212045e-06, + "loss": 0.3385, + "step": 11914 + }, + { + "epoch": 0.35, + "grad_norm": 1.3472613630508292, + "learning_rate": 7.607945759661674e-06, + "loss": 0.3585, + "step": 11915 + }, + { + "epoch": 0.35, + "grad_norm": 1.4210226292601347, + "learning_rate": 7.607544995096155e-06, + "loss": 0.3298, + "step": 11916 + }, + { + "epoch": 0.35, + "grad_norm": 1.46411882654752, + "learning_rate": 7.60714420751902e-06, + "loss": 0.3194, + "step": 11917 + }, + { + "epoch": 0.35, + "grad_norm": 1.4073012719763658, + "learning_rate": 7.606743396933809e-06, + "loss": 0.3225, + "step": 11918 + }, + { + "epoch": 0.35, + "grad_norm": 1.3003325838013924, + "learning_rate": 7.606342563344059e-06, + "loss": 0.3234, + "step": 11919 + }, + { + "epoch": 0.35, + "grad_norm": 1.298478984254971, + "learning_rate": 7.6059417067533085e-06, + "loss": 0.3369, + "step": 11920 + }, + { + "epoch": 0.35, + "grad_norm": 1.519653402736026, + "learning_rate": 7.6055408271650895e-06, + "loss": 0.3557, + "step": 11921 + }, + { + "epoch": 0.35, + "grad_norm": 1.4020007634454579, + "learning_rate": 7.605139924582947e-06, + "loss": 0.3217, + "step": 11922 + }, + { + "epoch": 0.35, + "grad_norm": 1.5681460786731616, + "learning_rate": 7.604738999010415e-06, + "loss": 0.3359, + "step": 11923 + }, + { + "epoch": 0.35, + "grad_norm": 2.183917707452044, + "learning_rate": 7.604338050451032e-06, + "loss": 0.3418, + "step": 11924 + }, + { + "epoch": 0.35, + "grad_norm": 1.2971810306699745, + "learning_rate": 7.603937078908338e-06, + "loss": 0.3376, + "step": 11925 + }, + { + "epoch": 0.35, + "grad_norm": 1.4501866243529766, + "learning_rate": 7.603536084385869e-06, + "loss": 0.373, + "step": 11926 + }, + { + "epoch": 0.35, + "grad_norm": 1.699694892891665, + "learning_rate": 7.603135066887166e-06, + "loss": 0.3599, + "step": 11927 + }, + { + "epoch": 0.35, + "grad_norm": 3.956701780547351, + "learning_rate": 7.602734026415766e-06, + "loss": 0.3579, + "step": 11928 + }, + { + "epoch": 0.35, + "grad_norm": 2.7668474837252983, + "learning_rate": 7.602332962975212e-06, + "loss": 0.3314, + "step": 11929 + }, + { + "epoch": 0.35, + "grad_norm": 1.3547606215924668, + "learning_rate": 7.601931876569038e-06, + "loss": 0.3589, + "step": 11930 + }, + { + "epoch": 0.35, + "grad_norm": 1.5046764927460106, + "learning_rate": 7.601530767200787e-06, + "loss": 0.3052, + "step": 11931 + }, + { + "epoch": 0.35, + "grad_norm": 1.281945326988941, + "learning_rate": 7.601129634873998e-06, + "loss": 0.3361, + "step": 11932 + }, + { + "epoch": 0.35, + "grad_norm": 1.2702445451951838, + "learning_rate": 7.600728479592212e-06, + "loss": 0.3339, + "step": 11933 + }, + { + "epoch": 0.35, + "grad_norm": 1.9542609251742475, + "learning_rate": 7.60032730135897e-06, + "loss": 0.3349, + "step": 11934 + }, + { + "epoch": 0.35, + "grad_norm": 1.4955950539705791, + "learning_rate": 7.599926100177807e-06, + "loss": 0.3869, + "step": 11935 + }, + { + "epoch": 0.35, + "grad_norm": 1.5948973390990104, + "learning_rate": 7.5995248760522686e-06, + "loss": 0.3776, + "step": 11936 + }, + { + "epoch": 0.35, + "grad_norm": 1.710394879209886, + "learning_rate": 7.599123628985894e-06, + "loss": 0.3565, + "step": 11937 + }, + { + "epoch": 0.35, + "grad_norm": 1.3314888127559927, + "learning_rate": 7.598722358982227e-06, + "loss": 0.3456, + "step": 11938 + }, + { + "epoch": 0.35, + "grad_norm": 1.6694280350823285, + "learning_rate": 7.5983210660448035e-06, + "loss": 0.3676, + "step": 11939 + }, + { + "epoch": 0.35, + "grad_norm": 1.212249599550521, + "learning_rate": 7.597919750177168e-06, + "loss": 0.3503, + "step": 11940 + }, + { + "epoch": 0.35, + "grad_norm": 2.75051802397079, + "learning_rate": 7.597518411382863e-06, + "loss": 0.3328, + "step": 11941 + }, + { + "epoch": 0.35, + "grad_norm": 1.3118953028943805, + "learning_rate": 7.597117049665429e-06, + "loss": 0.3352, + "step": 11942 + }, + { + "epoch": 0.35, + "grad_norm": 1.3522721892716272, + "learning_rate": 7.596715665028409e-06, + "loss": 0.3339, + "step": 11943 + }, + { + "epoch": 0.35, + "grad_norm": 1.3088328621937049, + "learning_rate": 7.596314257475344e-06, + "loss": 0.3613, + "step": 11944 + }, + { + "epoch": 0.35, + "grad_norm": 1.561511345880347, + "learning_rate": 7.595912827009775e-06, + "loss": 0.3414, + "step": 11945 + }, + { + "epoch": 0.35, + "grad_norm": 1.4414733733581058, + "learning_rate": 7.595511373635249e-06, + "loss": 0.3251, + "step": 11946 + }, + { + "epoch": 0.35, + "grad_norm": 1.3043384360284374, + "learning_rate": 7.595109897355304e-06, + "loss": 0.3209, + "step": 11947 + }, + { + "epoch": 0.35, + "grad_norm": 1.7103068955049148, + "learning_rate": 7.594708398173488e-06, + "loss": 0.3045, + "step": 11948 + }, + { + "epoch": 0.35, + "grad_norm": 1.6648635296042913, + "learning_rate": 7.59430687609334e-06, + "loss": 0.3452, + "step": 11949 + }, + { + "epoch": 0.35, + "grad_norm": 1.2523148702302143, + "learning_rate": 7.593905331118405e-06, + "loss": 0.3336, + "step": 11950 + }, + { + "epoch": 0.35, + "grad_norm": 1.3620317069375232, + "learning_rate": 7.5935037632522265e-06, + "loss": 0.3515, + "step": 11951 + }, + { + "epoch": 0.35, + "grad_norm": 1.367362076012957, + "learning_rate": 7.59310217249835e-06, + "loss": 0.3392, + "step": 11952 + }, + { + "epoch": 0.35, + "grad_norm": 1.9829545352976787, + "learning_rate": 7.592700558860317e-06, + "loss": 0.3396, + "step": 11953 + }, + { + "epoch": 0.35, + "grad_norm": 1.4158608953302683, + "learning_rate": 7.592298922341672e-06, + "loss": 0.3642, + "step": 11954 + }, + { + "epoch": 0.35, + "grad_norm": 1.2866734880157784, + "learning_rate": 7.591897262945961e-06, + "loss": 0.3255, + "step": 11955 + }, + { + "epoch": 0.35, + "grad_norm": 1.2412173369933985, + "learning_rate": 7.591495580676729e-06, + "loss": 0.3292, + "step": 11956 + }, + { + "epoch": 0.35, + "grad_norm": 1.284718632455849, + "learning_rate": 7.591093875537519e-06, + "loss": 0.3273, + "step": 11957 + }, + { + "epoch": 0.35, + "grad_norm": 1.3548202142490975, + "learning_rate": 7.590692147531875e-06, + "loss": 0.3293, + "step": 11958 + }, + { + "epoch": 0.35, + "grad_norm": 1.3452674656839416, + "learning_rate": 7.590290396663346e-06, + "loss": 0.3522, + "step": 11959 + }, + { + "epoch": 0.35, + "grad_norm": 1.2802080895604664, + "learning_rate": 7.5898886229354754e-06, + "loss": 0.3388, + "step": 11960 + }, + { + "epoch": 0.35, + "grad_norm": 1.3093201149743992, + "learning_rate": 7.5894868263518086e-06, + "loss": 0.3605, + "step": 11961 + }, + { + "epoch": 0.35, + "grad_norm": 1.3129655230000785, + "learning_rate": 7.589085006915892e-06, + "loss": 0.3757, + "step": 11962 + }, + { + "epoch": 0.35, + "grad_norm": 1.2383104577366628, + "learning_rate": 7.588683164631272e-06, + "loss": 0.3277, + "step": 11963 + }, + { + "epoch": 0.35, + "grad_norm": 1.663764968070029, + "learning_rate": 7.588281299501493e-06, + "loss": 0.3487, + "step": 11964 + }, + { + "epoch": 0.35, + "grad_norm": 1.2929660736627613, + "learning_rate": 7.5878794115301035e-06, + "loss": 0.346, + "step": 11965 + }, + { + "epoch": 0.35, + "grad_norm": 1.246274717355408, + "learning_rate": 7.5874775007206504e-06, + "loss": 0.3218, + "step": 11966 + }, + { + "epoch": 0.35, + "grad_norm": 1.3185757989939313, + "learning_rate": 7.587075567076678e-06, + "loss": 0.3688, + "step": 11967 + }, + { + "epoch": 0.35, + "grad_norm": 1.2248306783382097, + "learning_rate": 7.586673610601736e-06, + "loss": 0.3425, + "step": 11968 + }, + { + "epoch": 0.35, + "grad_norm": 0.9677906027323135, + "learning_rate": 7.586271631299371e-06, + "loss": 0.6355, + "step": 11969 + }, + { + "epoch": 0.35, + "grad_norm": 1.2860718098308732, + "learning_rate": 7.5858696291731305e-06, + "loss": 0.3275, + "step": 11970 + }, + { + "epoch": 0.35, + "grad_norm": 1.6447203706832787, + "learning_rate": 7.5854676042265596e-06, + "loss": 0.3801, + "step": 11971 + }, + { + "epoch": 0.35, + "grad_norm": 1.3458311046214357, + "learning_rate": 7.5850655564632106e-06, + "loss": 0.3411, + "step": 11972 + }, + { + "epoch": 0.35, + "grad_norm": 1.336143340561403, + "learning_rate": 7.5846634858866275e-06, + "loss": 0.3365, + "step": 11973 + }, + { + "epoch": 0.35, + "grad_norm": 1.2676538445428338, + "learning_rate": 7.584261392500363e-06, + "loss": 0.3256, + "step": 11974 + }, + { + "epoch": 0.35, + "grad_norm": 1.3131334970553852, + "learning_rate": 7.583859276307962e-06, + "loss": 0.3532, + "step": 11975 + }, + { + "epoch": 0.35, + "grad_norm": 2.7755490533262437, + "learning_rate": 7.583457137312974e-06, + "loss": 0.3196, + "step": 11976 + }, + { + "epoch": 0.35, + "grad_norm": 1.3904327418824918, + "learning_rate": 7.5830549755189476e-06, + "loss": 0.3478, + "step": 11977 + }, + { + "epoch": 0.35, + "grad_norm": 1.3763628300466066, + "learning_rate": 7.582652790929433e-06, + "loss": 0.3322, + "step": 11978 + }, + { + "epoch": 0.35, + "grad_norm": 1.582789010816138, + "learning_rate": 7.582250583547978e-06, + "loss": 0.3387, + "step": 11979 + }, + { + "epoch": 0.35, + "grad_norm": 0.9534739559451901, + "learning_rate": 7.581848353378134e-06, + "loss": 0.6481, + "step": 11980 + }, + { + "epoch": 0.35, + "grad_norm": 1.4378570287667363, + "learning_rate": 7.581446100423449e-06, + "loss": 0.3347, + "step": 11981 + }, + { + "epoch": 0.35, + "grad_norm": 1.2601885764303133, + "learning_rate": 7.581043824687473e-06, + "loss": 0.3171, + "step": 11982 + }, + { + "epoch": 0.35, + "grad_norm": 1.3196649028221534, + "learning_rate": 7.580641526173758e-06, + "loss": 0.3334, + "step": 11983 + }, + { + "epoch": 0.35, + "grad_norm": 1.7214488306742972, + "learning_rate": 7.580239204885853e-06, + "loss": 0.3382, + "step": 11984 + }, + { + "epoch": 0.35, + "grad_norm": 1.2407776955663148, + "learning_rate": 7.579836860827307e-06, + "loss": 0.3228, + "step": 11985 + }, + { + "epoch": 0.35, + "grad_norm": 1.3445523229478777, + "learning_rate": 7.579434494001672e-06, + "loss": 0.3382, + "step": 11986 + }, + { + "epoch": 0.35, + "grad_norm": 1.6534709906197322, + "learning_rate": 7.5790321044125e-06, + "loss": 0.3781, + "step": 11987 + }, + { + "epoch": 0.35, + "grad_norm": 1.430752457730092, + "learning_rate": 7.5786296920633414e-06, + "loss": 0.3229, + "step": 11988 + }, + { + "epoch": 0.35, + "grad_norm": 1.2158082621861397, + "learning_rate": 7.578227256957746e-06, + "loss": 0.3074, + "step": 11989 + }, + { + "epoch": 0.35, + "grad_norm": 1.3609568081497507, + "learning_rate": 7.577824799099268e-06, + "loss": 0.3447, + "step": 11990 + }, + { + "epoch": 0.35, + "grad_norm": 1.370758989227989, + "learning_rate": 7.5774223184914565e-06, + "loss": 0.3534, + "step": 11991 + }, + { + "epoch": 0.35, + "grad_norm": 2.0102952961496117, + "learning_rate": 7.577019815137864e-06, + "loss": 0.3392, + "step": 11992 + }, + { + "epoch": 0.35, + "grad_norm": 1.329389772019505, + "learning_rate": 7.576617289042044e-06, + "loss": 0.3344, + "step": 11993 + }, + { + "epoch": 0.35, + "grad_norm": 1.477252499648012, + "learning_rate": 7.576214740207548e-06, + "loss": 0.3533, + "step": 11994 + }, + { + "epoch": 0.35, + "grad_norm": 1.173167713473143, + "learning_rate": 7.575812168637926e-06, + "loss": 0.3105, + "step": 11995 + }, + { + "epoch": 0.35, + "grad_norm": 1.491325174490624, + "learning_rate": 7.575409574336735e-06, + "loss": 0.3606, + "step": 11996 + }, + { + "epoch": 0.35, + "grad_norm": 1.2783117288432961, + "learning_rate": 7.5750069573075255e-06, + "loss": 0.3458, + "step": 11997 + }, + { + "epoch": 0.35, + "grad_norm": 1.8719906965700521, + "learning_rate": 7.574604317553852e-06, + "loss": 0.3458, + "step": 11998 + }, + { + "epoch": 0.35, + "grad_norm": 1.3788222962007812, + "learning_rate": 7.574201655079265e-06, + "loss": 0.3157, + "step": 11999 + }, + { + "epoch": 0.35, + "grad_norm": 1.3862245782419043, + "learning_rate": 7.573798969887321e-06, + "loss": 0.3492, + "step": 12000 + }, + { + "epoch": 0.35, + "grad_norm": 1.4659171576589418, + "learning_rate": 7.573396261981572e-06, + "loss": 0.3675, + "step": 12001 + }, + { + "epoch": 0.35, + "grad_norm": 1.2679623069829633, + "learning_rate": 7.572993531365575e-06, + "loss": 0.3126, + "step": 12002 + }, + { + "epoch": 0.35, + "grad_norm": 1.5197994786538729, + "learning_rate": 7.57259077804288e-06, + "loss": 0.3313, + "step": 12003 + }, + { + "epoch": 0.35, + "grad_norm": 1.70531674529176, + "learning_rate": 7.572188002017041e-06, + "loss": 0.3271, + "step": 12004 + }, + { + "epoch": 0.35, + "grad_norm": 1.250813204087029, + "learning_rate": 7.571785203291616e-06, + "loss": 0.3164, + "step": 12005 + }, + { + "epoch": 0.35, + "grad_norm": 1.6472736908143057, + "learning_rate": 7.571382381870157e-06, + "loss": 0.352, + "step": 12006 + }, + { + "epoch": 0.35, + "grad_norm": 1.4131714726019453, + "learning_rate": 7.570979537756222e-06, + "loss": 0.3435, + "step": 12007 + }, + { + "epoch": 0.35, + "grad_norm": 1.259618227979244, + "learning_rate": 7.570576670953362e-06, + "loss": 0.3265, + "step": 12008 + }, + { + "epoch": 0.35, + "grad_norm": 1.247513065889257, + "learning_rate": 7.570173781465136e-06, + "loss": 0.3271, + "step": 12009 + }, + { + "epoch": 0.35, + "grad_norm": 1.3017548907306595, + "learning_rate": 7.569770869295095e-06, + "loss": 0.3377, + "step": 12010 + }, + { + "epoch": 0.35, + "grad_norm": 1.7880679605044671, + "learning_rate": 7.5693679344468e-06, + "loss": 0.3305, + "step": 12011 + }, + { + "epoch": 0.35, + "grad_norm": 1.318147083405997, + "learning_rate": 7.568964976923805e-06, + "loss": 0.3369, + "step": 12012 + }, + { + "epoch": 0.35, + "grad_norm": 1.2503259332973446, + "learning_rate": 7.568561996729664e-06, + "loss": 0.3332, + "step": 12013 + }, + { + "epoch": 0.35, + "grad_norm": 1.276148967760115, + "learning_rate": 7.568158993867936e-06, + "loss": 0.3546, + "step": 12014 + }, + { + "epoch": 0.35, + "grad_norm": 1.2764473324169463, + "learning_rate": 7.567755968342174e-06, + "loss": 0.4127, + "step": 12015 + }, + { + "epoch": 0.35, + "grad_norm": 1.5118306854173065, + "learning_rate": 7.567352920155939e-06, + "loss": 0.3318, + "step": 12016 + }, + { + "epoch": 0.35, + "grad_norm": 1.5211239111555723, + "learning_rate": 7.566949849312788e-06, + "loss": 0.3368, + "step": 12017 + }, + { + "epoch": 0.35, + "grad_norm": 1.5211008433136615, + "learning_rate": 7.566546755816273e-06, + "loss": 0.3628, + "step": 12018 + }, + { + "epoch": 0.35, + "grad_norm": 1.451749898671403, + "learning_rate": 7.566143639669956e-06, + "loss": 0.3381, + "step": 12019 + }, + { + "epoch": 0.35, + "grad_norm": 2.144069042313712, + "learning_rate": 7.5657405008773925e-06, + "loss": 0.3652, + "step": 12020 + }, + { + "epoch": 0.35, + "grad_norm": 1.3480279831502722, + "learning_rate": 7.5653373394421404e-06, + "loss": 0.3251, + "step": 12021 + }, + { + "epoch": 0.35, + "grad_norm": 1.2718256667475494, + "learning_rate": 7.564934155367758e-06, + "loss": 0.3403, + "step": 12022 + }, + { + "epoch": 0.35, + "grad_norm": 1.3463497051460287, + "learning_rate": 7.564530948657803e-06, + "loss": 0.3731, + "step": 12023 + }, + { + "epoch": 0.35, + "grad_norm": 1.5839595959854627, + "learning_rate": 7.564127719315833e-06, + "loss": 0.3275, + "step": 12024 + }, + { + "epoch": 0.35, + "grad_norm": 1.447882346539864, + "learning_rate": 7.563724467345409e-06, + "loss": 0.3279, + "step": 12025 + }, + { + "epoch": 0.35, + "grad_norm": 1.8123678628390267, + "learning_rate": 7.563321192750088e-06, + "loss": 0.3432, + "step": 12026 + }, + { + "epoch": 0.35, + "grad_norm": 1.2314533601534527, + "learning_rate": 7.562917895533428e-06, + "loss": 0.3348, + "step": 12027 + }, + { + "epoch": 0.35, + "grad_norm": 0.9474688556203114, + "learning_rate": 7.5625145756989894e-06, + "loss": 0.6483, + "step": 12028 + }, + { + "epoch": 0.35, + "grad_norm": 1.2613669432839865, + "learning_rate": 7.5621112332503325e-06, + "loss": 0.3264, + "step": 12029 + }, + { + "epoch": 0.35, + "grad_norm": 1.2634418427639431, + "learning_rate": 7.561707868191015e-06, + "loss": 0.3279, + "step": 12030 + }, + { + "epoch": 0.35, + "grad_norm": 1.3968402631289865, + "learning_rate": 7.561304480524596e-06, + "loss": 0.3401, + "step": 12031 + }, + { + "epoch": 0.35, + "grad_norm": 1.3128282044864703, + "learning_rate": 7.560901070254638e-06, + "loss": 0.394, + "step": 12032 + }, + { + "epoch": 0.35, + "grad_norm": 1.3617356271567826, + "learning_rate": 7.560497637384698e-06, + "loss": 0.3261, + "step": 12033 + }, + { + "epoch": 0.35, + "grad_norm": 1.244858044892589, + "learning_rate": 7.560094181918339e-06, + "loss": 0.3399, + "step": 12034 + }, + { + "epoch": 0.35, + "grad_norm": 1.417905036410997, + "learning_rate": 7.5596907038591214e-06, + "loss": 0.3332, + "step": 12035 + }, + { + "epoch": 0.35, + "grad_norm": 1.4147118739965536, + "learning_rate": 7.559287203210604e-06, + "loss": 0.3412, + "step": 12036 + }, + { + "epoch": 0.35, + "grad_norm": 1.2489112853638922, + "learning_rate": 7.558883679976347e-06, + "loss": 0.3407, + "step": 12037 + }, + { + "epoch": 0.35, + "grad_norm": 2.13988991470687, + "learning_rate": 7.5584801341599154e-06, + "loss": 0.3299, + "step": 12038 + }, + { + "epoch": 0.35, + "grad_norm": 1.4937467215164513, + "learning_rate": 7.558076565764868e-06, + "loss": 0.3329, + "step": 12039 + }, + { + "epoch": 0.35, + "grad_norm": 1.4879339167574803, + "learning_rate": 7.557672974794765e-06, + "loss": 0.3257, + "step": 12040 + }, + { + "epoch": 0.35, + "grad_norm": 1.8417814971443374, + "learning_rate": 7.557269361253172e-06, + "loss": 0.3678, + "step": 12041 + }, + { + "epoch": 0.35, + "grad_norm": 1.438817345370817, + "learning_rate": 7.556865725143646e-06, + "loss": 0.3577, + "step": 12042 + }, + { + "epoch": 0.35, + "grad_norm": 1.7859377662977554, + "learning_rate": 7.556462066469754e-06, + "loss": 0.3464, + "step": 12043 + }, + { + "epoch": 0.35, + "grad_norm": 1.4471247078239446, + "learning_rate": 7.556058385235055e-06, + "loss": 0.318, + "step": 12044 + }, + { + "epoch": 0.35, + "grad_norm": 1.510187138060877, + "learning_rate": 7.555654681443112e-06, + "loss": 0.3288, + "step": 12045 + }, + { + "epoch": 0.35, + "grad_norm": 1.727058356098661, + "learning_rate": 7.555250955097489e-06, + "loss": 0.3399, + "step": 12046 + }, + { + "epoch": 0.35, + "grad_norm": 1.2905194410778422, + "learning_rate": 7.554847206201745e-06, + "loss": 0.3317, + "step": 12047 + }, + { + "epoch": 0.35, + "grad_norm": 1.2881719526999382, + "learning_rate": 7.55444343475945e-06, + "loss": 0.3511, + "step": 12048 + }, + { + "epoch": 0.35, + "grad_norm": 1.3092547923214206, + "learning_rate": 7.554039640774161e-06, + "loss": 0.3204, + "step": 12049 + }, + { + "epoch": 0.35, + "grad_norm": 1.7709324974547322, + "learning_rate": 7.553635824249445e-06, + "loss": 0.3478, + "step": 12050 + }, + { + "epoch": 0.35, + "grad_norm": 1.3018797159630202, + "learning_rate": 7.553231985188862e-06, + "loss": 0.3376, + "step": 12051 + }, + { + "epoch": 0.35, + "grad_norm": 1.2760375298688793, + "learning_rate": 7.552828123595981e-06, + "loss": 0.3194, + "step": 12052 + }, + { + "epoch": 0.35, + "grad_norm": 1.3042779019482615, + "learning_rate": 7.552424239474364e-06, + "loss": 0.3347, + "step": 12053 + }, + { + "epoch": 0.35, + "grad_norm": 1.2593093486920555, + "learning_rate": 7.5520203328275736e-06, + "loss": 0.3437, + "step": 12054 + }, + { + "epoch": 0.35, + "grad_norm": 1.441336651098294, + "learning_rate": 7.551616403659175e-06, + "loss": 0.3417, + "step": 12055 + }, + { + "epoch": 0.35, + "grad_norm": 2.0261949622072053, + "learning_rate": 7.551212451972734e-06, + "loss": 0.355, + "step": 12056 + }, + { + "epoch": 0.35, + "grad_norm": 1.3137196884743103, + "learning_rate": 7.550808477771815e-06, + "loss": 0.3383, + "step": 12057 + }, + { + "epoch": 0.35, + "grad_norm": 1.4154548748023303, + "learning_rate": 7.550404481059982e-06, + "loss": 0.3362, + "step": 12058 + }, + { + "epoch": 0.35, + "grad_norm": 1.40695139387801, + "learning_rate": 7.550000461840803e-06, + "loss": 0.3474, + "step": 12059 + }, + { + "epoch": 0.35, + "grad_norm": 1.381887577767152, + "learning_rate": 7.54959642011784e-06, + "loss": 0.3342, + "step": 12060 + }, + { + "epoch": 0.35, + "grad_norm": 1.237659105829993, + "learning_rate": 7.549192355894659e-06, + "loss": 0.3221, + "step": 12061 + }, + { + "epoch": 0.35, + "grad_norm": 1.3213308109032273, + "learning_rate": 7.5487882691748295e-06, + "loss": 0.3361, + "step": 12062 + }, + { + "epoch": 0.35, + "grad_norm": 1.3534185381659747, + "learning_rate": 7.548384159961915e-06, + "loss": 0.3415, + "step": 12063 + }, + { + "epoch": 0.35, + "grad_norm": 1.3934808714576603, + "learning_rate": 7.5479800282594805e-06, + "loss": 0.3563, + "step": 12064 + }, + { + "epoch": 0.35, + "grad_norm": 1.4207888154532242, + "learning_rate": 7.547575874071096e-06, + "loss": 0.3543, + "step": 12065 + }, + { + "epoch": 0.35, + "grad_norm": 1.2886904376470474, + "learning_rate": 7.547171697400326e-06, + "loss": 0.3494, + "step": 12066 + }, + { + "epoch": 0.35, + "grad_norm": 1.4775532560340878, + "learning_rate": 7.546767498250735e-06, + "loss": 0.3302, + "step": 12067 + }, + { + "epoch": 0.35, + "grad_norm": 1.311007941937263, + "learning_rate": 7.546363276625894e-06, + "loss": 0.336, + "step": 12068 + }, + { + "epoch": 0.35, + "grad_norm": 1.724551176879987, + "learning_rate": 7.54595903252937e-06, + "loss": 0.305, + "step": 12069 + }, + { + "epoch": 0.35, + "grad_norm": 3.5313297040992486, + "learning_rate": 7.545554765964728e-06, + "loss": 0.3484, + "step": 12070 + }, + { + "epoch": 0.35, + "grad_norm": 1.3744209274418984, + "learning_rate": 7.545150476935537e-06, + "loss": 0.3257, + "step": 12071 + }, + { + "epoch": 0.35, + "grad_norm": 1.3007295756581552, + "learning_rate": 7.544746165445366e-06, + "loss": 0.3438, + "step": 12072 + }, + { + "epoch": 0.35, + "grad_norm": 1.3576848399257277, + "learning_rate": 7.5443418314977815e-06, + "loss": 0.3469, + "step": 12073 + }, + { + "epoch": 0.35, + "grad_norm": 1.4275531351978206, + "learning_rate": 7.543937475096351e-06, + "loss": 0.3435, + "step": 12074 + }, + { + "epoch": 0.35, + "grad_norm": 1.411444951605265, + "learning_rate": 7.543533096244644e-06, + "loss": 0.3518, + "step": 12075 + }, + { + "epoch": 0.35, + "grad_norm": 1.1513576354790127, + "learning_rate": 7.5431286949462315e-06, + "loss": 0.3095, + "step": 12076 + }, + { + "epoch": 0.35, + "grad_norm": 1.6418613840245204, + "learning_rate": 7.542724271204678e-06, + "loss": 0.3372, + "step": 12077 + }, + { + "epoch": 0.35, + "grad_norm": 1.6051179668919542, + "learning_rate": 7.542319825023555e-06, + "loss": 0.3283, + "step": 12078 + }, + { + "epoch": 0.35, + "grad_norm": 1.3942599820150794, + "learning_rate": 7.541915356406431e-06, + "loss": 0.3407, + "step": 12079 + }, + { + "epoch": 0.35, + "grad_norm": 1.2828935706555455, + "learning_rate": 7.541510865356877e-06, + "loss": 0.3247, + "step": 12080 + }, + { + "epoch": 0.35, + "grad_norm": 1.3439974069859248, + "learning_rate": 7.54110635187846e-06, + "loss": 0.3541, + "step": 12081 + }, + { + "epoch": 0.35, + "grad_norm": 1.3218102760370956, + "learning_rate": 7.540701815974751e-06, + "loss": 0.3291, + "step": 12082 + }, + { + "epoch": 0.35, + "grad_norm": 1.3558720390840626, + "learning_rate": 7.540297257649322e-06, + "loss": 0.3458, + "step": 12083 + }, + { + "epoch": 0.35, + "grad_norm": 1.40516535900134, + "learning_rate": 7.539892676905741e-06, + "loss": 0.3653, + "step": 12084 + }, + { + "epoch": 0.35, + "grad_norm": 1.4404357247048132, + "learning_rate": 7.539488073747578e-06, + "loss": 0.3345, + "step": 12085 + }, + { + "epoch": 0.35, + "grad_norm": 1.5746802661767043, + "learning_rate": 7.539083448178406e-06, + "loss": 0.3616, + "step": 12086 + }, + { + "epoch": 0.35, + "grad_norm": 1.2694928344629697, + "learning_rate": 7.5386788002017926e-06, + "loss": 0.3371, + "step": 12087 + }, + { + "epoch": 0.35, + "grad_norm": 1.3912452336069858, + "learning_rate": 7.538274129821313e-06, + "loss": 0.3354, + "step": 12088 + }, + { + "epoch": 0.35, + "grad_norm": 1.3740805657509372, + "learning_rate": 7.537869437040535e-06, + "loss": 0.3271, + "step": 12089 + }, + { + "epoch": 0.35, + "grad_norm": 1.2864063050978223, + "learning_rate": 7.5374647218630305e-06, + "loss": 0.3237, + "step": 12090 + }, + { + "epoch": 0.35, + "grad_norm": 1.3803553136028137, + "learning_rate": 7.537059984292372e-06, + "loss": 0.34, + "step": 12091 + }, + { + "epoch": 0.35, + "grad_norm": 0.9900101170596877, + "learning_rate": 7.53665522433213e-06, + "loss": 0.6231, + "step": 12092 + }, + { + "epoch": 0.35, + "grad_norm": 0.9959422492511081, + "learning_rate": 7.536250441985878e-06, + "loss": 0.6103, + "step": 12093 + }, + { + "epoch": 0.35, + "grad_norm": 1.4004693031061264, + "learning_rate": 7.535845637257189e-06, + "loss": 0.3337, + "step": 12094 + }, + { + "epoch": 0.35, + "grad_norm": 1.3392745414744707, + "learning_rate": 7.535440810149632e-06, + "loss": 0.3441, + "step": 12095 + }, + { + "epoch": 0.35, + "grad_norm": 1.1957342983700585, + "learning_rate": 7.535035960666783e-06, + "loss": 0.3643, + "step": 12096 + }, + { + "epoch": 0.35, + "grad_norm": 1.5506853519372037, + "learning_rate": 7.534631088812214e-06, + "loss": 0.3218, + "step": 12097 + }, + { + "epoch": 0.35, + "grad_norm": 0.9537235766438269, + "learning_rate": 7.534226194589498e-06, + "loss": 0.5801, + "step": 12098 + }, + { + "epoch": 0.35, + "grad_norm": 1.6545800248620388, + "learning_rate": 7.533821278002207e-06, + "loss": 0.3845, + "step": 12099 + }, + { + "epoch": 0.35, + "grad_norm": 1.3627616416328043, + "learning_rate": 7.533416339053914e-06, + "loss": 0.3499, + "step": 12100 + }, + { + "epoch": 0.35, + "grad_norm": 1.2609450088197045, + "learning_rate": 7.533011377748195e-06, + "loss": 0.3553, + "step": 12101 + }, + { + "epoch": 0.35, + "grad_norm": 1.4474747083869721, + "learning_rate": 7.5326063940886215e-06, + "loss": 0.3368, + "step": 12102 + }, + { + "epoch": 0.35, + "grad_norm": 1.4741405478930192, + "learning_rate": 7.532201388078771e-06, + "loss": 0.3271, + "step": 12103 + }, + { + "epoch": 0.35, + "grad_norm": 1.3744075236978663, + "learning_rate": 7.531796359722212e-06, + "loss": 0.3442, + "step": 12104 + }, + { + "epoch": 0.35, + "grad_norm": 1.324269396274133, + "learning_rate": 7.531391309022524e-06, + "loss": 0.3618, + "step": 12105 + }, + { + "epoch": 0.35, + "grad_norm": 1.9723950814657034, + "learning_rate": 7.530986235983278e-06, + "loss": 0.3274, + "step": 12106 + }, + { + "epoch": 0.35, + "grad_norm": 1.2802056050017727, + "learning_rate": 7.530581140608051e-06, + "loss": 0.3129, + "step": 12107 + }, + { + "epoch": 0.35, + "grad_norm": 1.3889786685753482, + "learning_rate": 7.530176022900417e-06, + "loss": 0.3252, + "step": 12108 + }, + { + "epoch": 0.35, + "grad_norm": 1.3037347350264075, + "learning_rate": 7.529770882863953e-06, + "loss": 0.3399, + "step": 12109 + }, + { + "epoch": 0.35, + "grad_norm": 1.412331223172066, + "learning_rate": 7.529365720502231e-06, + "loss": 0.3115, + "step": 12110 + }, + { + "epoch": 0.35, + "grad_norm": 1.3743048875238184, + "learning_rate": 7.52896053581883e-06, + "loss": 0.3348, + "step": 12111 + }, + { + "epoch": 0.35, + "grad_norm": 1.327722207493832, + "learning_rate": 7.528555328817324e-06, + "loss": 0.335, + "step": 12112 + }, + { + "epoch": 0.35, + "grad_norm": 1.419075585059124, + "learning_rate": 7.528150099501288e-06, + "loss": 0.3278, + "step": 12113 + }, + { + "epoch": 0.35, + "grad_norm": 1.4140279752143712, + "learning_rate": 7.527744847874301e-06, + "loss": 0.3337, + "step": 12114 + }, + { + "epoch": 0.35, + "grad_norm": 1.36212928661232, + "learning_rate": 7.527339573939938e-06, + "loss": 0.341, + "step": 12115 + }, + { + "epoch": 0.35, + "grad_norm": 1.3399291874855097, + "learning_rate": 7.526934277701773e-06, + "loss": 0.3296, + "step": 12116 + }, + { + "epoch": 0.35, + "grad_norm": 1.7074797815806038, + "learning_rate": 7.526528959163386e-06, + "loss": 0.3371, + "step": 12117 + }, + { + "epoch": 0.35, + "grad_norm": 1.3153568943818683, + "learning_rate": 7.526123618328354e-06, + "loss": 0.3406, + "step": 12118 + }, + { + "epoch": 0.35, + "grad_norm": 1.3368219972375537, + "learning_rate": 7.525718255200252e-06, + "loss": 0.3211, + "step": 12119 + }, + { + "epoch": 0.35, + "grad_norm": 1.2713508148839887, + "learning_rate": 7.525312869782657e-06, + "loss": 0.3229, + "step": 12120 + }, + { + "epoch": 0.35, + "grad_norm": 1.3353349172846047, + "learning_rate": 7.524907462079149e-06, + "loss": 0.356, + "step": 12121 + }, + { + "epoch": 0.35, + "grad_norm": 1.278849586083833, + "learning_rate": 7.524502032093306e-06, + "loss": 0.3442, + "step": 12122 + }, + { + "epoch": 0.35, + "grad_norm": 2.3886780651709327, + "learning_rate": 7.524096579828703e-06, + "loss": 0.3293, + "step": 12123 + }, + { + "epoch": 0.35, + "grad_norm": 1.2206923717128049, + "learning_rate": 7.5236911052889194e-06, + "loss": 0.3138, + "step": 12124 + }, + { + "epoch": 0.35, + "grad_norm": 1.3632744199357976, + "learning_rate": 7.523285608477535e-06, + "loss": 0.3489, + "step": 12125 + }, + { + "epoch": 0.35, + "grad_norm": 1.3583951351182146, + "learning_rate": 7.522880089398128e-06, + "loss": 0.3402, + "step": 12126 + }, + { + "epoch": 0.35, + "grad_norm": 1.5181984740250536, + "learning_rate": 7.522474548054275e-06, + "loss": 0.3277, + "step": 12127 + }, + { + "epoch": 0.35, + "grad_norm": 1.424834522492307, + "learning_rate": 7.522068984449556e-06, + "loss": 0.3681, + "step": 12128 + }, + { + "epoch": 0.35, + "grad_norm": 1.5494396042873948, + "learning_rate": 7.521663398587549e-06, + "loss": 0.3366, + "step": 12129 + }, + { + "epoch": 0.35, + "grad_norm": 1.358470107136916, + "learning_rate": 7.521257790471836e-06, + "loss": 0.3482, + "step": 12130 + }, + { + "epoch": 0.35, + "grad_norm": 1.0907447750437214, + "learning_rate": 7.5208521601059955e-06, + "loss": 0.5905, + "step": 12131 + }, + { + "epoch": 0.35, + "grad_norm": 1.605713213126399, + "learning_rate": 7.520446507493606e-06, + "loss": 0.3446, + "step": 12132 + }, + { + "epoch": 0.35, + "grad_norm": 1.459688724629399, + "learning_rate": 7.520040832638248e-06, + "loss": 0.3629, + "step": 12133 + }, + { + "epoch": 0.35, + "grad_norm": 1.2695453733630486, + "learning_rate": 7.519635135543501e-06, + "loss": 0.3411, + "step": 12134 + }, + { + "epoch": 0.35, + "grad_norm": 0.9393773479790519, + "learning_rate": 7.519229416212947e-06, + "loss": 0.5832, + "step": 12135 + }, + { + "epoch": 0.35, + "grad_norm": 1.2936877995619744, + "learning_rate": 7.518823674650166e-06, + "loss": 0.3409, + "step": 12136 + }, + { + "epoch": 0.35, + "grad_norm": 1.2247315551812974, + "learning_rate": 7.518417910858737e-06, + "loss": 0.3383, + "step": 12137 + }, + { + "epoch": 0.35, + "grad_norm": 1.9422363335562576, + "learning_rate": 7.518012124842242e-06, + "loss": 0.4062, + "step": 12138 + }, + { + "epoch": 0.35, + "grad_norm": 1.2933334577108304, + "learning_rate": 7.517606316604264e-06, + "loss": 0.3247, + "step": 12139 + }, + { + "epoch": 0.35, + "grad_norm": 1.4092055872439437, + "learning_rate": 7.517200486148381e-06, + "loss": 0.3362, + "step": 12140 + }, + { + "epoch": 0.35, + "grad_norm": 1.2590049928293383, + "learning_rate": 7.516794633478175e-06, + "loss": 0.3471, + "step": 12141 + }, + { + "epoch": 0.35, + "grad_norm": 1.4016112609481821, + "learning_rate": 7.516388758597228e-06, + "loss": 0.3499, + "step": 12142 + }, + { + "epoch": 0.35, + "grad_norm": 1.5490452599773572, + "learning_rate": 7.5159828615091245e-06, + "loss": 0.3581, + "step": 12143 + }, + { + "epoch": 0.35, + "grad_norm": 1.4164181956635216, + "learning_rate": 7.5155769422174445e-06, + "loss": 0.3308, + "step": 12144 + }, + { + "epoch": 0.35, + "grad_norm": 1.3919768386483837, + "learning_rate": 7.515171000725767e-06, + "loss": 0.3351, + "step": 12145 + }, + { + "epoch": 0.35, + "grad_norm": 2.1925103450063, + "learning_rate": 7.51476503703768e-06, + "loss": 0.3733, + "step": 12146 + }, + { + "epoch": 0.35, + "grad_norm": 1.4251767465284808, + "learning_rate": 7.514359051156763e-06, + "loss": 0.3398, + "step": 12147 + }, + { + "epoch": 0.35, + "grad_norm": 1.4143386736611299, + "learning_rate": 7.513953043086599e-06, + "loss": 0.3526, + "step": 12148 + }, + { + "epoch": 0.35, + "grad_norm": 1.3271011738424674, + "learning_rate": 7.513547012830773e-06, + "loss": 0.3629, + "step": 12149 + }, + { + "epoch": 0.35, + "grad_norm": 2.0078546293041817, + "learning_rate": 7.513140960392864e-06, + "loss": 0.3633, + "step": 12150 + }, + { + "epoch": 0.35, + "grad_norm": 1.2172529895257096, + "learning_rate": 7.51273488577646e-06, + "loss": 0.3407, + "step": 12151 + }, + { + "epoch": 0.35, + "grad_norm": 1.3487693327460466, + "learning_rate": 7.512328788985142e-06, + "loss": 0.3615, + "step": 12152 + }, + { + "epoch": 0.35, + "grad_norm": 2.2281994014170996, + "learning_rate": 7.511922670022496e-06, + "loss": 0.3187, + "step": 12153 + }, + { + "epoch": 0.35, + "grad_norm": 1.3542276774797217, + "learning_rate": 7.511516528892104e-06, + "loss": 0.3518, + "step": 12154 + }, + { + "epoch": 0.35, + "grad_norm": 1.6232924513044216, + "learning_rate": 7.511110365597549e-06, + "loss": 0.3613, + "step": 12155 + }, + { + "epoch": 0.35, + "grad_norm": 1.2899087662987632, + "learning_rate": 7.510704180142419e-06, + "loss": 0.3483, + "step": 12156 + }, + { + "epoch": 0.35, + "grad_norm": 1.2967024913877938, + "learning_rate": 7.5102979725302974e-06, + "loss": 0.3257, + "step": 12157 + }, + { + "epoch": 0.35, + "grad_norm": 1.5296226241034914, + "learning_rate": 7.509891742764768e-06, + "loss": 0.3429, + "step": 12158 + }, + { + "epoch": 0.35, + "grad_norm": 1.2514704575370204, + "learning_rate": 7.509485490849415e-06, + "loss": 0.3212, + "step": 12159 + }, + { + "epoch": 0.35, + "grad_norm": 1.4032718089877618, + "learning_rate": 7.509079216787824e-06, + "loss": 0.3309, + "step": 12160 + }, + { + "epoch": 0.35, + "grad_norm": 1.3092941883714082, + "learning_rate": 7.508672920583583e-06, + "loss": 0.3385, + "step": 12161 + }, + { + "epoch": 0.35, + "grad_norm": 4.939625461665473, + "learning_rate": 7.508266602240275e-06, + "loss": 0.3291, + "step": 12162 + }, + { + "epoch": 0.35, + "grad_norm": 1.4224281822809839, + "learning_rate": 7.507860261761486e-06, + "loss": 0.3264, + "step": 12163 + }, + { + "epoch": 0.35, + "grad_norm": 1.3766547131015048, + "learning_rate": 7.507453899150803e-06, + "loss": 0.3221, + "step": 12164 + }, + { + "epoch": 0.35, + "grad_norm": 1.4318423639538336, + "learning_rate": 7.507047514411812e-06, + "loss": 0.3032, + "step": 12165 + }, + { + "epoch": 0.35, + "grad_norm": 1.2249846926311159, + "learning_rate": 7.506641107548096e-06, + "loss": 0.3364, + "step": 12166 + }, + { + "epoch": 0.35, + "grad_norm": 1.5428716060608445, + "learning_rate": 7.506234678563248e-06, + "loss": 0.3597, + "step": 12167 + }, + { + "epoch": 0.35, + "grad_norm": 1.4718239663730786, + "learning_rate": 7.50582822746085e-06, + "loss": 0.3274, + "step": 12168 + }, + { + "epoch": 0.35, + "grad_norm": 1.2716980354638456, + "learning_rate": 7.5054217542444885e-06, + "loss": 0.3175, + "step": 12169 + }, + { + "epoch": 0.35, + "grad_norm": 1.3551056560846666, + "learning_rate": 7.505015258917754e-06, + "loss": 0.3709, + "step": 12170 + }, + { + "epoch": 0.35, + "grad_norm": 1.3023013533827896, + "learning_rate": 7.504608741484231e-06, + "loss": 0.3463, + "step": 12171 + }, + { + "epoch": 0.35, + "grad_norm": 0.9733428752972346, + "learning_rate": 7.5042022019475105e-06, + "loss": 0.615, + "step": 12172 + }, + { + "epoch": 0.35, + "grad_norm": 1.4873076377721544, + "learning_rate": 7.503795640311176e-06, + "loss": 0.3627, + "step": 12173 + }, + { + "epoch": 0.35, + "grad_norm": 1.2740686197820572, + "learning_rate": 7.503389056578816e-06, + "loss": 0.3464, + "step": 12174 + }, + { + "epoch": 0.35, + "grad_norm": 1.2450527081771094, + "learning_rate": 7.502982450754021e-06, + "loss": 0.3492, + "step": 12175 + }, + { + "epoch": 0.35, + "grad_norm": 1.2735039880944643, + "learning_rate": 7.5025758228403775e-06, + "loss": 0.3064, + "step": 12176 + }, + { + "epoch": 0.35, + "grad_norm": 1.2987927604578422, + "learning_rate": 7.502169172841476e-06, + "loss": 0.3362, + "step": 12177 + }, + { + "epoch": 0.35, + "grad_norm": 1.3534361652902938, + "learning_rate": 7.501762500760902e-06, + "loss": 0.3308, + "step": 12178 + }, + { + "epoch": 0.35, + "grad_norm": 1.334968304141418, + "learning_rate": 7.501355806602246e-06, + "loss": 0.3336, + "step": 12179 + }, + { + "epoch": 0.35, + "grad_norm": 1.2961584769578136, + "learning_rate": 7.500949090369099e-06, + "loss": 0.3258, + "step": 12180 + }, + { + "epoch": 0.35, + "grad_norm": 0.95746520324072, + "learning_rate": 7.500542352065048e-06, + "loss": 0.5933, + "step": 12181 + }, + { + "epoch": 0.35, + "grad_norm": 1.269733928742905, + "learning_rate": 7.500135591693682e-06, + "loss": 0.3316, + "step": 12182 + }, + { + "epoch": 0.35, + "grad_norm": 1.2806986246116787, + "learning_rate": 7.4997288092585926e-06, + "loss": 0.344, + "step": 12183 + }, + { + "epoch": 0.35, + "grad_norm": 1.3016184800541128, + "learning_rate": 7.499322004763368e-06, + "loss": 0.3255, + "step": 12184 + }, + { + "epoch": 0.35, + "grad_norm": 1.4609958274384185, + "learning_rate": 7.498915178211601e-06, + "loss": 0.3573, + "step": 12185 + }, + { + "epoch": 0.35, + "grad_norm": 1.2320390808348358, + "learning_rate": 7.498508329606879e-06, + "loss": 0.3422, + "step": 12186 + }, + { + "epoch": 0.35, + "grad_norm": 2.3170790608687475, + "learning_rate": 7.498101458952792e-06, + "loss": 0.3459, + "step": 12187 + }, + { + "epoch": 0.35, + "grad_norm": 1.1571822503055054, + "learning_rate": 7.4976945662529336e-06, + "loss": 0.3651, + "step": 12188 + }, + { + "epoch": 0.35, + "grad_norm": 1.3766266916066217, + "learning_rate": 7.497287651510892e-06, + "loss": 0.3353, + "step": 12189 + }, + { + "epoch": 0.35, + "grad_norm": 1.343828346224102, + "learning_rate": 7.496880714730259e-06, + "loss": 0.3343, + "step": 12190 + }, + { + "epoch": 0.35, + "grad_norm": 1.2792342800323773, + "learning_rate": 7.496473755914626e-06, + "loss": 0.3236, + "step": 12191 + }, + { + "epoch": 0.35, + "grad_norm": 1.218210631782332, + "learning_rate": 7.496066775067584e-06, + "loss": 0.3209, + "step": 12192 + }, + { + "epoch": 0.35, + "grad_norm": 1.4302239927281193, + "learning_rate": 7.495659772192725e-06, + "loss": 0.3242, + "step": 12193 + }, + { + "epoch": 0.35, + "grad_norm": 1.4470087167352559, + "learning_rate": 7.4952527472936425e-06, + "loss": 0.3535, + "step": 12194 + }, + { + "epoch": 0.35, + "grad_norm": 1.3282772928699795, + "learning_rate": 7.494845700373925e-06, + "loss": 0.3154, + "step": 12195 + }, + { + "epoch": 0.35, + "grad_norm": 1.5399193975496186, + "learning_rate": 7.494438631437168e-06, + "loss": 0.336, + "step": 12196 + }, + { + "epoch": 0.35, + "grad_norm": 1.5925756715718422, + "learning_rate": 7.494031540486961e-06, + "loss": 0.3467, + "step": 12197 + }, + { + "epoch": 0.35, + "grad_norm": 1.8282479603109474, + "learning_rate": 7.493624427526899e-06, + "loss": 0.3496, + "step": 12198 + }, + { + "epoch": 0.35, + "grad_norm": 1.3061446380020816, + "learning_rate": 7.493217292560574e-06, + "loss": 0.3661, + "step": 12199 + }, + { + "epoch": 0.35, + "grad_norm": 1.2506302376905303, + "learning_rate": 7.4928101355915785e-06, + "loss": 0.3264, + "step": 12200 + }, + { + "epoch": 0.35, + "grad_norm": 1.4415317144388553, + "learning_rate": 7.492402956623505e-06, + "loss": 0.3581, + "step": 12201 + }, + { + "epoch": 0.35, + "grad_norm": 1.304274515166739, + "learning_rate": 7.4919957556599475e-06, + "loss": 0.3638, + "step": 12202 + }, + { + "epoch": 0.35, + "grad_norm": 1.2321055042045974, + "learning_rate": 7.491588532704501e-06, + "loss": 0.322, + "step": 12203 + }, + { + "epoch": 0.35, + "grad_norm": 1.6000596927170339, + "learning_rate": 7.491181287760758e-06, + "loss": 0.3566, + "step": 12204 + }, + { + "epoch": 0.35, + "grad_norm": 1.3852664699636446, + "learning_rate": 7.490774020832312e-06, + "loss": 0.344, + "step": 12205 + }, + { + "epoch": 0.35, + "grad_norm": 1.2688910205280814, + "learning_rate": 7.490366731922758e-06, + "loss": 0.3292, + "step": 12206 + }, + { + "epoch": 0.35, + "grad_norm": 1.3364410532494444, + "learning_rate": 7.489959421035688e-06, + "loss": 0.3558, + "step": 12207 + }, + { + "epoch": 0.35, + "grad_norm": 1.1844699012418678, + "learning_rate": 7.489552088174702e-06, + "loss": 0.3403, + "step": 12208 + }, + { + "epoch": 0.35, + "grad_norm": 1.3671382398167078, + "learning_rate": 7.489144733343388e-06, + "loss": 0.3333, + "step": 12209 + }, + { + "epoch": 0.35, + "grad_norm": 1.5709504312950326, + "learning_rate": 7.488737356545346e-06, + "loss": 0.3474, + "step": 12210 + }, + { + "epoch": 0.35, + "grad_norm": 1.151759467471285, + "learning_rate": 7.488329957784169e-06, + "loss": 0.3023, + "step": 12211 + }, + { + "epoch": 0.35, + "grad_norm": 1.2969405431690257, + "learning_rate": 7.487922537063452e-06, + "loss": 0.3461, + "step": 12212 + }, + { + "epoch": 0.35, + "grad_norm": 1.2075373717072273, + "learning_rate": 7.487515094386792e-06, + "loss": 0.3486, + "step": 12213 + }, + { + "epoch": 0.35, + "grad_norm": 1.1922851216530388, + "learning_rate": 7.487107629757784e-06, + "loss": 0.3385, + "step": 12214 + }, + { + "epoch": 0.35, + "grad_norm": 1.264190059215102, + "learning_rate": 7.486700143180022e-06, + "loss": 0.3353, + "step": 12215 + }, + { + "epoch": 0.35, + "grad_norm": 1.426263878080342, + "learning_rate": 7.486292634657104e-06, + "loss": 0.3491, + "step": 12216 + }, + { + "epoch": 0.35, + "grad_norm": 1.3009218410094072, + "learning_rate": 7.485885104192627e-06, + "loss": 0.3698, + "step": 12217 + }, + { + "epoch": 0.35, + "grad_norm": 1.3337504822375086, + "learning_rate": 7.485477551790185e-06, + "loss": 0.3443, + "step": 12218 + }, + { + "epoch": 0.35, + "grad_norm": 1.3099597062686068, + "learning_rate": 7.4850699774533765e-06, + "loss": 0.3447, + "step": 12219 + }, + { + "epoch": 0.35, + "grad_norm": 1.2497931155748208, + "learning_rate": 7.484662381185798e-06, + "loss": 0.3329, + "step": 12220 + }, + { + "epoch": 0.35, + "grad_norm": 1.4078092025664968, + "learning_rate": 7.484254762991046e-06, + "loss": 0.33, + "step": 12221 + }, + { + "epoch": 0.35, + "grad_norm": 1.2002045709690932, + "learning_rate": 7.483847122872718e-06, + "loss": 0.3321, + "step": 12222 + }, + { + "epoch": 0.35, + "grad_norm": 1.353196442112745, + "learning_rate": 7.483439460834413e-06, + "loss": 0.3351, + "step": 12223 + }, + { + "epoch": 0.35, + "grad_norm": 3.2730598937800077, + "learning_rate": 7.483031776879725e-06, + "loss": 0.3593, + "step": 12224 + }, + { + "epoch": 0.35, + "grad_norm": 2.0471487000012707, + "learning_rate": 7.482624071012256e-06, + "loss": 0.3642, + "step": 12225 + }, + { + "epoch": 0.35, + "grad_norm": 1.1760920798905423, + "learning_rate": 7.482216343235602e-06, + "loss": 0.3298, + "step": 12226 + }, + { + "epoch": 0.35, + "grad_norm": 1.2766259400442255, + "learning_rate": 7.481808593553361e-06, + "loss": 0.339, + "step": 12227 + }, + { + "epoch": 0.35, + "grad_norm": 1.493343538457288, + "learning_rate": 7.4814008219691314e-06, + "loss": 0.3652, + "step": 12228 + }, + { + "epoch": 0.35, + "grad_norm": 1.6077749528373921, + "learning_rate": 7.480993028486512e-06, + "loss": 0.3178, + "step": 12229 + }, + { + "epoch": 0.35, + "grad_norm": 1.2564006405108719, + "learning_rate": 7.480585213109101e-06, + "loss": 0.3468, + "step": 12230 + }, + { + "epoch": 0.35, + "grad_norm": 1.5056965196376095, + "learning_rate": 7.480177375840499e-06, + "loss": 0.3362, + "step": 12231 + }, + { + "epoch": 0.35, + "grad_norm": 1.0483613311504258, + "learning_rate": 7.479769516684303e-06, + "loss": 0.643, + "step": 12232 + }, + { + "epoch": 0.35, + "grad_norm": 1.0606745188100684, + "learning_rate": 7.479361635644113e-06, + "loss": 0.6313, + "step": 12233 + }, + { + "epoch": 0.35, + "grad_norm": 1.2798342540737564, + "learning_rate": 7.478953732723531e-06, + "loss": 0.322, + "step": 12234 + }, + { + "epoch": 0.35, + "grad_norm": 1.8098905693119807, + "learning_rate": 7.478545807926154e-06, + "loss": 0.3544, + "step": 12235 + }, + { + "epoch": 0.35, + "grad_norm": 1.3204722909043565, + "learning_rate": 7.478137861255583e-06, + "loss": 0.3442, + "step": 12236 + }, + { + "epoch": 0.35, + "grad_norm": 1.2813904260867133, + "learning_rate": 7.477729892715418e-06, + "loss": 0.3441, + "step": 12237 + }, + { + "epoch": 0.35, + "grad_norm": 1.2790983290553812, + "learning_rate": 7.477321902309258e-06, + "loss": 0.336, + "step": 12238 + }, + { + "epoch": 0.35, + "grad_norm": 1.8356972051937557, + "learning_rate": 7.476913890040705e-06, + "loss": 0.3561, + "step": 12239 + }, + { + "epoch": 0.36, + "grad_norm": 1.1588456861403424, + "learning_rate": 7.476505855913361e-06, + "loss": 0.3164, + "step": 12240 + }, + { + "epoch": 0.36, + "grad_norm": 1.19721849271639, + "learning_rate": 7.4760977999308245e-06, + "loss": 0.3186, + "step": 12241 + }, + { + "epoch": 0.36, + "grad_norm": 1.392684766293053, + "learning_rate": 7.475689722096697e-06, + "loss": 0.3413, + "step": 12242 + }, + { + "epoch": 0.36, + "grad_norm": 1.2288568802298416, + "learning_rate": 7.475281622414581e-06, + "loss": 0.3283, + "step": 12243 + }, + { + "epoch": 0.36, + "grad_norm": 1.4756302187797057, + "learning_rate": 7.474873500888076e-06, + "loss": 0.3298, + "step": 12244 + }, + { + "epoch": 0.36, + "grad_norm": 1.0450044356275996, + "learning_rate": 7.474465357520786e-06, + "loss": 0.6098, + "step": 12245 + }, + { + "epoch": 0.36, + "grad_norm": 1.245648674497058, + "learning_rate": 7.474057192316311e-06, + "loss": 0.3521, + "step": 12246 + }, + { + "epoch": 0.36, + "grad_norm": 1.6828582386814523, + "learning_rate": 7.473649005278254e-06, + "loss": 0.3613, + "step": 12247 + }, + { + "epoch": 0.36, + "grad_norm": 1.3463735172785358, + "learning_rate": 7.473240796410217e-06, + "loss": 0.3285, + "step": 12248 + }, + { + "epoch": 0.36, + "grad_norm": 1.256429363480894, + "learning_rate": 7.472832565715804e-06, + "loss": 0.3271, + "step": 12249 + }, + { + "epoch": 0.36, + "grad_norm": 1.371924966812685, + "learning_rate": 7.472424313198614e-06, + "loss": 0.3362, + "step": 12250 + }, + { + "epoch": 0.36, + "grad_norm": 1.6850460197388244, + "learning_rate": 7.472016038862253e-06, + "loss": 0.328, + "step": 12251 + }, + { + "epoch": 0.36, + "grad_norm": 1.2567002381404229, + "learning_rate": 7.471607742710323e-06, + "loss": 0.3191, + "step": 12252 + }, + { + "epoch": 0.36, + "grad_norm": 1.2826137395115094, + "learning_rate": 7.4711994247464285e-06, + "loss": 0.3386, + "step": 12253 + }, + { + "epoch": 0.36, + "grad_norm": 1.253898645825218, + "learning_rate": 7.470791084974169e-06, + "loss": 0.3505, + "step": 12254 + }, + { + "epoch": 0.36, + "grad_norm": 1.57417444659815, + "learning_rate": 7.470382723397154e-06, + "loss": 0.3593, + "step": 12255 + }, + { + "epoch": 0.36, + "grad_norm": 1.280547280086187, + "learning_rate": 7.469974340018981e-06, + "loss": 0.3363, + "step": 12256 + }, + { + "epoch": 0.36, + "grad_norm": 1.3922768151639795, + "learning_rate": 7.469565934843259e-06, + "loss": 0.3431, + "step": 12257 + }, + { + "epoch": 0.36, + "grad_norm": 1.328753956981408, + "learning_rate": 7.46915750787359e-06, + "loss": 0.3433, + "step": 12258 + }, + { + "epoch": 0.36, + "grad_norm": 1.2561328126630902, + "learning_rate": 7.468749059113578e-06, + "loss": 0.3343, + "step": 12259 + }, + { + "epoch": 0.36, + "grad_norm": 1.7355661084288387, + "learning_rate": 7.468340588566828e-06, + "loss": 0.3222, + "step": 12260 + }, + { + "epoch": 0.36, + "grad_norm": 1.2059084731066183, + "learning_rate": 7.467932096236945e-06, + "loss": 0.3356, + "step": 12261 + }, + { + "epoch": 0.36, + "grad_norm": 1.384699095973938, + "learning_rate": 7.467523582127534e-06, + "loss": 0.3259, + "step": 12262 + }, + { + "epoch": 0.36, + "grad_norm": 1.665876131088457, + "learning_rate": 7.467115046242201e-06, + "loss": 0.3828, + "step": 12263 + }, + { + "epoch": 0.36, + "grad_norm": 1.2151702250103473, + "learning_rate": 7.46670648858455e-06, + "loss": 0.3438, + "step": 12264 + }, + { + "epoch": 0.36, + "grad_norm": 1.2887384619320428, + "learning_rate": 7.466297909158186e-06, + "loss": 0.3384, + "step": 12265 + }, + { + "epoch": 0.36, + "grad_norm": 1.538166315462214, + "learning_rate": 7.465889307966715e-06, + "loss": 0.3337, + "step": 12266 + }, + { + "epoch": 0.36, + "grad_norm": 1.304907752247133, + "learning_rate": 7.465480685013745e-06, + "loss": 0.3337, + "step": 12267 + }, + { + "epoch": 0.36, + "grad_norm": 1.2817773833093953, + "learning_rate": 7.46507204030288e-06, + "loss": 0.3243, + "step": 12268 + }, + { + "epoch": 0.36, + "grad_norm": 1.3775303186151429, + "learning_rate": 7.464663373837728e-06, + "loss": 0.3214, + "step": 12269 + }, + { + "epoch": 0.36, + "grad_norm": 1.2125031855739483, + "learning_rate": 7.464254685621892e-06, + "loss": 0.3308, + "step": 12270 + }, + { + "epoch": 0.36, + "grad_norm": 1.3771945989398855, + "learning_rate": 7.463845975658983e-06, + "loss": 0.3216, + "step": 12271 + }, + { + "epoch": 0.36, + "grad_norm": 1.1799754538876015, + "learning_rate": 7.4634372439526055e-06, + "loss": 0.3301, + "step": 12272 + }, + { + "epoch": 0.36, + "grad_norm": 1.5038479122226238, + "learning_rate": 7.4630284905063675e-06, + "loss": 0.3402, + "step": 12273 + }, + { + "epoch": 0.36, + "grad_norm": 1.4148731250645958, + "learning_rate": 7.462619715323874e-06, + "loss": 0.3466, + "step": 12274 + }, + { + "epoch": 0.36, + "grad_norm": 1.2433163885142249, + "learning_rate": 7.462210918408735e-06, + "loss": 0.3612, + "step": 12275 + }, + { + "epoch": 0.36, + "grad_norm": 1.4317409283960247, + "learning_rate": 7.4618020997645594e-06, + "loss": 0.3574, + "step": 12276 + }, + { + "epoch": 0.36, + "grad_norm": 1.3127655353640055, + "learning_rate": 7.46139325939495e-06, + "loss": 0.3316, + "step": 12277 + }, + { + "epoch": 0.36, + "grad_norm": 1.2253111370823548, + "learning_rate": 7.460984397303519e-06, + "loss": 0.3308, + "step": 12278 + }, + { + "epoch": 0.36, + "grad_norm": 1.2133688952945214, + "learning_rate": 7.4605755134938725e-06, + "loss": 0.3355, + "step": 12279 + }, + { + "epoch": 0.36, + "grad_norm": 1.6242088711599052, + "learning_rate": 7.460166607969621e-06, + "loss": 0.3517, + "step": 12280 + }, + { + "epoch": 0.36, + "grad_norm": 1.272234639431391, + "learning_rate": 7.4597576807343716e-06, + "loss": 0.3427, + "step": 12281 + }, + { + "epoch": 0.36, + "grad_norm": 1.2224149774791122, + "learning_rate": 7.459348731791733e-06, + "loss": 0.315, + "step": 12282 + }, + { + "epoch": 0.36, + "grad_norm": 3.806256490561036, + "learning_rate": 7.458939761145315e-06, + "loss": 0.3296, + "step": 12283 + }, + { + "epoch": 0.36, + "grad_norm": 1.2876278681949724, + "learning_rate": 7.458530768798727e-06, + "loss": 0.3322, + "step": 12284 + }, + { + "epoch": 0.36, + "grad_norm": 1.2983398203580667, + "learning_rate": 7.458121754755578e-06, + "loss": 0.3572, + "step": 12285 + }, + { + "epoch": 0.36, + "grad_norm": 1.363853093331109, + "learning_rate": 7.4577127190194765e-06, + "loss": 0.3306, + "step": 12286 + }, + { + "epoch": 0.36, + "grad_norm": 1.649940020072637, + "learning_rate": 7.457303661594032e-06, + "loss": 0.3437, + "step": 12287 + }, + { + "epoch": 0.36, + "grad_norm": 0.9141551594857243, + "learning_rate": 7.456894582482855e-06, + "loss": 0.6095, + "step": 12288 + }, + { + "epoch": 0.36, + "grad_norm": 1.3097625626517935, + "learning_rate": 7.4564854816895575e-06, + "loss": 0.3507, + "step": 12289 + }, + { + "epoch": 0.36, + "grad_norm": 1.3143931628643986, + "learning_rate": 7.456076359217748e-06, + "loss": 0.3166, + "step": 12290 + }, + { + "epoch": 0.36, + "grad_norm": 1.6553432193176665, + "learning_rate": 7.455667215071037e-06, + "loss": 0.3276, + "step": 12291 + }, + { + "epoch": 0.36, + "grad_norm": 1.2893561640250917, + "learning_rate": 7.455258049253037e-06, + "loss": 0.3315, + "step": 12292 + }, + { + "epoch": 0.36, + "grad_norm": 1.230978436005465, + "learning_rate": 7.454848861767355e-06, + "loss": 0.3341, + "step": 12293 + }, + { + "epoch": 0.36, + "grad_norm": 1.3577118512480641, + "learning_rate": 7.454439652617608e-06, + "loss": 0.3307, + "step": 12294 + }, + { + "epoch": 0.36, + "grad_norm": 1.2450727586351324, + "learning_rate": 7.454030421807401e-06, + "loss": 0.3551, + "step": 12295 + }, + { + "epoch": 0.36, + "grad_norm": 0.9080438583081579, + "learning_rate": 7.453621169340349e-06, + "loss": 0.5775, + "step": 12296 + }, + { + "epoch": 0.36, + "grad_norm": 1.4861941264415361, + "learning_rate": 7.453211895220063e-06, + "loss": 0.3698, + "step": 12297 + }, + { + "epoch": 0.36, + "grad_norm": 1.2451450127811179, + "learning_rate": 7.4528025994501555e-06, + "loss": 0.3188, + "step": 12298 + }, + { + "epoch": 0.36, + "grad_norm": 1.2467260220119252, + "learning_rate": 7.452393282034237e-06, + "loss": 0.347, + "step": 12299 + }, + { + "epoch": 0.36, + "grad_norm": 1.2666721702507853, + "learning_rate": 7.451983942975921e-06, + "loss": 0.3363, + "step": 12300 + }, + { + "epoch": 0.36, + "grad_norm": 1.2193778803858666, + "learning_rate": 7.45157458227882e-06, + "loss": 0.3576, + "step": 12301 + }, + { + "epoch": 0.36, + "grad_norm": 1.2945932384038639, + "learning_rate": 7.451165199946545e-06, + "loss": 0.3485, + "step": 12302 + }, + { + "epoch": 0.36, + "grad_norm": 1.3678058378791738, + "learning_rate": 7.4507557959827096e-06, + "loss": 0.3169, + "step": 12303 + }, + { + "epoch": 0.36, + "grad_norm": 1.3358496135199172, + "learning_rate": 7.4503463703909285e-06, + "loss": 0.3073, + "step": 12304 + }, + { + "epoch": 0.36, + "grad_norm": 1.3312565618636218, + "learning_rate": 7.449936923174813e-06, + "loss": 0.3159, + "step": 12305 + }, + { + "epoch": 0.36, + "grad_norm": 1.2605741708837959, + "learning_rate": 7.449527454337977e-06, + "loss": 0.3237, + "step": 12306 + }, + { + "epoch": 0.36, + "grad_norm": 1.2231941149320635, + "learning_rate": 7.4491179638840335e-06, + "loss": 0.3307, + "step": 12307 + }, + { + "epoch": 0.36, + "grad_norm": 1.722323455570957, + "learning_rate": 7.448708451816597e-06, + "loss": 0.3371, + "step": 12308 + }, + { + "epoch": 0.36, + "grad_norm": 1.536763126389579, + "learning_rate": 7.448298918139282e-06, + "loss": 0.361, + "step": 12309 + }, + { + "epoch": 0.36, + "grad_norm": 1.18141380309692, + "learning_rate": 7.447889362855701e-06, + "loss": 0.3254, + "step": 12310 + }, + { + "epoch": 0.36, + "grad_norm": 1.3779866523120046, + "learning_rate": 7.447479785969469e-06, + "loss": 0.3695, + "step": 12311 + }, + { + "epoch": 0.36, + "grad_norm": 1.280629025335455, + "learning_rate": 7.447070187484202e-06, + "loss": 0.3386, + "step": 12312 + }, + { + "epoch": 0.36, + "grad_norm": 2.1106920682548624, + "learning_rate": 7.446660567403514e-06, + "loss": 0.3254, + "step": 12313 + }, + { + "epoch": 0.36, + "grad_norm": 1.4071565745619203, + "learning_rate": 7.446250925731018e-06, + "loss": 0.3397, + "step": 12314 + }, + { + "epoch": 0.36, + "grad_norm": 1.3634980557434342, + "learning_rate": 7.44584126247033e-06, + "loss": 0.3389, + "step": 12315 + }, + { + "epoch": 0.36, + "grad_norm": 1.6823680013575015, + "learning_rate": 7.445431577625067e-06, + "loss": 0.3256, + "step": 12316 + }, + { + "epoch": 0.36, + "grad_norm": 1.2880898911511527, + "learning_rate": 7.445021871198841e-06, + "loss": 0.3655, + "step": 12317 + }, + { + "epoch": 0.36, + "grad_norm": 1.2219259168823067, + "learning_rate": 7.444612143195272e-06, + "loss": 0.3273, + "step": 12318 + }, + { + "epoch": 0.36, + "grad_norm": 1.235407655925638, + "learning_rate": 7.444202393617973e-06, + "loss": 0.3406, + "step": 12319 + }, + { + "epoch": 0.36, + "grad_norm": 2.001222346171674, + "learning_rate": 7.44379262247056e-06, + "loss": 0.3718, + "step": 12320 + }, + { + "epoch": 0.36, + "grad_norm": 1.169906191005028, + "learning_rate": 7.443382829756651e-06, + "loss": 0.3165, + "step": 12321 + }, + { + "epoch": 0.36, + "grad_norm": 1.8166577232676324, + "learning_rate": 7.442973015479862e-06, + "loss": 0.3313, + "step": 12322 + }, + { + "epoch": 0.36, + "grad_norm": 1.2685537416317514, + "learning_rate": 7.442563179643808e-06, + "loss": 0.336, + "step": 12323 + }, + { + "epoch": 0.36, + "grad_norm": 1.9703268120728306, + "learning_rate": 7.442153322252107e-06, + "loss": 0.3325, + "step": 12324 + }, + { + "epoch": 0.36, + "grad_norm": 1.354831171794516, + "learning_rate": 7.4417434433083755e-06, + "loss": 0.3332, + "step": 12325 + }, + { + "epoch": 0.36, + "grad_norm": 1.339659013722976, + "learning_rate": 7.4413335428162324e-06, + "loss": 0.3305, + "step": 12326 + }, + { + "epoch": 0.36, + "grad_norm": 2.470631405154939, + "learning_rate": 7.440923620779293e-06, + "loss": 0.3367, + "step": 12327 + }, + { + "epoch": 0.36, + "grad_norm": 1.485594441730379, + "learning_rate": 7.440513677201175e-06, + "loss": 0.3699, + "step": 12328 + }, + { + "epoch": 0.36, + "grad_norm": 2.366100569481919, + "learning_rate": 7.440103712085497e-06, + "loss": 0.347, + "step": 12329 + }, + { + "epoch": 0.36, + "grad_norm": 1.4755398686664598, + "learning_rate": 7.439693725435877e-06, + "loss": 0.3566, + "step": 12330 + }, + { + "epoch": 0.36, + "grad_norm": 1.4091765140585597, + "learning_rate": 7.439283717255933e-06, + "loss": 0.3293, + "step": 12331 + }, + { + "epoch": 0.36, + "grad_norm": 1.2888947981753671, + "learning_rate": 7.438873687549283e-06, + "loss": 0.3441, + "step": 12332 + }, + { + "epoch": 0.36, + "grad_norm": 1.4865983478751974, + "learning_rate": 7.438463636319544e-06, + "loss": 0.3433, + "step": 12333 + }, + { + "epoch": 0.36, + "grad_norm": 1.3286876529455638, + "learning_rate": 7.438053563570338e-06, + "loss": 0.3108, + "step": 12334 + }, + { + "epoch": 0.36, + "grad_norm": 3.1214946935269676, + "learning_rate": 7.437643469305282e-06, + "loss": 0.3547, + "step": 12335 + }, + { + "epoch": 0.36, + "grad_norm": 1.2171121137154233, + "learning_rate": 7.437233353527995e-06, + "loss": 0.3426, + "step": 12336 + }, + { + "epoch": 0.36, + "grad_norm": 1.2780791376931975, + "learning_rate": 7.436823216242097e-06, + "loss": 0.3553, + "step": 12337 + }, + { + "epoch": 0.36, + "grad_norm": 1.33730001534234, + "learning_rate": 7.4364130574512076e-06, + "loss": 0.3507, + "step": 12338 + }, + { + "epoch": 0.36, + "grad_norm": 1.5267682746409748, + "learning_rate": 7.436002877158945e-06, + "loss": 0.3773, + "step": 12339 + }, + { + "epoch": 0.36, + "grad_norm": 1.1785435658565622, + "learning_rate": 7.43559267536893e-06, + "loss": 0.3564, + "step": 12340 + }, + { + "epoch": 0.36, + "grad_norm": 1.4058868228376424, + "learning_rate": 7.435182452084784e-06, + "loss": 0.3609, + "step": 12341 + }, + { + "epoch": 0.36, + "grad_norm": 1.2363241193765573, + "learning_rate": 7.434772207310125e-06, + "loss": 0.3285, + "step": 12342 + }, + { + "epoch": 0.36, + "grad_norm": 1.2377151873190713, + "learning_rate": 7.434361941048573e-06, + "loss": 0.3218, + "step": 12343 + }, + { + "epoch": 0.36, + "grad_norm": 22.60924560239553, + "learning_rate": 7.433951653303751e-06, + "loss": 0.3644, + "step": 12344 + }, + { + "epoch": 0.36, + "grad_norm": 1.1959521098796608, + "learning_rate": 7.433541344079279e-06, + "loss": 0.3172, + "step": 12345 + }, + { + "epoch": 0.36, + "grad_norm": 1.4258038964385613, + "learning_rate": 7.433131013378777e-06, + "loss": 0.3366, + "step": 12346 + }, + { + "epoch": 0.36, + "grad_norm": 1.352576877555809, + "learning_rate": 7.4327206612058674e-06, + "loss": 0.4078, + "step": 12347 + }, + { + "epoch": 0.36, + "grad_norm": 1.4482994524762054, + "learning_rate": 7.43231028756417e-06, + "loss": 0.3268, + "step": 12348 + }, + { + "epoch": 0.36, + "grad_norm": 1.3453176157397353, + "learning_rate": 7.431899892457308e-06, + "loss": 0.344, + "step": 12349 + }, + { + "epoch": 0.36, + "grad_norm": 1.616171244273975, + "learning_rate": 7.4314894758889034e-06, + "loss": 0.3457, + "step": 12350 + }, + { + "epoch": 0.36, + "grad_norm": 1.699514376622135, + "learning_rate": 7.431079037862575e-06, + "loss": 0.3768, + "step": 12351 + }, + { + "epoch": 0.36, + "grad_norm": 1.2778272947260052, + "learning_rate": 7.430668578381949e-06, + "loss": 0.3437, + "step": 12352 + }, + { + "epoch": 0.36, + "grad_norm": 1.2474069725771, + "learning_rate": 7.430258097450644e-06, + "loss": 0.3169, + "step": 12353 + }, + { + "epoch": 0.36, + "grad_norm": 1.2043810148380987, + "learning_rate": 7.429847595072287e-06, + "loss": 0.34, + "step": 12354 + }, + { + "epoch": 0.36, + "grad_norm": 1.423709094623371, + "learning_rate": 7.429437071250497e-06, + "loss": 0.361, + "step": 12355 + }, + { + "epoch": 0.36, + "grad_norm": 1.497262853097298, + "learning_rate": 7.429026525988897e-06, + "loss": 0.3367, + "step": 12356 + }, + { + "epoch": 0.36, + "grad_norm": 1.5197722588704723, + "learning_rate": 7.428615959291111e-06, + "loss": 0.3278, + "step": 12357 + }, + { + "epoch": 0.36, + "grad_norm": 1.5104659034904997, + "learning_rate": 7.428205371160763e-06, + "loss": 0.3391, + "step": 12358 + }, + { + "epoch": 0.36, + "grad_norm": 1.6551970742185589, + "learning_rate": 7.427794761601476e-06, + "loss": 0.3298, + "step": 12359 + }, + { + "epoch": 0.36, + "grad_norm": 1.4067202825981888, + "learning_rate": 7.427384130616871e-06, + "loss": 0.346, + "step": 12360 + }, + { + "epoch": 0.36, + "grad_norm": 1.2989812050892926, + "learning_rate": 7.4269734782105766e-06, + "loss": 0.3256, + "step": 12361 + }, + { + "epoch": 0.36, + "grad_norm": 1.4198419645309073, + "learning_rate": 7.426562804386213e-06, + "loss": 0.3284, + "step": 12362 + }, + { + "epoch": 0.36, + "grad_norm": 1.292847253832906, + "learning_rate": 7.4261521091474066e-06, + "loss": 0.3318, + "step": 12363 + }, + { + "epoch": 0.36, + "grad_norm": 1.1888898366209133, + "learning_rate": 7.42574139249778e-06, + "loss": 0.3067, + "step": 12364 + }, + { + "epoch": 0.36, + "grad_norm": 1.1893367132151786, + "learning_rate": 7.425330654440959e-06, + "loss": 0.3551, + "step": 12365 + }, + { + "epoch": 0.36, + "grad_norm": 1.317163552638997, + "learning_rate": 7.424919894980568e-06, + "loss": 0.3358, + "step": 12366 + }, + { + "epoch": 0.36, + "grad_norm": 1.3712106887330262, + "learning_rate": 7.424509114120233e-06, + "loss": 0.3681, + "step": 12367 + }, + { + "epoch": 0.36, + "grad_norm": 1.317801766770376, + "learning_rate": 7.424098311863578e-06, + "loss": 0.3358, + "step": 12368 + }, + { + "epoch": 0.36, + "grad_norm": 1.2776849218143291, + "learning_rate": 7.4236874882142274e-06, + "loss": 0.3256, + "step": 12369 + }, + { + "epoch": 0.36, + "grad_norm": 1.4757469773725596, + "learning_rate": 7.423276643175808e-06, + "loss": 0.3432, + "step": 12370 + }, + { + "epoch": 0.36, + "grad_norm": 1.2536027620680246, + "learning_rate": 7.422865776751946e-06, + "loss": 0.3478, + "step": 12371 + }, + { + "epoch": 0.36, + "grad_norm": 1.3556035174272503, + "learning_rate": 7.422454888946267e-06, + "loss": 0.3197, + "step": 12372 + }, + { + "epoch": 0.36, + "grad_norm": 1.4756898592795533, + "learning_rate": 7.4220439797623955e-06, + "loss": 0.3608, + "step": 12373 + }, + { + "epoch": 0.36, + "grad_norm": 1.387096544000597, + "learning_rate": 7.42163304920396e-06, + "loss": 0.3301, + "step": 12374 + }, + { + "epoch": 0.36, + "grad_norm": 0.9485849872890924, + "learning_rate": 7.421222097274585e-06, + "loss": 0.5797, + "step": 12375 + }, + { + "epoch": 0.36, + "grad_norm": 1.3913194172058576, + "learning_rate": 7.420811123977898e-06, + "loss": 0.3297, + "step": 12376 + }, + { + "epoch": 0.36, + "grad_norm": 1.2166557943114154, + "learning_rate": 7.420400129317527e-06, + "loss": 0.3067, + "step": 12377 + }, + { + "epoch": 0.36, + "grad_norm": 1.3461719782661559, + "learning_rate": 7.419989113297098e-06, + "loss": 0.338, + "step": 12378 + }, + { + "epoch": 0.36, + "grad_norm": 1.6695200092174902, + "learning_rate": 7.419578075920237e-06, + "loss": 0.3425, + "step": 12379 + }, + { + "epoch": 0.36, + "grad_norm": 1.30737080740963, + "learning_rate": 7.419167017190574e-06, + "loss": 0.3631, + "step": 12380 + }, + { + "epoch": 0.36, + "grad_norm": 1.2601123884585541, + "learning_rate": 7.418755937111736e-06, + "loss": 0.3069, + "step": 12381 + }, + { + "epoch": 0.36, + "grad_norm": 1.242739544524621, + "learning_rate": 7.4183448356873485e-06, + "loss": 0.3279, + "step": 12382 + }, + { + "epoch": 0.36, + "grad_norm": 1.2692921152280379, + "learning_rate": 7.417933712921042e-06, + "loss": 0.3242, + "step": 12383 + }, + { + "epoch": 0.36, + "grad_norm": 1.4289292525368937, + "learning_rate": 7.417522568816442e-06, + "loss": 0.3376, + "step": 12384 + }, + { + "epoch": 0.36, + "grad_norm": 1.4280734188778144, + "learning_rate": 7.41711140337718e-06, + "loss": 0.364, + "step": 12385 + }, + { + "epoch": 0.36, + "grad_norm": 1.653504689962903, + "learning_rate": 7.416700216606883e-06, + "loss": 0.365, + "step": 12386 + }, + { + "epoch": 0.36, + "grad_norm": 1.326301909937967, + "learning_rate": 7.416289008509179e-06, + "loss": 0.3601, + "step": 12387 + }, + { + "epoch": 0.36, + "grad_norm": 1.4470933266621422, + "learning_rate": 7.415877779087697e-06, + "loss": 0.365, + "step": 12388 + }, + { + "epoch": 0.36, + "grad_norm": 1.2960563392530418, + "learning_rate": 7.415466528346068e-06, + "loss": 0.338, + "step": 12389 + }, + { + "epoch": 0.36, + "grad_norm": 1.2453640546939875, + "learning_rate": 7.41505525628792e-06, + "loss": 0.3361, + "step": 12390 + }, + { + "epoch": 0.36, + "grad_norm": 1.445213182661882, + "learning_rate": 7.414643962916883e-06, + "loss": 0.3259, + "step": 12391 + }, + { + "epoch": 0.36, + "grad_norm": 0.9300009075024617, + "learning_rate": 7.4142326482365856e-06, + "loss": 0.5354, + "step": 12392 + }, + { + "epoch": 0.36, + "grad_norm": 1.1540731048829085, + "learning_rate": 7.413821312250658e-06, + "loss": 0.3192, + "step": 12393 + }, + { + "epoch": 0.36, + "grad_norm": 1.2924091545814167, + "learning_rate": 7.41340995496273e-06, + "loss": 0.3457, + "step": 12394 + }, + { + "epoch": 0.36, + "grad_norm": 1.4817678002613368, + "learning_rate": 7.412998576376435e-06, + "loss": 0.3195, + "step": 12395 + }, + { + "epoch": 0.36, + "grad_norm": 1.3666447858882522, + "learning_rate": 7.412587176495399e-06, + "loss": 0.343, + "step": 12396 + }, + { + "epoch": 0.36, + "grad_norm": 1.2448838542462204, + "learning_rate": 7.412175755323254e-06, + "loss": 0.3235, + "step": 12397 + }, + { + "epoch": 0.36, + "grad_norm": 1.3346548688617363, + "learning_rate": 7.4117643128636315e-06, + "loss": 0.348, + "step": 12398 + }, + { + "epoch": 0.36, + "grad_norm": 1.4277419640496143, + "learning_rate": 7.411352849120163e-06, + "loss": 0.3266, + "step": 12399 + }, + { + "epoch": 0.36, + "grad_norm": 1.1839406858125152, + "learning_rate": 7.410941364096479e-06, + "loss": 0.3263, + "step": 12400 + }, + { + "epoch": 0.36, + "grad_norm": 1.272040282178429, + "learning_rate": 7.410529857796209e-06, + "loss": 0.3357, + "step": 12401 + }, + { + "epoch": 0.36, + "grad_norm": 1.3492895905581936, + "learning_rate": 7.410118330222987e-06, + "loss": 0.3431, + "step": 12402 + }, + { + "epoch": 0.36, + "grad_norm": 1.518530768854871, + "learning_rate": 7.409706781380444e-06, + "loss": 0.3238, + "step": 12403 + }, + { + "epoch": 0.36, + "grad_norm": 1.209410655393962, + "learning_rate": 7.409295211272213e-06, + "loss": 0.3136, + "step": 12404 + }, + { + "epoch": 0.36, + "grad_norm": 1.3128805951758216, + "learning_rate": 7.408883619901924e-06, + "loss": 0.3216, + "step": 12405 + }, + { + "epoch": 0.36, + "grad_norm": 1.3697578221952185, + "learning_rate": 7.4084720072732106e-06, + "loss": 0.332, + "step": 12406 + }, + { + "epoch": 0.36, + "grad_norm": 1.3952159168594929, + "learning_rate": 7.408060373389705e-06, + "loss": 0.3445, + "step": 12407 + }, + { + "epoch": 0.36, + "grad_norm": 1.551582211986883, + "learning_rate": 7.407648718255038e-06, + "loss": 0.3689, + "step": 12408 + }, + { + "epoch": 0.36, + "grad_norm": 1.3597436146106614, + "learning_rate": 7.407237041872848e-06, + "loss": 0.3256, + "step": 12409 + }, + { + "epoch": 0.36, + "grad_norm": 1.346756289850571, + "learning_rate": 7.406825344246763e-06, + "loss": 0.3279, + "step": 12410 + }, + { + "epoch": 0.36, + "grad_norm": 1.3250564095844524, + "learning_rate": 7.4064136253804174e-06, + "loss": 0.3216, + "step": 12411 + }, + { + "epoch": 0.36, + "grad_norm": 1.5540679393014754, + "learning_rate": 7.406001885277444e-06, + "loss": 0.3427, + "step": 12412 + }, + { + "epoch": 0.36, + "grad_norm": 1.3969031583103872, + "learning_rate": 7.405590123941478e-06, + "loss": 0.3558, + "step": 12413 + }, + { + "epoch": 0.36, + "grad_norm": 1.3642167501691853, + "learning_rate": 7.405178341376154e-06, + "loss": 0.3031, + "step": 12414 + }, + { + "epoch": 0.36, + "grad_norm": 1.2591907349869333, + "learning_rate": 7.404766537585102e-06, + "loss": 0.3108, + "step": 12415 + }, + { + "epoch": 0.36, + "grad_norm": 1.4211554652475369, + "learning_rate": 7.404354712571958e-06, + "loss": 0.3328, + "step": 12416 + }, + { + "epoch": 0.36, + "grad_norm": 1.3257360297645193, + "learning_rate": 7.403942866340359e-06, + "loss": 0.3304, + "step": 12417 + }, + { + "epoch": 0.36, + "grad_norm": 2.459136563597161, + "learning_rate": 7.403530998893938e-06, + "loss": 0.3322, + "step": 12418 + }, + { + "epoch": 0.36, + "grad_norm": 1.3108060249864362, + "learning_rate": 7.403119110236328e-06, + "loss": 0.341, + "step": 12419 + }, + { + "epoch": 0.36, + "grad_norm": 1.3293725631169908, + "learning_rate": 7.402707200371165e-06, + "loss": 0.3249, + "step": 12420 + }, + { + "epoch": 0.36, + "grad_norm": 1.3392801618620052, + "learning_rate": 7.402295269302084e-06, + "loss": 0.3399, + "step": 12421 + }, + { + "epoch": 0.36, + "grad_norm": 1.3313462367695044, + "learning_rate": 7.401883317032723e-06, + "loss": 0.3384, + "step": 12422 + }, + { + "epoch": 0.36, + "grad_norm": 2.847734407710481, + "learning_rate": 7.401471343566713e-06, + "loss": 0.3269, + "step": 12423 + }, + { + "epoch": 0.36, + "grad_norm": 1.4337049625081593, + "learning_rate": 7.401059348907693e-06, + "loss": 0.3292, + "step": 12424 + }, + { + "epoch": 0.36, + "grad_norm": 1.4126164216992152, + "learning_rate": 7.400647333059296e-06, + "loss": 0.3225, + "step": 12425 + }, + { + "epoch": 0.36, + "grad_norm": 1.1870244415328004, + "learning_rate": 7.40023529602516e-06, + "loss": 0.3217, + "step": 12426 + }, + { + "epoch": 0.36, + "grad_norm": 1.4112133617709657, + "learning_rate": 7.399823237808922e-06, + "loss": 0.3672, + "step": 12427 + }, + { + "epoch": 0.36, + "grad_norm": 1.2501578410986152, + "learning_rate": 7.399411158414217e-06, + "loss": 0.3219, + "step": 12428 + }, + { + "epoch": 0.36, + "grad_norm": 1.2348189560150373, + "learning_rate": 7.398999057844682e-06, + "loss": 0.3207, + "step": 12429 + }, + { + "epoch": 0.36, + "grad_norm": 1.2822797620946673, + "learning_rate": 7.398586936103953e-06, + "loss": 0.3434, + "step": 12430 + }, + { + "epoch": 0.36, + "grad_norm": 1.3908881459613012, + "learning_rate": 7.398174793195667e-06, + "loss": 0.3448, + "step": 12431 + }, + { + "epoch": 0.36, + "grad_norm": 1.2753544305251516, + "learning_rate": 7.397762629123464e-06, + "loss": 0.3296, + "step": 12432 + }, + { + "epoch": 0.36, + "grad_norm": 1.2362227992885884, + "learning_rate": 7.3973504438909775e-06, + "loss": 0.3492, + "step": 12433 + }, + { + "epoch": 0.36, + "grad_norm": 1.7383673892786922, + "learning_rate": 7.3969382375018475e-06, + "loss": 0.3331, + "step": 12434 + }, + { + "epoch": 0.36, + "grad_norm": 0.9529785477044221, + "learning_rate": 7.39652600995971e-06, + "loss": 0.5759, + "step": 12435 + }, + { + "epoch": 0.36, + "grad_norm": 1.2855299905299917, + "learning_rate": 7.396113761268206e-06, + "loss": 0.3664, + "step": 12436 + }, + { + "epoch": 0.36, + "grad_norm": 1.434530534428598, + "learning_rate": 7.3957014914309684e-06, + "loss": 0.323, + "step": 12437 + }, + { + "epoch": 0.36, + "grad_norm": 1.4854156861192112, + "learning_rate": 7.395289200451642e-06, + "loss": 0.3377, + "step": 12438 + }, + { + "epoch": 0.36, + "grad_norm": 1.3113499245592615, + "learning_rate": 7.39487688833386e-06, + "loss": 0.3293, + "step": 12439 + }, + { + "epoch": 0.36, + "grad_norm": 1.4551396108015668, + "learning_rate": 7.394464555081262e-06, + "loss": 0.3388, + "step": 12440 + }, + { + "epoch": 0.36, + "grad_norm": 1.3801780976248739, + "learning_rate": 7.3940522006974904e-06, + "loss": 0.3326, + "step": 12441 + }, + { + "epoch": 0.36, + "grad_norm": 1.2557696295291527, + "learning_rate": 7.3936398251861784e-06, + "loss": 0.3423, + "step": 12442 + }, + { + "epoch": 0.36, + "grad_norm": 1.2182411071113153, + "learning_rate": 7.39322742855097e-06, + "loss": 0.3378, + "step": 12443 + }, + { + "epoch": 0.36, + "grad_norm": 1.1611022120662249, + "learning_rate": 7.392815010795502e-06, + "loss": 0.3374, + "step": 12444 + }, + { + "epoch": 0.36, + "grad_norm": 1.6089534680277948, + "learning_rate": 7.392402571923416e-06, + "loss": 0.3883, + "step": 12445 + }, + { + "epoch": 0.36, + "grad_norm": 1.3725190284458277, + "learning_rate": 7.3919901119383515e-06, + "loss": 0.3276, + "step": 12446 + }, + { + "epoch": 0.36, + "grad_norm": 1.2198373689142354, + "learning_rate": 7.391577630843947e-06, + "loss": 0.3395, + "step": 12447 + }, + { + "epoch": 0.36, + "grad_norm": 1.2601949184969776, + "learning_rate": 7.391165128643843e-06, + "loss": 0.3212, + "step": 12448 + }, + { + "epoch": 0.36, + "grad_norm": 1.3020491494545887, + "learning_rate": 7.39075260534168e-06, + "loss": 0.3464, + "step": 12449 + }, + { + "epoch": 0.36, + "grad_norm": 1.3145401483258512, + "learning_rate": 7.390340060941099e-06, + "loss": 0.4011, + "step": 12450 + }, + { + "epoch": 0.36, + "grad_norm": 1.9499637870351239, + "learning_rate": 7.389927495445741e-06, + "loss": 0.358, + "step": 12451 + }, + { + "epoch": 0.36, + "grad_norm": 1.2446353088781517, + "learning_rate": 7.389514908859247e-06, + "loss": 0.3179, + "step": 12452 + }, + { + "epoch": 0.36, + "grad_norm": 0.9245234856790558, + "learning_rate": 7.389102301185256e-06, + "loss": 0.6214, + "step": 12453 + }, + { + "epoch": 0.36, + "grad_norm": 1.5025175196398417, + "learning_rate": 7.38868967242741e-06, + "loss": 0.3492, + "step": 12454 + }, + { + "epoch": 0.36, + "grad_norm": 1.3553051786523183, + "learning_rate": 7.388277022589353e-06, + "loss": 0.3464, + "step": 12455 + }, + { + "epoch": 0.36, + "grad_norm": 1.3293210206086419, + "learning_rate": 7.3878643516747236e-06, + "loss": 0.3296, + "step": 12456 + }, + { + "epoch": 0.36, + "grad_norm": 1.7229314600944319, + "learning_rate": 7.387451659687166e-06, + "loss": 0.3291, + "step": 12457 + }, + { + "epoch": 0.36, + "grad_norm": 1.5377954351192387, + "learning_rate": 7.38703894663032e-06, + "loss": 0.3253, + "step": 12458 + }, + { + "epoch": 0.36, + "grad_norm": 1.3523886437861876, + "learning_rate": 7.386626212507829e-06, + "loss": 0.3222, + "step": 12459 + }, + { + "epoch": 0.36, + "grad_norm": 1.3454017701137173, + "learning_rate": 7.386213457323334e-06, + "loss": 0.3345, + "step": 12460 + }, + { + "epoch": 0.36, + "grad_norm": 1.3161297820348816, + "learning_rate": 7.385800681080481e-06, + "loss": 0.337, + "step": 12461 + }, + { + "epoch": 0.36, + "grad_norm": 1.256969747393856, + "learning_rate": 7.385387883782908e-06, + "loss": 0.3403, + "step": 12462 + }, + { + "epoch": 0.36, + "grad_norm": 1.2550321492146, + "learning_rate": 7.384975065434264e-06, + "loss": 0.3299, + "step": 12463 + }, + { + "epoch": 0.36, + "grad_norm": 1.3481564600425986, + "learning_rate": 7.3845622260381855e-06, + "loss": 0.3676, + "step": 12464 + }, + { + "epoch": 0.36, + "grad_norm": 1.3785340860897253, + "learning_rate": 7.38414936559832e-06, + "loss": 0.3317, + "step": 12465 + }, + { + "epoch": 0.36, + "grad_norm": 1.2805305037215016, + "learning_rate": 7.383736484118311e-06, + "loss": 0.3568, + "step": 12466 + }, + { + "epoch": 0.36, + "grad_norm": 1.2905286478397173, + "learning_rate": 7.383323581601799e-06, + "loss": 0.3374, + "step": 12467 + }, + { + "epoch": 0.36, + "grad_norm": 1.2415580863542963, + "learning_rate": 7.382910658052431e-06, + "loss": 0.3361, + "step": 12468 + }, + { + "epoch": 0.36, + "grad_norm": 1.6938304603133072, + "learning_rate": 7.382497713473851e-06, + "loss": 0.3391, + "step": 12469 + }, + { + "epoch": 0.36, + "grad_norm": 1.2058829676119547, + "learning_rate": 7.3820847478697e-06, + "loss": 0.3311, + "step": 12470 + }, + { + "epoch": 0.36, + "grad_norm": 1.4221037941567563, + "learning_rate": 7.3816717612436265e-06, + "loss": 0.336, + "step": 12471 + }, + { + "epoch": 0.36, + "grad_norm": 1.3307738024948281, + "learning_rate": 7.381258753599272e-06, + "loss": 0.3253, + "step": 12472 + }, + { + "epoch": 0.36, + "grad_norm": 1.4588074522060273, + "learning_rate": 7.3808457249402845e-06, + "loss": 0.3561, + "step": 12473 + }, + { + "epoch": 0.36, + "grad_norm": 1.2969911136544134, + "learning_rate": 7.380432675270305e-06, + "loss": 0.3303, + "step": 12474 + }, + { + "epoch": 0.36, + "grad_norm": 1.4450162544101335, + "learning_rate": 7.380019604592981e-06, + "loss": 0.3388, + "step": 12475 + }, + { + "epoch": 0.36, + "grad_norm": 1.3096588705641694, + "learning_rate": 7.3796065129119575e-06, + "loss": 0.356, + "step": 12476 + }, + { + "epoch": 0.36, + "grad_norm": 1.5588572399249938, + "learning_rate": 7.379193400230882e-06, + "loss": 0.3213, + "step": 12477 + }, + { + "epoch": 0.36, + "grad_norm": 1.3161703530953204, + "learning_rate": 7.378780266553397e-06, + "loss": 0.3465, + "step": 12478 + }, + { + "epoch": 0.36, + "grad_norm": 1.2362534282311548, + "learning_rate": 7.378367111883149e-06, + "loss": 0.3344, + "step": 12479 + }, + { + "epoch": 0.36, + "grad_norm": 1.5038772467973598, + "learning_rate": 7.377953936223786e-06, + "loss": 0.3496, + "step": 12480 + }, + { + "epoch": 0.36, + "grad_norm": 1.2244678027342386, + "learning_rate": 7.3775407395789535e-06, + "loss": 0.3359, + "step": 12481 + }, + { + "epoch": 0.36, + "grad_norm": 1.543776583956292, + "learning_rate": 7.377127521952297e-06, + "loss": 0.3457, + "step": 12482 + }, + { + "epoch": 0.36, + "grad_norm": 1.2306966990254755, + "learning_rate": 7.376714283347463e-06, + "loss": 0.3288, + "step": 12483 + }, + { + "epoch": 0.36, + "grad_norm": 1.524453112712199, + "learning_rate": 7.3763010237681e-06, + "loss": 0.3395, + "step": 12484 + }, + { + "epoch": 0.36, + "grad_norm": 1.2034970591988656, + "learning_rate": 7.3758877432178536e-06, + "loss": 0.3464, + "step": 12485 + }, + { + "epoch": 0.36, + "grad_norm": 1.4369875748039924, + "learning_rate": 7.3754744417003725e-06, + "loss": 0.3488, + "step": 12486 + }, + { + "epoch": 0.36, + "grad_norm": 1.2804785442387427, + "learning_rate": 7.375061119219302e-06, + "loss": 0.3613, + "step": 12487 + }, + { + "epoch": 0.36, + "grad_norm": 1.2410860206748302, + "learning_rate": 7.374647775778291e-06, + "loss": 0.3384, + "step": 12488 + }, + { + "epoch": 0.36, + "grad_norm": 1.3970208504435015, + "learning_rate": 7.374234411380987e-06, + "loss": 0.3195, + "step": 12489 + }, + { + "epoch": 0.36, + "grad_norm": 1.335907386278743, + "learning_rate": 7.373821026031038e-06, + "loss": 0.3262, + "step": 12490 + }, + { + "epoch": 0.36, + "grad_norm": 1.4340849293826352, + "learning_rate": 7.3734076197320925e-06, + "loss": 0.3508, + "step": 12491 + }, + { + "epoch": 0.36, + "grad_norm": 1.9077888851171885, + "learning_rate": 7.372994192487798e-06, + "loss": 0.3367, + "step": 12492 + }, + { + "epoch": 0.36, + "grad_norm": 1.2748059248025683, + "learning_rate": 7.372580744301804e-06, + "loss": 0.344, + "step": 12493 + }, + { + "epoch": 0.36, + "grad_norm": 2.252515069567373, + "learning_rate": 7.372167275177759e-06, + "loss": 0.3535, + "step": 12494 + }, + { + "epoch": 0.36, + "grad_norm": 1.4675282522225341, + "learning_rate": 7.37175378511931e-06, + "loss": 0.3445, + "step": 12495 + }, + { + "epoch": 0.36, + "grad_norm": 1.2987517878146315, + "learning_rate": 7.371340274130109e-06, + "loss": 0.36, + "step": 12496 + }, + { + "epoch": 0.36, + "grad_norm": 1.2694402534165001, + "learning_rate": 7.370926742213802e-06, + "loss": 0.3477, + "step": 12497 + }, + { + "epoch": 0.36, + "grad_norm": 1.264404878382372, + "learning_rate": 7.370513189374041e-06, + "loss": 0.343, + "step": 12498 + }, + { + "epoch": 0.36, + "grad_norm": 1.3410729450733379, + "learning_rate": 7.370099615614476e-06, + "loss": 0.3461, + "step": 12499 + }, + { + "epoch": 0.36, + "grad_norm": 1.3176973076441194, + "learning_rate": 7.369686020938754e-06, + "loss": 0.3724, + "step": 12500 + }, + { + "epoch": 0.36, + "grad_norm": 1.4521003710965283, + "learning_rate": 7.369272405350527e-06, + "loss": 0.3408, + "step": 12501 + }, + { + "epoch": 0.36, + "grad_norm": 1.3882477968447633, + "learning_rate": 7.368858768853445e-06, + "loss": 0.3273, + "step": 12502 + }, + { + "epoch": 0.36, + "grad_norm": 1.288868138050252, + "learning_rate": 7.368445111451158e-06, + "loss": 0.3392, + "step": 12503 + }, + { + "epoch": 0.36, + "grad_norm": 1.393059761853651, + "learning_rate": 7.3680314331473156e-06, + "loss": 0.3739, + "step": 12504 + }, + { + "epoch": 0.36, + "grad_norm": 1.2800063763735303, + "learning_rate": 7.367617733945571e-06, + "loss": 0.3348, + "step": 12505 + }, + { + "epoch": 0.36, + "grad_norm": 1.3958116510210454, + "learning_rate": 7.367204013849574e-06, + "loss": 0.3501, + "step": 12506 + }, + { + "epoch": 0.36, + "grad_norm": 1.3506051877037897, + "learning_rate": 7.366790272862974e-06, + "loss": 0.3521, + "step": 12507 + }, + { + "epoch": 0.36, + "grad_norm": 1.362215353327184, + "learning_rate": 7.366376510989425e-06, + "loss": 0.3274, + "step": 12508 + }, + { + "epoch": 0.36, + "grad_norm": 1.3410954477627859, + "learning_rate": 7.365962728232576e-06, + "loss": 0.3474, + "step": 12509 + }, + { + "epoch": 0.36, + "grad_norm": 1.1907041908334708, + "learning_rate": 7.36554892459608e-06, + "loss": 0.3289, + "step": 12510 + }, + { + "epoch": 0.36, + "grad_norm": 1.5089426789092868, + "learning_rate": 7.365135100083588e-06, + "loss": 0.3615, + "step": 12511 + }, + { + "epoch": 0.36, + "grad_norm": 1.2235537630868838, + "learning_rate": 7.364721254698752e-06, + "loss": 0.338, + "step": 12512 + }, + { + "epoch": 0.36, + "grad_norm": 1.2960079304109413, + "learning_rate": 7.364307388445227e-06, + "loss": 0.3521, + "step": 12513 + }, + { + "epoch": 0.36, + "grad_norm": 1.3526790315170132, + "learning_rate": 7.363893501326661e-06, + "loss": 0.3371, + "step": 12514 + }, + { + "epoch": 0.36, + "grad_norm": 1.408191936313451, + "learning_rate": 7.36347959334671e-06, + "loss": 0.3456, + "step": 12515 + }, + { + "epoch": 0.36, + "grad_norm": 2.8783497232724016, + "learning_rate": 7.363065664509023e-06, + "loss": 0.3325, + "step": 12516 + }, + { + "epoch": 0.36, + "grad_norm": 1.4221198072702337, + "learning_rate": 7.362651714817257e-06, + "loss": 0.3842, + "step": 12517 + }, + { + "epoch": 0.36, + "grad_norm": 1.2952402096021314, + "learning_rate": 7.362237744275065e-06, + "loss": 0.3265, + "step": 12518 + }, + { + "epoch": 0.36, + "grad_norm": 1.282192001848863, + "learning_rate": 7.361823752886096e-06, + "loss": 0.3241, + "step": 12519 + }, + { + "epoch": 0.36, + "grad_norm": 1.417973397222984, + "learning_rate": 7.3614097406540065e-06, + "loss": 0.3386, + "step": 12520 + }, + { + "epoch": 0.36, + "grad_norm": 1.3395012804631876, + "learning_rate": 7.3609957075824505e-06, + "loss": 0.3094, + "step": 12521 + }, + { + "epoch": 0.36, + "grad_norm": 1.4070636393860831, + "learning_rate": 7.360581653675082e-06, + "loss": 0.3481, + "step": 12522 + }, + { + "epoch": 0.36, + "grad_norm": 1.2031277715520443, + "learning_rate": 7.360167578935553e-06, + "loss": 0.3158, + "step": 12523 + }, + { + "epoch": 0.36, + "grad_norm": 1.3449142094531419, + "learning_rate": 7.35975348336752e-06, + "loss": 0.3373, + "step": 12524 + }, + { + "epoch": 0.36, + "grad_norm": 1.2613994845872858, + "learning_rate": 7.359339366974636e-06, + "loss": 0.3282, + "step": 12525 + }, + { + "epoch": 0.36, + "grad_norm": 1.4457451686783895, + "learning_rate": 7.358925229760555e-06, + "loss": 0.3381, + "step": 12526 + }, + { + "epoch": 0.36, + "grad_norm": 1.308047399352952, + "learning_rate": 7.358511071728933e-06, + "loss": 0.3483, + "step": 12527 + }, + { + "epoch": 0.36, + "grad_norm": 1.4310097965728947, + "learning_rate": 7.358096892883427e-06, + "loss": 0.3326, + "step": 12528 + }, + { + "epoch": 0.36, + "grad_norm": 1.2036771889376465, + "learning_rate": 7.3576826932276855e-06, + "loss": 0.3254, + "step": 12529 + }, + { + "epoch": 0.36, + "grad_norm": 1.1218219569180505, + "learning_rate": 7.357268472765371e-06, + "loss": 0.339, + "step": 12530 + }, + { + "epoch": 0.36, + "grad_norm": 1.243552614414739, + "learning_rate": 7.356854231500134e-06, + "loss": 0.3295, + "step": 12531 + }, + { + "epoch": 0.36, + "grad_norm": 1.3047079050877564, + "learning_rate": 7.356439969435635e-06, + "loss": 0.3407, + "step": 12532 + }, + { + "epoch": 0.36, + "grad_norm": 1.368085206778791, + "learning_rate": 7.356025686575526e-06, + "loss": 0.3788, + "step": 12533 + }, + { + "epoch": 0.36, + "grad_norm": 1.268904788628113, + "learning_rate": 7.355611382923464e-06, + "loss": 0.3594, + "step": 12534 + }, + { + "epoch": 0.36, + "grad_norm": 1.3598898488074052, + "learning_rate": 7.355197058483103e-06, + "loss": 0.3196, + "step": 12535 + }, + { + "epoch": 0.36, + "grad_norm": 1.3005940530718472, + "learning_rate": 7.354782713258106e-06, + "loss": 0.3435, + "step": 12536 + }, + { + "epoch": 0.36, + "grad_norm": 1.3142732806728663, + "learning_rate": 7.354368347252124e-06, + "loss": 0.3151, + "step": 12537 + }, + { + "epoch": 0.36, + "grad_norm": 1.2731276165914227, + "learning_rate": 7.353953960468815e-06, + "loss": 0.3642, + "step": 12538 + }, + { + "epoch": 0.36, + "grad_norm": 1.23401236324413, + "learning_rate": 7.353539552911835e-06, + "loss": 0.3186, + "step": 12539 + }, + { + "epoch": 0.36, + "grad_norm": 0.9974348587483584, + "learning_rate": 7.353125124584843e-06, + "loss": 0.6578, + "step": 12540 + }, + { + "epoch": 0.36, + "grad_norm": 1.8906317327573225, + "learning_rate": 7.352710675491497e-06, + "loss": 0.3295, + "step": 12541 + }, + { + "epoch": 0.36, + "grad_norm": 1.219477982395346, + "learning_rate": 7.352296205635452e-06, + "loss": 0.3244, + "step": 12542 + }, + { + "epoch": 0.36, + "grad_norm": 4.022332627310475, + "learning_rate": 7.351881715020367e-06, + "loss": 0.3431, + "step": 12543 + }, + { + "epoch": 0.36, + "grad_norm": 1.3069339028688511, + "learning_rate": 7.351467203649899e-06, + "loss": 0.3522, + "step": 12544 + }, + { + "epoch": 0.36, + "grad_norm": 2.252547703135682, + "learning_rate": 7.351052671527707e-06, + "loss": 0.3257, + "step": 12545 + }, + { + "epoch": 0.36, + "grad_norm": 1.1737410651646145, + "learning_rate": 7.350638118657452e-06, + "loss": 0.3238, + "step": 12546 + }, + { + "epoch": 0.36, + "grad_norm": 1.362122906806194, + "learning_rate": 7.350223545042786e-06, + "loss": 0.3255, + "step": 12547 + }, + { + "epoch": 0.36, + "grad_norm": 1.3331929362487307, + "learning_rate": 7.349808950687372e-06, + "loss": 0.341, + "step": 12548 + }, + { + "epoch": 0.36, + "grad_norm": 1.2774805880566709, + "learning_rate": 7.3493943355948685e-06, + "loss": 0.3841, + "step": 12549 + }, + { + "epoch": 0.36, + "grad_norm": 1.292870798437802, + "learning_rate": 7.348979699768934e-06, + "loss": 0.3692, + "step": 12550 + }, + { + "epoch": 0.36, + "grad_norm": 2.4323995486111003, + "learning_rate": 7.3485650432132284e-06, + "loss": 0.3154, + "step": 12551 + }, + { + "epoch": 0.36, + "grad_norm": 1.53158439607607, + "learning_rate": 7.348150365931409e-06, + "loss": 0.3387, + "step": 12552 + }, + { + "epoch": 0.36, + "grad_norm": 1.2761981742662347, + "learning_rate": 7.347735667927136e-06, + "loss": 0.3393, + "step": 12553 + }, + { + "epoch": 0.36, + "grad_norm": 1.3471930235544987, + "learning_rate": 7.347320949204071e-06, + "loss": 0.3304, + "step": 12554 + }, + { + "epoch": 0.36, + "grad_norm": 1.3113729246866908, + "learning_rate": 7.346906209765873e-06, + "loss": 0.3302, + "step": 12555 + }, + { + "epoch": 0.36, + "grad_norm": 1.2111789881822232, + "learning_rate": 7.3464914496162e-06, + "loss": 0.3163, + "step": 12556 + }, + { + "epoch": 0.36, + "grad_norm": 1.2775523573422676, + "learning_rate": 7.346076668758715e-06, + "loss": 0.3117, + "step": 12557 + }, + { + "epoch": 0.36, + "grad_norm": 1.3093378309717187, + "learning_rate": 7.345661867197076e-06, + "loss": 0.3419, + "step": 12558 + }, + { + "epoch": 0.36, + "grad_norm": 1.3830050081211598, + "learning_rate": 7.345247044934947e-06, + "loss": 0.3331, + "step": 12559 + }, + { + "epoch": 0.36, + "grad_norm": 1.4194183972734382, + "learning_rate": 7.344832201975986e-06, + "loss": 0.3402, + "step": 12560 + }, + { + "epoch": 0.36, + "grad_norm": 1.6765030253230815, + "learning_rate": 7.344417338323855e-06, + "loss": 0.3442, + "step": 12561 + }, + { + "epoch": 0.36, + "grad_norm": 1.4107644130697978, + "learning_rate": 7.344002453982213e-06, + "loss": 0.3267, + "step": 12562 + }, + { + "epoch": 0.36, + "grad_norm": 1.2224059934181846, + "learning_rate": 7.343587548954725e-06, + "loss": 0.3201, + "step": 12563 + }, + { + "epoch": 0.36, + "grad_norm": 1.1998578182581494, + "learning_rate": 7.3431726232450515e-06, + "loss": 0.3335, + "step": 12564 + }, + { + "epoch": 0.36, + "grad_norm": 1.6152947359818166, + "learning_rate": 7.342757676856854e-06, + "loss": 0.3574, + "step": 12565 + }, + { + "epoch": 0.36, + "grad_norm": 1.1612429760342824, + "learning_rate": 7.3423427097937925e-06, + "loss": 0.3232, + "step": 12566 + }, + { + "epoch": 0.36, + "grad_norm": 1.4028685956118292, + "learning_rate": 7.341927722059531e-06, + "loss": 0.3391, + "step": 12567 + }, + { + "epoch": 0.36, + "grad_norm": 1.2496178290681594, + "learning_rate": 7.341512713657731e-06, + "loss": 0.33, + "step": 12568 + }, + { + "epoch": 0.36, + "grad_norm": 1.4279799167773886, + "learning_rate": 7.341097684592057e-06, + "loss": 0.3097, + "step": 12569 + }, + { + "epoch": 0.36, + "grad_norm": 1.2444917719614812, + "learning_rate": 7.340682634866168e-06, + "loss": 0.3813, + "step": 12570 + }, + { + "epoch": 0.36, + "grad_norm": 1.554582319415482, + "learning_rate": 7.34026756448373e-06, + "loss": 0.3346, + "step": 12571 + }, + { + "epoch": 0.36, + "grad_norm": 1.286280581499577, + "learning_rate": 7.339852473448404e-06, + "loss": 0.3324, + "step": 12572 + }, + { + "epoch": 0.36, + "grad_norm": 1.278377438069859, + "learning_rate": 7.339437361763855e-06, + "loss": 0.3584, + "step": 12573 + }, + { + "epoch": 0.36, + "grad_norm": 1.2504963447098485, + "learning_rate": 7.339022229433743e-06, + "loss": 0.3472, + "step": 12574 + }, + { + "epoch": 0.36, + "grad_norm": 1.3175633593616582, + "learning_rate": 7.338607076461736e-06, + "loss": 0.3392, + "step": 12575 + }, + { + "epoch": 0.36, + "grad_norm": 1.3877833570355997, + "learning_rate": 7.338191902851495e-06, + "loss": 0.3312, + "step": 12576 + }, + { + "epoch": 0.36, + "grad_norm": 1.3564309708828326, + "learning_rate": 7.337776708606684e-06, + "loss": 0.3357, + "step": 12577 + }, + { + "epoch": 0.36, + "grad_norm": 1.870404647617725, + "learning_rate": 7.337361493730969e-06, + "loss": 0.3449, + "step": 12578 + }, + { + "epoch": 0.36, + "grad_norm": 1.4683788221552545, + "learning_rate": 7.336946258228013e-06, + "loss": 0.34, + "step": 12579 + }, + { + "epoch": 0.36, + "grad_norm": 1.3647034744665818, + "learning_rate": 7.336531002101479e-06, + "loss": 0.3397, + "step": 12580 + }, + { + "epoch": 0.36, + "grad_norm": 1.3502758374606756, + "learning_rate": 7.336115725355033e-06, + "loss": 0.3187, + "step": 12581 + }, + { + "epoch": 0.36, + "grad_norm": 1.3035881174067796, + "learning_rate": 7.3357004279923405e-06, + "loss": 0.3351, + "step": 12582 + }, + { + "epoch": 0.36, + "grad_norm": 1.296830779729428, + "learning_rate": 7.335285110017066e-06, + "loss": 0.3234, + "step": 12583 + }, + { + "epoch": 0.36, + "grad_norm": 1.3530757536759281, + "learning_rate": 7.334869771432873e-06, + "loss": 0.341, + "step": 12584 + }, + { + "epoch": 0.37, + "grad_norm": 1.2898066000166482, + "learning_rate": 7.3344544122434305e-06, + "loss": 0.3384, + "step": 12585 + }, + { + "epoch": 0.37, + "grad_norm": 1.2363208785883721, + "learning_rate": 7.3340390324524e-06, + "loss": 0.3372, + "step": 12586 + }, + { + "epoch": 0.37, + "grad_norm": 1.2604233767256117, + "learning_rate": 7.333623632063451e-06, + "loss": 0.3346, + "step": 12587 + }, + { + "epoch": 0.37, + "grad_norm": 1.270329925817329, + "learning_rate": 7.333208211080246e-06, + "loss": 0.3437, + "step": 12588 + }, + { + "epoch": 0.37, + "grad_norm": 1.2798894160692487, + "learning_rate": 7.332792769506453e-06, + "loss": 0.3487, + "step": 12589 + }, + { + "epoch": 0.37, + "grad_norm": 1.2854516415623576, + "learning_rate": 7.332377307345737e-06, + "loss": 0.3524, + "step": 12590 + }, + { + "epoch": 0.37, + "grad_norm": 1.2717330158411249, + "learning_rate": 7.331961824601768e-06, + "loss": 0.3386, + "step": 12591 + }, + { + "epoch": 0.37, + "grad_norm": 1.3115406946011496, + "learning_rate": 7.331546321278209e-06, + "loss": 0.3373, + "step": 12592 + }, + { + "epoch": 0.37, + "grad_norm": 1.4190057683602553, + "learning_rate": 7.331130797378727e-06, + "loss": 0.3516, + "step": 12593 + }, + { + "epoch": 0.37, + "grad_norm": 1.2907656238620142, + "learning_rate": 7.33071525290699e-06, + "loss": 0.3256, + "step": 12594 + }, + { + "epoch": 0.37, + "grad_norm": 1.2567408777082238, + "learning_rate": 7.330299687866666e-06, + "loss": 0.3375, + "step": 12595 + }, + { + "epoch": 0.37, + "grad_norm": 1.2819181252585596, + "learning_rate": 7.32988410226142e-06, + "loss": 0.3239, + "step": 12596 + }, + { + "epoch": 0.37, + "grad_norm": 2.033091154877332, + "learning_rate": 7.329468496094923e-06, + "loss": 0.3459, + "step": 12597 + }, + { + "epoch": 0.37, + "grad_norm": 1.3504378540775843, + "learning_rate": 7.3290528693708385e-06, + "loss": 0.3335, + "step": 12598 + }, + { + "epoch": 0.37, + "grad_norm": 1.5354342858336159, + "learning_rate": 7.328637222092837e-06, + "loss": 0.3465, + "step": 12599 + }, + { + "epoch": 0.37, + "grad_norm": 1.3125780402138747, + "learning_rate": 7.3282215542645864e-06, + "loss": 0.3188, + "step": 12600 + }, + { + "epoch": 0.37, + "grad_norm": 1.3121271495159246, + "learning_rate": 7.3278058658897555e-06, + "loss": 0.3424, + "step": 12601 + }, + { + "epoch": 0.37, + "grad_norm": 1.406537940679953, + "learning_rate": 7.327390156972011e-06, + "loss": 0.3469, + "step": 12602 + }, + { + "epoch": 0.37, + "grad_norm": 1.1805258595286932, + "learning_rate": 7.326974427515023e-06, + "loss": 0.3289, + "step": 12603 + }, + { + "epoch": 0.37, + "grad_norm": 1.4977384718352915, + "learning_rate": 7.3265586775224595e-06, + "loss": 0.313, + "step": 12604 + }, + { + "epoch": 0.37, + "grad_norm": 1.7323854816579545, + "learning_rate": 7.326142906997991e-06, + "loss": 0.371, + "step": 12605 + }, + { + "epoch": 0.37, + "grad_norm": 1.4444201026180128, + "learning_rate": 7.325727115945285e-06, + "loss": 0.3511, + "step": 12606 + }, + { + "epoch": 0.37, + "grad_norm": 1.2605240441777994, + "learning_rate": 7.325311304368012e-06, + "loss": 0.3219, + "step": 12607 + }, + { + "epoch": 0.37, + "grad_norm": 1.5283437027817421, + "learning_rate": 7.32489547226984e-06, + "loss": 0.3487, + "step": 12608 + }, + { + "epoch": 0.37, + "grad_norm": 1.2565470667915888, + "learning_rate": 7.324479619654439e-06, + "loss": 0.3664, + "step": 12609 + }, + { + "epoch": 0.37, + "grad_norm": 1.2826695477868169, + "learning_rate": 7.32406374652548e-06, + "loss": 0.3538, + "step": 12610 + }, + { + "epoch": 0.37, + "grad_norm": 1.2232829654806916, + "learning_rate": 7.323647852886633e-06, + "loss": 0.3283, + "step": 12611 + }, + { + "epoch": 0.37, + "grad_norm": 1.3506840880543478, + "learning_rate": 7.323231938741568e-06, + "loss": 0.3294, + "step": 12612 + }, + { + "epoch": 0.37, + "grad_norm": 1.4438846492252604, + "learning_rate": 7.322816004093956e-06, + "loss": 0.3644, + "step": 12613 + }, + { + "epoch": 0.37, + "grad_norm": 1.349226850906406, + "learning_rate": 7.3224000489474645e-06, + "loss": 0.3609, + "step": 12614 + }, + { + "epoch": 0.37, + "grad_norm": 1.357493925682295, + "learning_rate": 7.32198407330577e-06, + "loss": 0.3725, + "step": 12615 + }, + { + "epoch": 0.37, + "grad_norm": 2.5121367511917603, + "learning_rate": 7.3215680771725385e-06, + "loss": 0.3205, + "step": 12616 + }, + { + "epoch": 0.37, + "grad_norm": 1.212446417190049, + "learning_rate": 7.321152060551443e-06, + "loss": 0.334, + "step": 12617 + }, + { + "epoch": 0.37, + "grad_norm": 1.2858746886273311, + "learning_rate": 7.320736023446156e-06, + "loss": 0.4057, + "step": 12618 + }, + { + "epoch": 0.37, + "grad_norm": 1.3047464549576142, + "learning_rate": 7.320319965860348e-06, + "loss": 0.3384, + "step": 12619 + }, + { + "epoch": 0.37, + "grad_norm": 1.2943469781911108, + "learning_rate": 7.3199038877976884e-06, + "loss": 0.3284, + "step": 12620 + }, + { + "epoch": 0.37, + "grad_norm": 1.260242885535475, + "learning_rate": 7.319487789261853e-06, + "loss": 0.3362, + "step": 12621 + }, + { + "epoch": 0.37, + "grad_norm": 1.240443114230378, + "learning_rate": 7.319071670256511e-06, + "loss": 0.2996, + "step": 12622 + }, + { + "epoch": 0.37, + "grad_norm": 1.2206354816036804, + "learning_rate": 7.318655530785337e-06, + "loss": 0.3543, + "step": 12623 + }, + { + "epoch": 0.37, + "grad_norm": 0.9500280405392681, + "learning_rate": 7.318239370852002e-06, + "loss": 0.6187, + "step": 12624 + }, + { + "epoch": 0.37, + "grad_norm": 1.2451787340927263, + "learning_rate": 7.317823190460178e-06, + "loss": 0.3115, + "step": 12625 + }, + { + "epoch": 0.37, + "grad_norm": 1.3208083575828793, + "learning_rate": 7.3174069896135384e-06, + "loss": 0.3509, + "step": 12626 + }, + { + "epoch": 0.37, + "grad_norm": 1.3068662869428254, + "learning_rate": 7.316990768315757e-06, + "loss": 0.3483, + "step": 12627 + }, + { + "epoch": 0.37, + "grad_norm": 1.2836445190631411, + "learning_rate": 7.316574526570507e-06, + "loss": 0.3161, + "step": 12628 + }, + { + "epoch": 0.37, + "grad_norm": 1.41180727241563, + "learning_rate": 7.31615826438146e-06, + "loss": 0.3463, + "step": 12629 + }, + { + "epoch": 0.37, + "grad_norm": 1.3971724692399412, + "learning_rate": 7.315741981752291e-06, + "loss": 0.3139, + "step": 12630 + }, + { + "epoch": 0.37, + "grad_norm": 1.316392891688238, + "learning_rate": 7.315325678686673e-06, + "loss": 0.3251, + "step": 12631 + }, + { + "epoch": 0.37, + "grad_norm": 2.346787731950735, + "learning_rate": 7.3149093551882795e-06, + "loss": 0.3098, + "step": 12632 + }, + { + "epoch": 0.37, + "grad_norm": 1.5721741256954402, + "learning_rate": 7.314493011260788e-06, + "loss": 0.3279, + "step": 12633 + }, + { + "epoch": 0.37, + "grad_norm": 1.407953166684785, + "learning_rate": 7.314076646907869e-06, + "loss": 0.329, + "step": 12634 + }, + { + "epoch": 0.37, + "grad_norm": 1.3985821159174454, + "learning_rate": 7.313660262133198e-06, + "loss": 0.3405, + "step": 12635 + }, + { + "epoch": 0.37, + "grad_norm": 1.2551602985517476, + "learning_rate": 7.313243856940448e-06, + "loss": 0.3359, + "step": 12636 + }, + { + "epoch": 0.37, + "grad_norm": 1.6605703189819072, + "learning_rate": 7.312827431333296e-06, + "loss": 0.3373, + "step": 12637 + }, + { + "epoch": 0.37, + "grad_norm": 1.2220341541952473, + "learning_rate": 7.3124109853154164e-06, + "loss": 0.3573, + "step": 12638 + }, + { + "epoch": 0.37, + "grad_norm": 1.297376655085023, + "learning_rate": 7.311994518890484e-06, + "loss": 0.3282, + "step": 12639 + }, + { + "epoch": 0.37, + "grad_norm": 1.3219920042851736, + "learning_rate": 7.311578032062174e-06, + "loss": 0.3411, + "step": 12640 + }, + { + "epoch": 0.37, + "grad_norm": 1.1794651900414612, + "learning_rate": 7.3111615248341625e-06, + "loss": 0.3389, + "step": 12641 + }, + { + "epoch": 0.37, + "grad_norm": 1.6049462621459278, + "learning_rate": 7.310744997210127e-06, + "loss": 0.3578, + "step": 12642 + }, + { + "epoch": 0.37, + "grad_norm": 1.8009507140115963, + "learning_rate": 7.310328449193739e-06, + "loss": 0.3573, + "step": 12643 + }, + { + "epoch": 0.37, + "grad_norm": 1.8014498484383494, + "learning_rate": 7.3099118807886775e-06, + "loss": 0.3347, + "step": 12644 + }, + { + "epoch": 0.37, + "grad_norm": 1.335187533026408, + "learning_rate": 7.309495291998616e-06, + "loss": 0.3468, + "step": 12645 + }, + { + "epoch": 0.37, + "grad_norm": 1.5640771606567743, + "learning_rate": 7.3090786828272365e-06, + "loss": 0.3601, + "step": 12646 + }, + { + "epoch": 0.37, + "grad_norm": 1.2974618189861837, + "learning_rate": 7.308662053278209e-06, + "loss": 0.3519, + "step": 12647 + }, + { + "epoch": 0.37, + "grad_norm": 1.330640051163945, + "learning_rate": 7.308245403355215e-06, + "loss": 0.3403, + "step": 12648 + }, + { + "epoch": 0.37, + "grad_norm": 1.4770736008071426, + "learning_rate": 7.3078287330619305e-06, + "loss": 0.3344, + "step": 12649 + }, + { + "epoch": 0.37, + "grad_norm": 1.196820648348701, + "learning_rate": 7.307412042402029e-06, + "loss": 0.3192, + "step": 12650 + }, + { + "epoch": 0.37, + "grad_norm": 1.3417649303274826, + "learning_rate": 7.306995331379193e-06, + "loss": 0.315, + "step": 12651 + }, + { + "epoch": 0.37, + "grad_norm": 1.8242969547386334, + "learning_rate": 7.306578599997097e-06, + "loss": 0.352, + "step": 12652 + }, + { + "epoch": 0.37, + "grad_norm": 1.2963871740825963, + "learning_rate": 7.306161848259418e-06, + "loss": 0.3271, + "step": 12653 + }, + { + "epoch": 0.37, + "grad_norm": 1.558792398471841, + "learning_rate": 7.305745076169837e-06, + "loss": 0.3233, + "step": 12654 + }, + { + "epoch": 0.37, + "grad_norm": 1.3650782568182502, + "learning_rate": 7.305328283732028e-06, + "loss": 0.3759, + "step": 12655 + }, + { + "epoch": 0.37, + "grad_norm": 1.5554629445450598, + "learning_rate": 7.3049114709496735e-06, + "loss": 0.3288, + "step": 12656 + }, + { + "epoch": 0.37, + "grad_norm": 1.6998729934351364, + "learning_rate": 7.304494637826449e-06, + "loss": 0.3276, + "step": 12657 + }, + { + "epoch": 0.37, + "grad_norm": 1.292679487364217, + "learning_rate": 7.3040777843660325e-06, + "loss": 0.331, + "step": 12658 + }, + { + "epoch": 0.37, + "grad_norm": 1.2257104816272102, + "learning_rate": 7.3036609105721055e-06, + "loss": 0.3201, + "step": 12659 + }, + { + "epoch": 0.37, + "grad_norm": 2.0596102275965977, + "learning_rate": 7.303244016448345e-06, + "loss": 0.3453, + "step": 12660 + }, + { + "epoch": 0.37, + "grad_norm": 1.3900107506038828, + "learning_rate": 7.302827101998431e-06, + "loss": 0.3584, + "step": 12661 + }, + { + "epoch": 0.37, + "grad_norm": 1.3131656480468505, + "learning_rate": 7.302410167226042e-06, + "loss": 0.3049, + "step": 12662 + }, + { + "epoch": 0.37, + "grad_norm": 1.4345830422065042, + "learning_rate": 7.301993212134857e-06, + "loss": 0.3517, + "step": 12663 + }, + { + "epoch": 0.37, + "grad_norm": 1.456551190090042, + "learning_rate": 7.301576236728557e-06, + "loss": 0.3424, + "step": 12664 + }, + { + "epoch": 0.37, + "grad_norm": 1.5499198376551793, + "learning_rate": 7.301159241010821e-06, + "loss": 0.3747, + "step": 12665 + }, + { + "epoch": 0.37, + "grad_norm": 2.2359586148166732, + "learning_rate": 7.300742224985329e-06, + "loss": 0.3522, + "step": 12666 + }, + { + "epoch": 0.37, + "grad_norm": 1.33059444625862, + "learning_rate": 7.300325188655762e-06, + "loss": 0.3115, + "step": 12667 + }, + { + "epoch": 0.37, + "grad_norm": 1.4136745716758723, + "learning_rate": 7.299908132025798e-06, + "loss": 0.324, + "step": 12668 + }, + { + "epoch": 0.37, + "grad_norm": 1.3670503298131638, + "learning_rate": 7.299491055099122e-06, + "loss": 0.3348, + "step": 12669 + }, + { + "epoch": 0.37, + "grad_norm": 1.517163115513161, + "learning_rate": 7.29907395787941e-06, + "loss": 0.333, + "step": 12670 + }, + { + "epoch": 0.37, + "grad_norm": 1.2495932151640718, + "learning_rate": 7.298656840370344e-06, + "loss": 0.3209, + "step": 12671 + }, + { + "epoch": 0.37, + "grad_norm": 1.3397506453600019, + "learning_rate": 7.298239702575607e-06, + "loss": 0.3545, + "step": 12672 + }, + { + "epoch": 0.37, + "grad_norm": 0.9242620971802825, + "learning_rate": 7.297822544498879e-06, + "loss": 0.5838, + "step": 12673 + }, + { + "epoch": 0.37, + "grad_norm": 1.3948136814186478, + "learning_rate": 7.297405366143842e-06, + "loss": 0.3645, + "step": 12674 + }, + { + "epoch": 0.37, + "grad_norm": 1.3055028963803836, + "learning_rate": 7.2969881675141775e-06, + "loss": 0.3556, + "step": 12675 + }, + { + "epoch": 0.37, + "grad_norm": 1.2968865134748153, + "learning_rate": 7.296570948613566e-06, + "loss": 0.3356, + "step": 12676 + }, + { + "epoch": 0.37, + "grad_norm": 1.259659192959129, + "learning_rate": 7.296153709445691e-06, + "loss": 0.3526, + "step": 12677 + }, + { + "epoch": 0.37, + "grad_norm": 1.3401792157451595, + "learning_rate": 7.295736450014234e-06, + "loss": 0.3563, + "step": 12678 + }, + { + "epoch": 0.37, + "grad_norm": 1.2989669812507347, + "learning_rate": 7.295319170322877e-06, + "loss": 0.3434, + "step": 12679 + }, + { + "epoch": 0.37, + "grad_norm": 1.2966923076849364, + "learning_rate": 7.294901870375304e-06, + "loss": 0.3277, + "step": 12680 + }, + { + "epoch": 0.37, + "grad_norm": 1.552212654439486, + "learning_rate": 7.294484550175195e-06, + "loss": 0.3806, + "step": 12681 + }, + { + "epoch": 0.37, + "grad_norm": 1.4299272802359577, + "learning_rate": 7.294067209726234e-06, + "loss": 0.3161, + "step": 12682 + }, + { + "epoch": 0.37, + "grad_norm": 1.4211456247488246, + "learning_rate": 7.293649849032106e-06, + "loss": 0.3257, + "step": 12683 + }, + { + "epoch": 0.37, + "grad_norm": 1.703893423047942, + "learning_rate": 7.293232468096492e-06, + "loss": 0.3269, + "step": 12684 + }, + { + "epoch": 0.37, + "grad_norm": 1.422082806278552, + "learning_rate": 7.2928150669230754e-06, + "loss": 0.3195, + "step": 12685 + }, + { + "epoch": 0.37, + "grad_norm": 1.535931065531857, + "learning_rate": 7.292397645515541e-06, + "loss": 0.3665, + "step": 12686 + }, + { + "epoch": 0.37, + "grad_norm": 1.3760364538365277, + "learning_rate": 7.291980203877574e-06, + "loss": 0.355, + "step": 12687 + }, + { + "epoch": 0.37, + "grad_norm": 1.4358024114920243, + "learning_rate": 7.291562742012852e-06, + "loss": 0.34, + "step": 12688 + }, + { + "epoch": 0.37, + "grad_norm": 1.3883884881608786, + "learning_rate": 7.291145259925065e-06, + "loss": 0.3501, + "step": 12689 + }, + { + "epoch": 0.37, + "grad_norm": 2.8455013041501567, + "learning_rate": 7.290727757617897e-06, + "loss": 0.3322, + "step": 12690 + }, + { + "epoch": 0.37, + "grad_norm": 1.5086409499329865, + "learning_rate": 7.290310235095031e-06, + "loss": 0.3721, + "step": 12691 + }, + { + "epoch": 0.37, + "grad_norm": 1.2784218562292322, + "learning_rate": 7.289892692360151e-06, + "loss": 0.3244, + "step": 12692 + }, + { + "epoch": 0.37, + "grad_norm": 1.2108704760528102, + "learning_rate": 7.289475129416943e-06, + "loss": 0.3231, + "step": 12693 + }, + { + "epoch": 0.37, + "grad_norm": 1.235804985581986, + "learning_rate": 7.2890575462690915e-06, + "loss": 0.3194, + "step": 12694 + }, + { + "epoch": 0.37, + "grad_norm": 1.3088011288174342, + "learning_rate": 7.288639942920281e-06, + "loss": 0.3171, + "step": 12695 + }, + { + "epoch": 0.37, + "grad_norm": 1.3111035067894103, + "learning_rate": 7.288222319374199e-06, + "loss": 0.3328, + "step": 12696 + }, + { + "epoch": 0.37, + "grad_norm": 1.215882760167099, + "learning_rate": 7.2878046756345285e-06, + "loss": 0.3406, + "step": 12697 + }, + { + "epoch": 0.37, + "grad_norm": 1.355642073148198, + "learning_rate": 7.287387011704957e-06, + "loss": 0.3165, + "step": 12698 + }, + { + "epoch": 0.37, + "grad_norm": 1.4100969884141354, + "learning_rate": 7.286969327589168e-06, + "loss": 0.3601, + "step": 12699 + }, + { + "epoch": 0.37, + "grad_norm": 1.4282722717354523, + "learning_rate": 7.286551623290852e-06, + "loss": 0.3519, + "step": 12700 + }, + { + "epoch": 0.37, + "grad_norm": 2.2868673256680707, + "learning_rate": 7.286133898813692e-06, + "loss": 0.3631, + "step": 12701 + }, + { + "epoch": 0.37, + "grad_norm": 1.8910498381138099, + "learning_rate": 7.285716154161373e-06, + "loss": 0.3433, + "step": 12702 + }, + { + "epoch": 0.37, + "grad_norm": 7.603946732071545, + "learning_rate": 7.285298389337585e-06, + "loss": 0.3441, + "step": 12703 + }, + { + "epoch": 0.37, + "grad_norm": 1.8573766042397266, + "learning_rate": 7.284880604346013e-06, + "loss": 0.3223, + "step": 12704 + }, + { + "epoch": 0.37, + "grad_norm": 1.6529701319875123, + "learning_rate": 7.284462799190346e-06, + "loss": 0.3363, + "step": 12705 + }, + { + "epoch": 0.37, + "grad_norm": 1.2765767521403568, + "learning_rate": 7.284044973874268e-06, + "loss": 0.3173, + "step": 12706 + }, + { + "epoch": 0.37, + "grad_norm": 1.6076674122061474, + "learning_rate": 7.283627128401468e-06, + "loss": 0.3786, + "step": 12707 + }, + { + "epoch": 0.37, + "grad_norm": 0.9448651431384378, + "learning_rate": 7.283209262775632e-06, + "loss": 0.5884, + "step": 12708 + }, + { + "epoch": 0.37, + "grad_norm": 1.228892729713827, + "learning_rate": 7.282791377000451e-06, + "loss": 0.3329, + "step": 12709 + }, + { + "epoch": 0.37, + "grad_norm": 1.3356385579216044, + "learning_rate": 7.282373471079609e-06, + "loss": 0.3583, + "step": 12710 + }, + { + "epoch": 0.37, + "grad_norm": 1.9083026784772583, + "learning_rate": 7.2819555450167966e-06, + "loss": 0.3512, + "step": 12711 + }, + { + "epoch": 0.37, + "grad_norm": 1.3419980339207858, + "learning_rate": 7.281537598815701e-06, + "loss": 0.3389, + "step": 12712 + }, + { + "epoch": 0.37, + "grad_norm": 1.296763419940254, + "learning_rate": 7.2811196324800116e-06, + "loss": 0.3551, + "step": 12713 + }, + { + "epoch": 0.37, + "grad_norm": 1.4164101031618324, + "learning_rate": 7.280701646013415e-06, + "loss": 0.3326, + "step": 12714 + }, + { + "epoch": 0.37, + "grad_norm": 1.2966949072806104, + "learning_rate": 7.2802836394196015e-06, + "loss": 0.3258, + "step": 12715 + }, + { + "epoch": 0.37, + "grad_norm": 1.394391408225954, + "learning_rate": 7.2798656127022595e-06, + "loss": 0.3359, + "step": 12716 + }, + { + "epoch": 0.37, + "grad_norm": 1.2228574817500302, + "learning_rate": 7.279447565865079e-06, + "loss": 0.3419, + "step": 12717 + }, + { + "epoch": 0.37, + "grad_norm": 1.348034860979925, + "learning_rate": 7.279029498911747e-06, + "loss": 0.3239, + "step": 12718 + }, + { + "epoch": 0.37, + "grad_norm": 1.3746882491046377, + "learning_rate": 7.2786114118459564e-06, + "loss": 0.3602, + "step": 12719 + }, + { + "epoch": 0.37, + "grad_norm": 1.9021165553928516, + "learning_rate": 7.278193304671393e-06, + "loss": 0.3356, + "step": 12720 + }, + { + "epoch": 0.37, + "grad_norm": 1.4050643911909244, + "learning_rate": 7.277775177391749e-06, + "loss": 0.343, + "step": 12721 + }, + { + "epoch": 0.37, + "grad_norm": 1.281611094244089, + "learning_rate": 7.277357030010714e-06, + "loss": 0.3319, + "step": 12722 + }, + { + "epoch": 0.37, + "grad_norm": 1.2377919574659184, + "learning_rate": 7.276938862531977e-06, + "loss": 0.3296, + "step": 12723 + }, + { + "epoch": 0.37, + "grad_norm": 1.3586675924824505, + "learning_rate": 7.276520674959231e-06, + "loss": 0.3373, + "step": 12724 + }, + { + "epoch": 0.37, + "grad_norm": 1.3865971090107607, + "learning_rate": 7.276102467296164e-06, + "loss": 0.3396, + "step": 12725 + }, + { + "epoch": 0.37, + "grad_norm": 1.8066816928402372, + "learning_rate": 7.2756842395464664e-06, + "loss": 0.3387, + "step": 12726 + }, + { + "epoch": 0.37, + "grad_norm": 1.6617365117915612, + "learning_rate": 7.2752659917138315e-06, + "loss": 0.3539, + "step": 12727 + }, + { + "epoch": 0.37, + "grad_norm": 1.4457229336734312, + "learning_rate": 7.274847723801948e-06, + "loss": 0.3473, + "step": 12728 + }, + { + "epoch": 0.37, + "grad_norm": 1.3089585698122208, + "learning_rate": 7.27442943581451e-06, + "loss": 0.3319, + "step": 12729 + }, + { + "epoch": 0.37, + "grad_norm": 1.2217273842903738, + "learning_rate": 7.274011127755204e-06, + "loss": 0.3148, + "step": 12730 + }, + { + "epoch": 0.37, + "grad_norm": 1.191817988214152, + "learning_rate": 7.273592799627727e-06, + "loss": 0.3314, + "step": 12731 + }, + { + "epoch": 0.37, + "grad_norm": 1.3283275675135469, + "learning_rate": 7.273174451435766e-06, + "loss": 0.3506, + "step": 12732 + }, + { + "epoch": 0.37, + "grad_norm": 1.7495029920304412, + "learning_rate": 7.272756083183018e-06, + "loss": 0.3447, + "step": 12733 + }, + { + "epoch": 0.37, + "grad_norm": 1.559375565821317, + "learning_rate": 7.272337694873172e-06, + "loss": 0.3715, + "step": 12734 + }, + { + "epoch": 0.37, + "grad_norm": 1.1891463357123195, + "learning_rate": 7.2719192865099186e-06, + "loss": 0.3191, + "step": 12735 + }, + { + "epoch": 0.37, + "grad_norm": 1.2511373397245136, + "learning_rate": 7.271500858096954e-06, + "loss": 0.3625, + "step": 12736 + }, + { + "epoch": 0.37, + "grad_norm": 1.889754090790052, + "learning_rate": 7.271082409637968e-06, + "loss": 0.3302, + "step": 12737 + }, + { + "epoch": 0.37, + "grad_norm": 1.3979296766389921, + "learning_rate": 7.270663941136655e-06, + "loss": 0.3393, + "step": 12738 + }, + { + "epoch": 0.37, + "grad_norm": 1.7688869246707537, + "learning_rate": 7.270245452596708e-06, + "loss": 0.341, + "step": 12739 + }, + { + "epoch": 0.37, + "grad_norm": 1.854633254343247, + "learning_rate": 7.2698269440218185e-06, + "loss": 0.3491, + "step": 12740 + }, + { + "epoch": 0.37, + "grad_norm": 1.653544434576907, + "learning_rate": 7.269408415415682e-06, + "loss": 0.3659, + "step": 12741 + }, + { + "epoch": 0.37, + "grad_norm": 1.5303255854624618, + "learning_rate": 7.2689898667819915e-06, + "loss": 0.3326, + "step": 12742 + }, + { + "epoch": 0.37, + "grad_norm": 1.3836790800495797, + "learning_rate": 7.268571298124439e-06, + "loss": 0.3227, + "step": 12743 + }, + { + "epoch": 0.37, + "grad_norm": 1.3668484401793897, + "learning_rate": 7.2681527094467205e-06, + "loss": 0.3145, + "step": 12744 + }, + { + "epoch": 0.37, + "grad_norm": 1.2929226275793844, + "learning_rate": 7.267734100752528e-06, + "loss": 0.3091, + "step": 12745 + }, + { + "epoch": 0.37, + "grad_norm": 1.3423206964586014, + "learning_rate": 7.267315472045559e-06, + "loss": 0.3219, + "step": 12746 + }, + { + "epoch": 0.37, + "grad_norm": 1.5753039647904084, + "learning_rate": 7.266896823329504e-06, + "loss": 0.3238, + "step": 12747 + }, + { + "epoch": 0.37, + "grad_norm": 1.3054925531658688, + "learning_rate": 7.266478154608061e-06, + "loss": 0.3363, + "step": 12748 + }, + { + "epoch": 0.37, + "grad_norm": 1.2557475964171212, + "learning_rate": 7.266059465884923e-06, + "loss": 0.3335, + "step": 12749 + }, + { + "epoch": 0.37, + "grad_norm": 1.4489887596920792, + "learning_rate": 7.265640757163783e-06, + "loss": 0.3201, + "step": 12750 + }, + { + "epoch": 0.37, + "grad_norm": 1.327327972464399, + "learning_rate": 7.26522202844834e-06, + "loss": 0.3068, + "step": 12751 + }, + { + "epoch": 0.37, + "grad_norm": 1.4846672786500839, + "learning_rate": 7.264803279742287e-06, + "loss": 0.3387, + "step": 12752 + }, + { + "epoch": 0.37, + "grad_norm": 1.3281142086142455, + "learning_rate": 7.264384511049321e-06, + "loss": 0.3171, + "step": 12753 + }, + { + "epoch": 0.37, + "grad_norm": 1.576173356657519, + "learning_rate": 7.263965722373134e-06, + "loss": 0.3485, + "step": 12754 + }, + { + "epoch": 0.37, + "grad_norm": 1.3098579994683937, + "learning_rate": 7.263546913717425e-06, + "loss": 0.3249, + "step": 12755 + }, + { + "epoch": 0.37, + "grad_norm": 0.9398392154820063, + "learning_rate": 7.263128085085891e-06, + "loss": 0.5868, + "step": 12756 + }, + { + "epoch": 0.37, + "grad_norm": 1.4388222737786345, + "learning_rate": 7.262709236482226e-06, + "loss": 0.3748, + "step": 12757 + }, + { + "epoch": 0.37, + "grad_norm": 1.2596915145958327, + "learning_rate": 7.262290367910126e-06, + "loss": 0.3284, + "step": 12758 + }, + { + "epoch": 0.37, + "grad_norm": 1.0258267680713777, + "learning_rate": 7.261871479373289e-06, + "loss": 0.6588, + "step": 12759 + }, + { + "epoch": 0.37, + "grad_norm": 1.3015499479669168, + "learning_rate": 7.2614525708754115e-06, + "loss": 0.3467, + "step": 12760 + }, + { + "epoch": 0.37, + "grad_norm": 1.3057669131855647, + "learning_rate": 7.261033642420191e-06, + "loss": 0.3375, + "step": 12761 + }, + { + "epoch": 0.37, + "grad_norm": 1.284455824124472, + "learning_rate": 7.260614694011321e-06, + "loss": 0.3327, + "step": 12762 + }, + { + "epoch": 0.37, + "grad_norm": 1.4568018757550008, + "learning_rate": 7.2601957256525025e-06, + "loss": 0.3595, + "step": 12763 + }, + { + "epoch": 0.37, + "grad_norm": 1.3043689492590653, + "learning_rate": 7.259776737347432e-06, + "loss": 0.3189, + "step": 12764 + }, + { + "epoch": 0.37, + "grad_norm": 1.1836915174843339, + "learning_rate": 7.259357729099805e-06, + "loss": 0.3217, + "step": 12765 + }, + { + "epoch": 0.37, + "grad_norm": 1.5302405121568425, + "learning_rate": 7.258938700913323e-06, + "loss": 0.3264, + "step": 12766 + }, + { + "epoch": 0.37, + "grad_norm": 1.601165594361816, + "learning_rate": 7.258519652791682e-06, + "loss": 0.325, + "step": 12767 + }, + { + "epoch": 0.37, + "grad_norm": 1.430804446797614, + "learning_rate": 7.258100584738578e-06, + "loss": 0.3365, + "step": 12768 + }, + { + "epoch": 0.37, + "grad_norm": 1.775930854246525, + "learning_rate": 7.257681496757712e-06, + "loss": 0.3244, + "step": 12769 + }, + { + "epoch": 0.37, + "grad_norm": 1.4730221358632447, + "learning_rate": 7.257262388852784e-06, + "loss": 0.3292, + "step": 12770 + }, + { + "epoch": 0.37, + "grad_norm": 1.7214276857777078, + "learning_rate": 7.256843261027488e-06, + "loss": 0.345, + "step": 12771 + }, + { + "epoch": 0.37, + "grad_norm": 1.8774136198363622, + "learning_rate": 7.256424113285525e-06, + "loss": 0.3396, + "step": 12772 + }, + { + "epoch": 0.37, + "grad_norm": 1.307060871451976, + "learning_rate": 7.256004945630595e-06, + "loss": 0.3333, + "step": 12773 + }, + { + "epoch": 0.37, + "grad_norm": 1.2620838657755633, + "learning_rate": 7.255585758066397e-06, + "loss": 0.3338, + "step": 12774 + }, + { + "epoch": 0.37, + "grad_norm": 1.2253321151280054, + "learning_rate": 7.255166550596628e-06, + "loss": 0.3369, + "step": 12775 + }, + { + "epoch": 0.37, + "grad_norm": 3.90150357571281, + "learning_rate": 7.254747323224991e-06, + "loss": 0.3494, + "step": 12776 + }, + { + "epoch": 0.37, + "grad_norm": 1.4377483499078563, + "learning_rate": 7.2543280759551826e-06, + "loss": 0.3133, + "step": 12777 + }, + { + "epoch": 0.37, + "grad_norm": 1.2738597848771642, + "learning_rate": 7.253908808790903e-06, + "loss": 0.3292, + "step": 12778 + }, + { + "epoch": 0.37, + "grad_norm": 1.344150769463234, + "learning_rate": 7.253489521735856e-06, + "loss": 0.3492, + "step": 12779 + }, + { + "epoch": 0.37, + "grad_norm": 1.8668229875116509, + "learning_rate": 7.2530702147937374e-06, + "loss": 0.3315, + "step": 12780 + }, + { + "epoch": 0.37, + "grad_norm": 1.2714015177937572, + "learning_rate": 7.252650887968249e-06, + "loss": 0.3369, + "step": 12781 + }, + { + "epoch": 0.37, + "grad_norm": 1.2502115201020074, + "learning_rate": 7.252231541263091e-06, + "loss": 0.3442, + "step": 12782 + }, + { + "epoch": 0.37, + "grad_norm": 1.360282461024957, + "learning_rate": 7.251812174681965e-06, + "loss": 0.3491, + "step": 12783 + }, + { + "epoch": 0.37, + "grad_norm": 1.6552036203262106, + "learning_rate": 7.251392788228573e-06, + "loss": 0.3566, + "step": 12784 + }, + { + "epoch": 0.37, + "grad_norm": 1.241952368712871, + "learning_rate": 7.250973381906613e-06, + "loss": 0.3341, + "step": 12785 + }, + { + "epoch": 0.37, + "grad_norm": 1.267248080232298, + "learning_rate": 7.250553955719788e-06, + "loss": 0.3267, + "step": 12786 + }, + { + "epoch": 0.37, + "grad_norm": 1.3066691352582744, + "learning_rate": 7.2501345096718004e-06, + "loss": 0.3286, + "step": 12787 + }, + { + "epoch": 0.37, + "grad_norm": 1.6653278219459673, + "learning_rate": 7.2497150437663495e-06, + "loss": 0.3226, + "step": 12788 + }, + { + "epoch": 0.37, + "grad_norm": 1.4124905634525764, + "learning_rate": 7.249295558007141e-06, + "loss": 0.339, + "step": 12789 + }, + { + "epoch": 0.37, + "grad_norm": 1.4944243760998581, + "learning_rate": 7.248876052397872e-06, + "loss": 0.3258, + "step": 12790 + }, + { + "epoch": 0.37, + "grad_norm": 1.412335658070823, + "learning_rate": 7.248456526942247e-06, + "loss": 0.3046, + "step": 12791 + }, + { + "epoch": 0.37, + "grad_norm": 1.6435136689302614, + "learning_rate": 7.248036981643969e-06, + "loss": 0.3084, + "step": 12792 + }, + { + "epoch": 0.37, + "grad_norm": 1.250672900916256, + "learning_rate": 7.2476174165067395e-06, + "loss": 0.3255, + "step": 12793 + }, + { + "epoch": 0.37, + "grad_norm": 1.3393274142192815, + "learning_rate": 7.24719783153426e-06, + "loss": 0.3383, + "step": 12794 + }, + { + "epoch": 0.37, + "grad_norm": 1.430217710711767, + "learning_rate": 7.246778226730236e-06, + "loss": 0.3497, + "step": 12795 + }, + { + "epoch": 0.37, + "grad_norm": 1.2994958840063673, + "learning_rate": 7.246358602098369e-06, + "loss": 0.3318, + "step": 12796 + }, + { + "epoch": 0.37, + "grad_norm": 1.2702336743264766, + "learning_rate": 7.245938957642362e-06, + "loss": 0.3618, + "step": 12797 + }, + { + "epoch": 0.37, + "grad_norm": 1.3537299196348627, + "learning_rate": 7.245519293365919e-06, + "loss": 0.3493, + "step": 12798 + }, + { + "epoch": 0.37, + "grad_norm": 1.2535458639807218, + "learning_rate": 7.245099609272745e-06, + "loss": 0.3093, + "step": 12799 + }, + { + "epoch": 0.37, + "grad_norm": 1.2723102469094976, + "learning_rate": 7.24467990536654e-06, + "loss": 0.3445, + "step": 12800 + }, + { + "epoch": 0.37, + "grad_norm": 1.4672322685301011, + "learning_rate": 7.24426018165101e-06, + "loss": 0.3146, + "step": 12801 + }, + { + "epoch": 0.37, + "grad_norm": 1.2977910242836785, + "learning_rate": 7.24384043812986e-06, + "loss": 0.3312, + "step": 12802 + }, + { + "epoch": 0.37, + "grad_norm": 1.703839861233268, + "learning_rate": 7.243420674806794e-06, + "loss": 0.3033, + "step": 12803 + }, + { + "epoch": 0.37, + "grad_norm": 1.3264883434208037, + "learning_rate": 7.243000891685513e-06, + "loss": 0.3273, + "step": 12804 + }, + { + "epoch": 0.37, + "grad_norm": 1.2628041087441884, + "learning_rate": 7.242581088769726e-06, + "loss": 0.3151, + "step": 12805 + }, + { + "epoch": 0.37, + "grad_norm": 1.236310386318788, + "learning_rate": 7.242161266063134e-06, + "loss": 0.3355, + "step": 12806 + }, + { + "epoch": 0.37, + "grad_norm": 1.3847451706106746, + "learning_rate": 7.241741423569447e-06, + "loss": 0.3307, + "step": 12807 + }, + { + "epoch": 0.37, + "grad_norm": 1.3941798935435326, + "learning_rate": 7.241321561292366e-06, + "loss": 0.346, + "step": 12808 + }, + { + "epoch": 0.37, + "grad_norm": 1.7517414730537846, + "learning_rate": 7.2409016792355965e-06, + "loss": 0.339, + "step": 12809 + }, + { + "epoch": 0.37, + "grad_norm": 1.2855836877718005, + "learning_rate": 7.240481777402846e-06, + "loss": 0.3278, + "step": 12810 + }, + { + "epoch": 0.37, + "grad_norm": 1.2071448232056394, + "learning_rate": 7.240061855797818e-06, + "loss": 0.3321, + "step": 12811 + }, + { + "epoch": 0.37, + "grad_norm": 1.4261183038705676, + "learning_rate": 7.239641914424221e-06, + "loss": 0.3408, + "step": 12812 + }, + { + "epoch": 0.37, + "grad_norm": 1.6072559625516674, + "learning_rate": 7.2392219532857554e-06, + "loss": 0.3519, + "step": 12813 + }, + { + "epoch": 0.37, + "grad_norm": 1.2498389526092732, + "learning_rate": 7.2388019723861335e-06, + "loss": 0.326, + "step": 12814 + }, + { + "epoch": 0.37, + "grad_norm": 1.2842208244526319, + "learning_rate": 7.238381971729061e-06, + "loss": 0.3261, + "step": 12815 + }, + { + "epoch": 0.37, + "grad_norm": 1.372872809606264, + "learning_rate": 7.237961951318241e-06, + "loss": 0.3453, + "step": 12816 + }, + { + "epoch": 0.37, + "grad_norm": 1.5207602184115077, + "learning_rate": 7.237541911157383e-06, + "loss": 0.3283, + "step": 12817 + }, + { + "epoch": 0.37, + "grad_norm": 1.3152421887367969, + "learning_rate": 7.237121851250192e-06, + "loss": 0.3449, + "step": 12818 + }, + { + "epoch": 0.37, + "grad_norm": 1.5838829586265168, + "learning_rate": 7.2367017716003755e-06, + "loss": 0.3337, + "step": 12819 + }, + { + "epoch": 0.37, + "grad_norm": 1.1974255536766074, + "learning_rate": 7.236281672211642e-06, + "loss": 0.3117, + "step": 12820 + }, + { + "epoch": 0.37, + "grad_norm": 1.3733995251802105, + "learning_rate": 7.235861553087698e-06, + "loss": 0.3544, + "step": 12821 + }, + { + "epoch": 0.37, + "grad_norm": 1.3622791396774205, + "learning_rate": 7.2354414142322495e-06, + "loss": 0.3305, + "step": 12822 + }, + { + "epoch": 0.37, + "grad_norm": 1.3441940295552228, + "learning_rate": 7.235021255649006e-06, + "loss": 0.3572, + "step": 12823 + }, + { + "epoch": 0.37, + "grad_norm": 1.6930660420590113, + "learning_rate": 7.234601077341675e-06, + "loss": 0.3616, + "step": 12824 + }, + { + "epoch": 0.37, + "grad_norm": 1.3893317282774362, + "learning_rate": 7.234180879313967e-06, + "loss": 0.3219, + "step": 12825 + }, + { + "epoch": 0.37, + "grad_norm": 1.409701388073576, + "learning_rate": 7.233760661569587e-06, + "loss": 0.3323, + "step": 12826 + }, + { + "epoch": 0.37, + "grad_norm": 1.345444402514415, + "learning_rate": 7.233340424112243e-06, + "loss": 0.3561, + "step": 12827 + }, + { + "epoch": 0.37, + "grad_norm": 1.446177644249387, + "learning_rate": 7.232920166945645e-06, + "loss": 0.3209, + "step": 12828 + }, + { + "epoch": 0.37, + "grad_norm": 1.2369046939290012, + "learning_rate": 7.232499890073503e-06, + "loss": 0.3257, + "step": 12829 + }, + { + "epoch": 0.37, + "grad_norm": 1.354195414987728, + "learning_rate": 7.232079593499524e-06, + "loss": 0.3166, + "step": 12830 + }, + { + "epoch": 0.37, + "grad_norm": 1.588461840886797, + "learning_rate": 7.2316592772274175e-06, + "loss": 0.3431, + "step": 12831 + }, + { + "epoch": 0.37, + "grad_norm": 1.1842149094486145, + "learning_rate": 7.231238941260894e-06, + "loss": 0.3266, + "step": 12832 + }, + { + "epoch": 0.37, + "grad_norm": 1.2177852526942279, + "learning_rate": 7.230818585603663e-06, + "loss": 0.3083, + "step": 12833 + }, + { + "epoch": 0.37, + "grad_norm": 1.167735166057084, + "learning_rate": 7.230398210259431e-06, + "loss": 0.314, + "step": 12834 + }, + { + "epoch": 0.37, + "grad_norm": 0.9644683595610278, + "learning_rate": 7.2299778152319124e-06, + "loss": 0.5626, + "step": 12835 + }, + { + "epoch": 0.37, + "grad_norm": 1.4343566993554488, + "learning_rate": 7.229557400524813e-06, + "loss": 0.3543, + "step": 12836 + }, + { + "epoch": 0.37, + "grad_norm": 1.4180311590468737, + "learning_rate": 7.2291369661418446e-06, + "loss": 0.344, + "step": 12837 + }, + { + "epoch": 0.37, + "grad_norm": 1.4518890343142332, + "learning_rate": 7.22871651208672e-06, + "loss": 0.3399, + "step": 12838 + }, + { + "epoch": 0.37, + "grad_norm": 1.3666467277724674, + "learning_rate": 7.228296038363145e-06, + "loss": 0.3342, + "step": 12839 + }, + { + "epoch": 0.37, + "grad_norm": 1.3166663427742646, + "learning_rate": 7.2278755449748325e-06, + "loss": 0.3127, + "step": 12840 + }, + { + "epoch": 0.37, + "grad_norm": 1.8482442371305796, + "learning_rate": 7.227455031925494e-06, + "loss": 0.331, + "step": 12841 + }, + { + "epoch": 0.37, + "grad_norm": 1.3707782410683742, + "learning_rate": 7.2270344992188414e-06, + "loss": 0.3181, + "step": 12842 + }, + { + "epoch": 0.37, + "grad_norm": 1.1889694838120473, + "learning_rate": 7.226613946858585e-06, + "loss": 0.3215, + "step": 12843 + }, + { + "epoch": 0.37, + "grad_norm": 1.7036320501299052, + "learning_rate": 7.226193374848435e-06, + "loss": 0.3594, + "step": 12844 + }, + { + "epoch": 0.37, + "grad_norm": 2.3114304060434123, + "learning_rate": 7.225772783192103e-06, + "loss": 0.3236, + "step": 12845 + }, + { + "epoch": 0.37, + "grad_norm": 1.3247038146531291, + "learning_rate": 7.225352171893303e-06, + "loss": 0.3255, + "step": 12846 + }, + { + "epoch": 0.37, + "grad_norm": 1.4298636020480116, + "learning_rate": 7.224931540955744e-06, + "loss": 0.3314, + "step": 12847 + }, + { + "epoch": 0.37, + "grad_norm": 1.2234283190996584, + "learning_rate": 7.2245108903831405e-06, + "loss": 0.3276, + "step": 12848 + }, + { + "epoch": 0.37, + "grad_norm": 1.3896594235721134, + "learning_rate": 7.224090220179204e-06, + "loss": 0.3398, + "step": 12849 + }, + { + "epoch": 0.37, + "grad_norm": 11.5395431314587, + "learning_rate": 7.223669530347645e-06, + "loss": 0.3633, + "step": 12850 + }, + { + "epoch": 0.37, + "grad_norm": 1.641089269381366, + "learning_rate": 7.223248820892179e-06, + "loss": 0.3311, + "step": 12851 + }, + { + "epoch": 0.37, + "grad_norm": 1.1712509303111065, + "learning_rate": 7.222828091816517e-06, + "loss": 0.3237, + "step": 12852 + }, + { + "epoch": 0.37, + "grad_norm": 1.677244237367356, + "learning_rate": 7.222407343124372e-06, + "loss": 0.3502, + "step": 12853 + }, + { + "epoch": 0.37, + "grad_norm": 1.2489304675783752, + "learning_rate": 7.221986574819458e-06, + "loss": 0.3581, + "step": 12854 + }, + { + "epoch": 0.37, + "grad_norm": 1.2722412512978183, + "learning_rate": 7.221565786905489e-06, + "loss": 0.3348, + "step": 12855 + }, + { + "epoch": 0.37, + "grad_norm": 1.1619765304837915, + "learning_rate": 7.221144979386177e-06, + "loss": 0.3185, + "step": 12856 + }, + { + "epoch": 0.37, + "grad_norm": 1.337389059403582, + "learning_rate": 7.220724152265234e-06, + "loss": 0.3327, + "step": 12857 + }, + { + "epoch": 0.37, + "grad_norm": 1.1753725511647395, + "learning_rate": 7.220303305546377e-06, + "loss": 0.3444, + "step": 12858 + }, + { + "epoch": 0.37, + "grad_norm": 1.3637799520626688, + "learning_rate": 7.219882439233317e-06, + "loss": 0.3325, + "step": 12859 + }, + { + "epoch": 0.37, + "grad_norm": 1.3808422462777743, + "learning_rate": 7.219461553329773e-06, + "loss": 0.3475, + "step": 12860 + }, + { + "epoch": 0.37, + "grad_norm": 1.2686639656738061, + "learning_rate": 7.219040647839455e-06, + "loss": 0.339, + "step": 12861 + }, + { + "epoch": 0.37, + "grad_norm": 1.2929925144847123, + "learning_rate": 7.218619722766078e-06, + "loss": 0.3765, + "step": 12862 + }, + { + "epoch": 0.37, + "grad_norm": 1.4256925194575805, + "learning_rate": 7.218198778113358e-06, + "loss": 0.3262, + "step": 12863 + }, + { + "epoch": 0.37, + "grad_norm": 4.0537198656684, + "learning_rate": 7.217777813885008e-06, + "loss": 0.3242, + "step": 12864 + }, + { + "epoch": 0.37, + "grad_norm": 1.2573496262032273, + "learning_rate": 7.217356830084745e-06, + "loss": 0.3088, + "step": 12865 + }, + { + "epoch": 0.37, + "grad_norm": 1.621369223656037, + "learning_rate": 7.216935826716284e-06, + "loss": 0.3218, + "step": 12866 + }, + { + "epoch": 0.37, + "grad_norm": 1.240259317842214, + "learning_rate": 7.2165148037833385e-06, + "loss": 0.3405, + "step": 12867 + }, + { + "epoch": 0.37, + "grad_norm": 1.5121024381068018, + "learning_rate": 7.216093761289625e-06, + "loss": 0.3323, + "step": 12868 + }, + { + "epoch": 0.37, + "grad_norm": 1.5246019741387793, + "learning_rate": 7.21567269923886e-06, + "loss": 0.3446, + "step": 12869 + }, + { + "epoch": 0.37, + "grad_norm": 1.5476452273547792, + "learning_rate": 7.215251617634761e-06, + "loss": 0.3299, + "step": 12870 + }, + { + "epoch": 0.37, + "grad_norm": 1.271353492634846, + "learning_rate": 7.2148305164810394e-06, + "loss": 0.3156, + "step": 12871 + }, + { + "epoch": 0.37, + "grad_norm": 1.4443843633491094, + "learning_rate": 7.214409395781414e-06, + "loss": 0.3401, + "step": 12872 + }, + { + "epoch": 0.37, + "grad_norm": 1.2409760430326473, + "learning_rate": 7.213988255539602e-06, + "loss": 0.3212, + "step": 12873 + }, + { + "epoch": 0.37, + "grad_norm": 1.7208684236085634, + "learning_rate": 7.21356709575932e-06, + "loss": 0.365, + "step": 12874 + }, + { + "epoch": 0.37, + "grad_norm": 1.410817696780861, + "learning_rate": 7.213145916444282e-06, + "loss": 0.3369, + "step": 12875 + }, + { + "epoch": 0.37, + "grad_norm": 1.3314058853565136, + "learning_rate": 7.212724717598207e-06, + "loss": 0.3263, + "step": 12876 + }, + { + "epoch": 0.37, + "grad_norm": 1.3366987653610016, + "learning_rate": 7.212303499224812e-06, + "loss": 0.3821, + "step": 12877 + }, + { + "epoch": 0.37, + "grad_norm": 1.250801090184991, + "learning_rate": 7.211882261327814e-06, + "loss": 0.3454, + "step": 12878 + }, + { + "epoch": 0.37, + "grad_norm": 1.257907017421407, + "learning_rate": 7.211461003910931e-06, + "loss": 0.3188, + "step": 12879 + }, + { + "epoch": 0.37, + "grad_norm": 1.3919332588487439, + "learning_rate": 7.21103972697788e-06, + "loss": 0.3296, + "step": 12880 + }, + { + "epoch": 0.37, + "grad_norm": 1.2488486389918285, + "learning_rate": 7.210618430532379e-06, + "loss": 0.298, + "step": 12881 + }, + { + "epoch": 0.37, + "grad_norm": 1.2831467469975988, + "learning_rate": 7.210197114578144e-06, + "loss": 0.3504, + "step": 12882 + }, + { + "epoch": 0.37, + "grad_norm": 1.4075722029602213, + "learning_rate": 7.2097757791188975e-06, + "loss": 0.335, + "step": 12883 + }, + { + "epoch": 0.37, + "grad_norm": 1.298118463985772, + "learning_rate": 7.209354424158354e-06, + "loss": 0.3201, + "step": 12884 + }, + { + "epoch": 0.37, + "grad_norm": 1.4491543183460123, + "learning_rate": 7.208933049700234e-06, + "loss": 0.3398, + "step": 12885 + }, + { + "epoch": 0.37, + "grad_norm": 1.3038530004012636, + "learning_rate": 7.208511655748254e-06, + "loss": 0.3351, + "step": 12886 + }, + { + "epoch": 0.37, + "grad_norm": 1.294960727637822, + "learning_rate": 7.208090242306135e-06, + "loss": 0.3044, + "step": 12887 + }, + { + "epoch": 0.37, + "grad_norm": 1.2990163461057684, + "learning_rate": 7.207668809377596e-06, + "loss": 0.3117, + "step": 12888 + }, + { + "epoch": 0.37, + "grad_norm": 1.3177702514803797, + "learning_rate": 7.207247356966355e-06, + "loss": 0.3053, + "step": 12889 + }, + { + "epoch": 0.37, + "grad_norm": 1.2994043857189097, + "learning_rate": 7.20682588507613e-06, + "loss": 0.3228, + "step": 12890 + }, + { + "epoch": 0.37, + "grad_norm": 1.2896890328110457, + "learning_rate": 7.206404393710643e-06, + "loss": 0.3275, + "step": 12891 + }, + { + "epoch": 0.37, + "grad_norm": 1.8084527938370387, + "learning_rate": 7.2059828828736125e-06, + "loss": 0.3359, + "step": 12892 + }, + { + "epoch": 0.37, + "grad_norm": 1.332332045328436, + "learning_rate": 7.205561352568759e-06, + "loss": 0.3239, + "step": 12893 + }, + { + "epoch": 0.37, + "grad_norm": 1.37462594739588, + "learning_rate": 7.205139802799802e-06, + "loss": 0.3234, + "step": 12894 + }, + { + "epoch": 0.37, + "grad_norm": 1.4276888526897527, + "learning_rate": 7.204718233570462e-06, + "loss": 0.3601, + "step": 12895 + }, + { + "epoch": 0.37, + "grad_norm": 1.2869518438503578, + "learning_rate": 7.204296644884457e-06, + "loss": 0.323, + "step": 12896 + }, + { + "epoch": 0.37, + "grad_norm": 1.5736819678711151, + "learning_rate": 7.203875036745511e-06, + "loss": 0.3241, + "step": 12897 + }, + { + "epoch": 0.37, + "grad_norm": 1.40632404338742, + "learning_rate": 7.203453409157346e-06, + "loss": 0.3423, + "step": 12898 + }, + { + "epoch": 0.37, + "grad_norm": 1.3222960805534956, + "learning_rate": 7.203031762123677e-06, + "loss": 0.3174, + "step": 12899 + }, + { + "epoch": 0.37, + "grad_norm": 1.258663693163491, + "learning_rate": 7.202610095648229e-06, + "loss": 0.3588, + "step": 12900 + }, + { + "epoch": 0.37, + "grad_norm": 1.4144137995666037, + "learning_rate": 7.202188409734723e-06, + "loss": 0.3351, + "step": 12901 + }, + { + "epoch": 0.37, + "grad_norm": 1.2418434985341618, + "learning_rate": 7.20176670438688e-06, + "loss": 0.3454, + "step": 12902 + }, + { + "epoch": 0.37, + "grad_norm": 1.7983674249922896, + "learning_rate": 7.201344979608423e-06, + "loss": 0.3341, + "step": 12903 + }, + { + "epoch": 0.37, + "grad_norm": 1.3088946556451024, + "learning_rate": 7.200923235403069e-06, + "loss": 0.3558, + "step": 12904 + }, + { + "epoch": 0.37, + "grad_norm": 1.2350841872029812, + "learning_rate": 7.200501471774544e-06, + "loss": 0.3162, + "step": 12905 + }, + { + "epoch": 0.37, + "grad_norm": 1.2706511716332465, + "learning_rate": 7.200079688726569e-06, + "loss": 0.3228, + "step": 12906 + }, + { + "epoch": 0.37, + "grad_norm": 1.9122046139217963, + "learning_rate": 7.199657886262867e-06, + "loss": 0.3303, + "step": 12907 + }, + { + "epoch": 0.37, + "grad_norm": 1.4148156997659405, + "learning_rate": 7.1992360643871584e-06, + "loss": 0.3461, + "step": 12908 + }, + { + "epoch": 0.37, + "grad_norm": 1.287960366443092, + "learning_rate": 7.198814223103168e-06, + "loss": 0.3334, + "step": 12909 + }, + { + "epoch": 0.37, + "grad_norm": 1.3537518169181635, + "learning_rate": 7.198392362414618e-06, + "loss": 0.3407, + "step": 12910 + }, + { + "epoch": 0.37, + "grad_norm": 1.5151060821126354, + "learning_rate": 7.197970482325232e-06, + "loss": 0.363, + "step": 12911 + }, + { + "epoch": 0.37, + "grad_norm": 1.1739949658675868, + "learning_rate": 7.197548582838729e-06, + "loss": 0.296, + "step": 12912 + }, + { + "epoch": 0.37, + "grad_norm": 1.3970067633672096, + "learning_rate": 7.197126663958838e-06, + "loss": 0.3219, + "step": 12913 + }, + { + "epoch": 0.37, + "grad_norm": 1.3474422514363238, + "learning_rate": 7.196704725689278e-06, + "loss": 0.3586, + "step": 12914 + }, + { + "epoch": 0.37, + "grad_norm": 1.5785890355825682, + "learning_rate": 7.196282768033776e-06, + "loss": 0.3414, + "step": 12915 + }, + { + "epoch": 0.37, + "grad_norm": 1.3529501894516145, + "learning_rate": 7.195860790996054e-06, + "loss": 0.34, + "step": 12916 + }, + { + "epoch": 0.37, + "grad_norm": 1.2543903309101303, + "learning_rate": 7.195438794579837e-06, + "loss": 0.3497, + "step": 12917 + }, + { + "epoch": 0.37, + "grad_norm": 1.2871332883567421, + "learning_rate": 7.195016778788847e-06, + "loss": 0.3398, + "step": 12918 + }, + { + "epoch": 0.37, + "grad_norm": 1.3101721521393181, + "learning_rate": 7.194594743626809e-06, + "loss": 0.3068, + "step": 12919 + }, + { + "epoch": 0.37, + "grad_norm": 1.2709753358554674, + "learning_rate": 7.1941726890974495e-06, + "loss": 0.3487, + "step": 12920 + }, + { + "epoch": 0.37, + "grad_norm": 1.3643284286869752, + "learning_rate": 7.193750615204491e-06, + "loss": 0.3431, + "step": 12921 + }, + { + "epoch": 0.37, + "grad_norm": 1.2962844358453336, + "learning_rate": 7.193328521951659e-06, + "loss": 0.3256, + "step": 12922 + }, + { + "epoch": 0.37, + "grad_norm": 1.2391571428126544, + "learning_rate": 7.192906409342678e-06, + "loss": 0.3484, + "step": 12923 + }, + { + "epoch": 0.37, + "grad_norm": 1.367901630848783, + "learning_rate": 7.192484277381274e-06, + "loss": 0.3128, + "step": 12924 + }, + { + "epoch": 0.37, + "grad_norm": 1.481838145680643, + "learning_rate": 7.192062126071173e-06, + "loss": 0.3274, + "step": 12925 + }, + { + "epoch": 0.37, + "grad_norm": 1.0361165225369557, + "learning_rate": 7.191639955416097e-06, + "loss": 0.6104, + "step": 12926 + }, + { + "epoch": 0.37, + "grad_norm": 1.3076958461156862, + "learning_rate": 7.191217765419776e-06, + "loss": 0.3454, + "step": 12927 + }, + { + "epoch": 0.37, + "grad_norm": 2.3329702776052534, + "learning_rate": 7.190795556085933e-06, + "loss": 0.34, + "step": 12928 + }, + { + "epoch": 0.38, + "grad_norm": 1.23626747788817, + "learning_rate": 7.190373327418296e-06, + "loss": 0.3387, + "step": 12929 + }, + { + "epoch": 0.38, + "grad_norm": 1.5394770421726451, + "learning_rate": 7.18995107942059e-06, + "loss": 0.3807, + "step": 12930 + }, + { + "epoch": 0.38, + "grad_norm": 1.3871680004381826, + "learning_rate": 7.18952881209654e-06, + "loss": 0.343, + "step": 12931 + }, + { + "epoch": 0.38, + "grad_norm": 1.6205417937273154, + "learning_rate": 7.189106525449875e-06, + "loss": 0.3338, + "step": 12932 + }, + { + "epoch": 0.38, + "grad_norm": 1.2389295916410865, + "learning_rate": 7.188684219484321e-06, + "loss": 0.3343, + "step": 12933 + }, + { + "epoch": 0.38, + "grad_norm": 1.65944162550754, + "learning_rate": 7.188261894203603e-06, + "loss": 0.3261, + "step": 12934 + }, + { + "epoch": 0.38, + "grad_norm": 1.9070375970618065, + "learning_rate": 7.1878395496114505e-06, + "loss": 0.3423, + "step": 12935 + }, + { + "epoch": 0.38, + "grad_norm": 1.1399402572439397, + "learning_rate": 7.187417185711589e-06, + "loss": 0.3061, + "step": 12936 + }, + { + "epoch": 0.38, + "grad_norm": 1.8214282800292994, + "learning_rate": 7.1869948025077475e-06, + "loss": 0.3274, + "step": 12937 + }, + { + "epoch": 0.38, + "grad_norm": 1.3149630718623724, + "learning_rate": 7.186572400003652e-06, + "loss": 0.3279, + "step": 12938 + }, + { + "epoch": 0.38, + "grad_norm": 1.3236604443717728, + "learning_rate": 7.186149978203032e-06, + "loss": 0.3366, + "step": 12939 + }, + { + "epoch": 0.38, + "grad_norm": 1.3531540102647996, + "learning_rate": 7.185727537109613e-06, + "loss": 0.3276, + "step": 12940 + }, + { + "epoch": 0.38, + "grad_norm": 1.3073671462889027, + "learning_rate": 7.185305076727125e-06, + "loss": 0.3261, + "step": 12941 + }, + { + "epoch": 0.38, + "grad_norm": 1.407565790866657, + "learning_rate": 7.184882597059294e-06, + "loss": 0.3515, + "step": 12942 + }, + { + "epoch": 0.38, + "grad_norm": 1.2743117830964796, + "learning_rate": 7.184460098109851e-06, + "loss": 0.3248, + "step": 12943 + }, + { + "epoch": 0.38, + "grad_norm": 1.0318042094593933, + "learning_rate": 7.184037579882525e-06, + "loss": 0.6227, + "step": 12944 + }, + { + "epoch": 0.38, + "grad_norm": 1.2974938725847502, + "learning_rate": 7.183615042381041e-06, + "loss": 0.3621, + "step": 12945 + }, + { + "epoch": 0.38, + "grad_norm": 1.2326559431778268, + "learning_rate": 7.183192485609129e-06, + "loss": 0.3195, + "step": 12946 + }, + { + "epoch": 0.38, + "grad_norm": 2.2969927748612604, + "learning_rate": 7.182769909570521e-06, + "loss": 0.3505, + "step": 12947 + }, + { + "epoch": 0.38, + "grad_norm": 3.459035454621226, + "learning_rate": 7.182347314268944e-06, + "loss": 0.3538, + "step": 12948 + }, + { + "epoch": 0.38, + "grad_norm": 1.3870046026096572, + "learning_rate": 7.181924699708127e-06, + "loss": 0.321, + "step": 12949 + }, + { + "epoch": 0.38, + "grad_norm": 1.4367726956910003, + "learning_rate": 7.181502065891801e-06, + "loss": 0.3342, + "step": 12950 + }, + { + "epoch": 0.38, + "grad_norm": 1.8148256288035343, + "learning_rate": 7.181079412823695e-06, + "loss": 0.3852, + "step": 12951 + }, + { + "epoch": 0.38, + "grad_norm": 1.2901711361217167, + "learning_rate": 7.180656740507538e-06, + "loss": 0.3555, + "step": 12952 + }, + { + "epoch": 0.38, + "grad_norm": 1.2771426124859873, + "learning_rate": 7.1802340489470634e-06, + "loss": 0.3292, + "step": 12953 + }, + { + "epoch": 0.38, + "grad_norm": 1.6612734582249478, + "learning_rate": 7.179811338145997e-06, + "loss": 0.332, + "step": 12954 + }, + { + "epoch": 0.38, + "grad_norm": 1.3608491898511321, + "learning_rate": 7.1793886081080725e-06, + "loss": 0.3426, + "step": 12955 + }, + { + "epoch": 0.38, + "grad_norm": 1.334107517622103, + "learning_rate": 7.178965858837018e-06, + "loss": 0.3137, + "step": 12956 + }, + { + "epoch": 0.38, + "grad_norm": 1.209531910642284, + "learning_rate": 7.178543090336567e-06, + "loss": 0.3435, + "step": 12957 + }, + { + "epoch": 0.38, + "grad_norm": 1.701577272615997, + "learning_rate": 7.17812030261045e-06, + "loss": 0.3521, + "step": 12958 + }, + { + "epoch": 0.38, + "grad_norm": 0.8909093281883685, + "learning_rate": 7.177697495662395e-06, + "loss": 0.592, + "step": 12959 + }, + { + "epoch": 0.38, + "grad_norm": 1.2612182679189736, + "learning_rate": 7.177274669496136e-06, + "loss": 0.3356, + "step": 12960 + }, + { + "epoch": 0.38, + "grad_norm": 1.3349107103015445, + "learning_rate": 7.176851824115404e-06, + "loss": 0.3317, + "step": 12961 + }, + { + "epoch": 0.38, + "grad_norm": 1.4917830895166264, + "learning_rate": 7.1764289595239315e-06, + "loss": 0.3297, + "step": 12962 + }, + { + "epoch": 0.38, + "grad_norm": 1.5317229885501746, + "learning_rate": 7.176006075725448e-06, + "loss": 0.3498, + "step": 12963 + }, + { + "epoch": 0.38, + "grad_norm": 1.4647163502316842, + "learning_rate": 7.175583172723687e-06, + "loss": 0.3527, + "step": 12964 + }, + { + "epoch": 0.38, + "grad_norm": 1.239979904297628, + "learning_rate": 7.1751602505223796e-06, + "loss": 0.3558, + "step": 12965 + }, + { + "epoch": 0.38, + "grad_norm": 1.2681740765717002, + "learning_rate": 7.174737309125261e-06, + "loss": 0.3412, + "step": 12966 + }, + { + "epoch": 0.38, + "grad_norm": 1.2237243873835557, + "learning_rate": 7.17431434853606e-06, + "loss": 0.3273, + "step": 12967 + }, + { + "epoch": 0.38, + "grad_norm": 1.4075438424748405, + "learning_rate": 7.17389136875851e-06, + "loss": 0.3353, + "step": 12968 + }, + { + "epoch": 0.38, + "grad_norm": 1.7683138254415156, + "learning_rate": 7.173468369796346e-06, + "loss": 0.3742, + "step": 12969 + }, + { + "epoch": 0.38, + "grad_norm": 3.1094067014260003, + "learning_rate": 7.173045351653298e-06, + "loss": 0.355, + "step": 12970 + }, + { + "epoch": 0.38, + "grad_norm": 1.2417621181519793, + "learning_rate": 7.172622314333103e-06, + "loss": 0.3229, + "step": 12971 + }, + { + "epoch": 0.38, + "grad_norm": 1.3618627718299403, + "learning_rate": 7.172199257839492e-06, + "loss": 0.3483, + "step": 12972 + }, + { + "epoch": 0.38, + "grad_norm": 0.9380153284993038, + "learning_rate": 7.171776182176196e-06, + "loss": 0.6033, + "step": 12973 + }, + { + "epoch": 0.38, + "grad_norm": 1.5023003714972578, + "learning_rate": 7.1713530873469526e-06, + "loss": 0.3272, + "step": 12974 + }, + { + "epoch": 0.38, + "grad_norm": 1.3982391360077995, + "learning_rate": 7.170929973355493e-06, + "loss": 0.3494, + "step": 12975 + }, + { + "epoch": 0.38, + "grad_norm": 1.2421315329862201, + "learning_rate": 7.170506840205554e-06, + "loss": 0.3358, + "step": 12976 + }, + { + "epoch": 0.38, + "grad_norm": 1.2373512206350095, + "learning_rate": 7.170083687900867e-06, + "loss": 0.3247, + "step": 12977 + }, + { + "epoch": 0.38, + "grad_norm": 1.3329419305938062, + "learning_rate": 7.169660516445167e-06, + "loss": 0.3184, + "step": 12978 + }, + { + "epoch": 0.38, + "grad_norm": 1.2445727474027426, + "learning_rate": 7.16923732584219e-06, + "loss": 0.3306, + "step": 12979 + }, + { + "epoch": 0.38, + "grad_norm": 1.294797197629789, + "learning_rate": 7.16881411609567e-06, + "loss": 0.3398, + "step": 12980 + }, + { + "epoch": 0.38, + "grad_norm": 1.2645187775241016, + "learning_rate": 7.16839088720934e-06, + "loss": 0.3387, + "step": 12981 + }, + { + "epoch": 0.38, + "grad_norm": 1.7068576738325258, + "learning_rate": 7.167967639186936e-06, + "loss": 0.3268, + "step": 12982 + }, + { + "epoch": 0.38, + "grad_norm": 1.3141911504581183, + "learning_rate": 7.167544372032193e-06, + "loss": 0.3553, + "step": 12983 + }, + { + "epoch": 0.38, + "grad_norm": 1.4380093084795162, + "learning_rate": 7.16712108574885e-06, + "loss": 0.3405, + "step": 12984 + }, + { + "epoch": 0.38, + "grad_norm": 1.2476245929008702, + "learning_rate": 7.166697780340637e-06, + "loss": 0.3379, + "step": 12985 + }, + { + "epoch": 0.38, + "grad_norm": 1.505357736775807, + "learning_rate": 7.166274455811293e-06, + "loss": 0.3454, + "step": 12986 + }, + { + "epoch": 0.38, + "grad_norm": 1.3166920803535616, + "learning_rate": 7.165851112164552e-06, + "loss": 0.3174, + "step": 12987 + }, + { + "epoch": 0.38, + "grad_norm": 1.198648600989502, + "learning_rate": 7.165427749404151e-06, + "loss": 0.3011, + "step": 12988 + }, + { + "epoch": 0.38, + "grad_norm": 1.3095203535875886, + "learning_rate": 7.1650043675338276e-06, + "loss": 0.3355, + "step": 12989 + }, + { + "epoch": 0.38, + "grad_norm": 1.2461271249032897, + "learning_rate": 7.164580966557315e-06, + "loss": 0.3317, + "step": 12990 + }, + { + "epoch": 0.38, + "grad_norm": 1.3182963916209733, + "learning_rate": 7.164157546478352e-06, + "loss": 0.3399, + "step": 12991 + }, + { + "epoch": 0.38, + "grad_norm": 2.2201931710052722, + "learning_rate": 7.163734107300674e-06, + "loss": 0.3254, + "step": 12992 + }, + { + "epoch": 0.38, + "grad_norm": 1.37015443533249, + "learning_rate": 7.1633106490280195e-06, + "loss": 0.3363, + "step": 12993 + }, + { + "epoch": 0.38, + "grad_norm": 1.3711912305771643, + "learning_rate": 7.162887171664124e-06, + "loss": 0.3396, + "step": 12994 + }, + { + "epoch": 0.38, + "grad_norm": 2.2216192429266783, + "learning_rate": 7.162463675212726e-06, + "loss": 0.3078, + "step": 12995 + }, + { + "epoch": 0.38, + "grad_norm": 1.24623809325777, + "learning_rate": 7.16204015967756e-06, + "loss": 0.3153, + "step": 12996 + }, + { + "epoch": 0.38, + "grad_norm": 1.5176441221239314, + "learning_rate": 7.161616625062367e-06, + "loss": 0.3274, + "step": 12997 + }, + { + "epoch": 0.38, + "grad_norm": 1.2966233332884907, + "learning_rate": 7.161193071370884e-06, + "loss": 0.3747, + "step": 12998 + }, + { + "epoch": 0.38, + "grad_norm": 1.339390482326032, + "learning_rate": 7.160769498606847e-06, + "loss": 0.3407, + "step": 12999 + }, + { + "epoch": 0.38, + "grad_norm": 1.3166405804526435, + "learning_rate": 7.160345906773995e-06, + "loss": 0.3347, + "step": 13000 + }, + { + "epoch": 0.38, + "grad_norm": 1.379928988658206, + "learning_rate": 7.1599222958760675e-06, + "loss": 0.3181, + "step": 13001 + }, + { + "epoch": 0.38, + "grad_norm": 1.48563721520913, + "learning_rate": 7.159498665916801e-06, + "loss": 0.3446, + "step": 13002 + }, + { + "epoch": 0.38, + "grad_norm": 1.1673174149222016, + "learning_rate": 7.159075016899936e-06, + "loss": 0.3178, + "step": 13003 + }, + { + "epoch": 0.38, + "grad_norm": 1.2988229848410793, + "learning_rate": 7.158651348829209e-06, + "loss": 0.335, + "step": 13004 + }, + { + "epoch": 0.38, + "grad_norm": 1.4817424428895558, + "learning_rate": 7.158227661708359e-06, + "loss": 0.3547, + "step": 13005 + }, + { + "epoch": 0.38, + "grad_norm": 1.2049499967223185, + "learning_rate": 7.157803955541127e-06, + "loss": 0.322, + "step": 13006 + }, + { + "epoch": 0.38, + "grad_norm": 1.3425121513162226, + "learning_rate": 7.1573802303312525e-06, + "loss": 0.324, + "step": 13007 + }, + { + "epoch": 0.38, + "grad_norm": 1.201589079534406, + "learning_rate": 7.1569564860824715e-06, + "loss": 0.3259, + "step": 13008 + }, + { + "epoch": 0.38, + "grad_norm": 1.7809587733720957, + "learning_rate": 7.156532722798526e-06, + "loss": 0.3488, + "step": 13009 + }, + { + "epoch": 0.38, + "grad_norm": 1.190340975184748, + "learning_rate": 7.156108940483155e-06, + "loss": 0.3171, + "step": 13010 + }, + { + "epoch": 0.38, + "grad_norm": 1.4291781656301268, + "learning_rate": 7.155685139140098e-06, + "loss": 0.3332, + "step": 13011 + }, + { + "epoch": 0.38, + "grad_norm": 1.269657665976152, + "learning_rate": 7.155261318773099e-06, + "loss": 0.3339, + "step": 13012 + }, + { + "epoch": 0.38, + "grad_norm": 1.2596727355043307, + "learning_rate": 7.154837479385892e-06, + "loss": 0.3208, + "step": 13013 + }, + { + "epoch": 0.38, + "grad_norm": 1.3471012494697663, + "learning_rate": 7.154413620982222e-06, + "loss": 0.3501, + "step": 13014 + }, + { + "epoch": 0.38, + "grad_norm": 1.3563928207302045, + "learning_rate": 7.153989743565827e-06, + "loss": 0.3165, + "step": 13015 + }, + { + "epoch": 0.38, + "grad_norm": 1.3212314745234972, + "learning_rate": 7.153565847140449e-06, + "loss": 0.3212, + "step": 13016 + }, + { + "epoch": 0.38, + "grad_norm": 1.4577873523754177, + "learning_rate": 7.153141931709828e-06, + "loss": 0.3126, + "step": 13017 + }, + { + "epoch": 0.38, + "grad_norm": 1.2912766603260128, + "learning_rate": 7.152717997277706e-06, + "loss": 0.331, + "step": 13018 + }, + { + "epoch": 0.38, + "grad_norm": 1.2466876407501637, + "learning_rate": 7.152294043847823e-06, + "loss": 0.3444, + "step": 13019 + }, + { + "epoch": 0.38, + "grad_norm": 1.213032553591892, + "learning_rate": 7.151870071423922e-06, + "loss": 0.3171, + "step": 13020 + }, + { + "epoch": 0.38, + "grad_norm": 1.356342138558467, + "learning_rate": 7.151446080009744e-06, + "loss": 0.3183, + "step": 13021 + }, + { + "epoch": 0.38, + "grad_norm": 1.472556954821218, + "learning_rate": 7.1510220696090295e-06, + "loss": 0.3305, + "step": 13022 + }, + { + "epoch": 0.38, + "grad_norm": 1.5487427582131237, + "learning_rate": 7.150598040225521e-06, + "loss": 0.3241, + "step": 13023 + }, + { + "epoch": 0.38, + "grad_norm": 1.364359321497854, + "learning_rate": 7.150173991862962e-06, + "loss": 0.3514, + "step": 13024 + }, + { + "epoch": 0.38, + "grad_norm": 1.2228265388403252, + "learning_rate": 7.149749924525094e-06, + "loss": 0.3216, + "step": 13025 + }, + { + "epoch": 0.38, + "grad_norm": 1.255067981561485, + "learning_rate": 7.149325838215659e-06, + "loss": 0.3416, + "step": 13026 + }, + { + "epoch": 0.38, + "grad_norm": 1.242780042573641, + "learning_rate": 7.148901732938399e-06, + "loss": 0.3272, + "step": 13027 + }, + { + "epoch": 0.38, + "grad_norm": 1.4723129710675162, + "learning_rate": 7.148477608697058e-06, + "loss": 0.3542, + "step": 13028 + }, + { + "epoch": 0.38, + "grad_norm": 0.8721912884738673, + "learning_rate": 7.148053465495377e-06, + "loss": 0.5903, + "step": 13029 + }, + { + "epoch": 0.38, + "grad_norm": 1.4136648385261084, + "learning_rate": 7.147629303337101e-06, + "loss": 0.3434, + "step": 13030 + }, + { + "epoch": 0.38, + "grad_norm": 2.561495162638032, + "learning_rate": 7.1472051222259745e-06, + "loss": 0.3251, + "step": 13031 + }, + { + "epoch": 0.38, + "grad_norm": 1.384619345553637, + "learning_rate": 7.146780922165737e-06, + "loss": 0.3673, + "step": 13032 + }, + { + "epoch": 0.38, + "grad_norm": 1.20099344944075, + "learning_rate": 7.146356703160133e-06, + "loss": 0.3197, + "step": 13033 + }, + { + "epoch": 0.38, + "grad_norm": 1.2920327282004431, + "learning_rate": 7.145932465212909e-06, + "loss": 0.334, + "step": 13034 + }, + { + "epoch": 0.38, + "grad_norm": 1.7518774732828288, + "learning_rate": 7.145508208327807e-06, + "loss": 0.3685, + "step": 13035 + }, + { + "epoch": 0.38, + "grad_norm": 1.6739086607017533, + "learning_rate": 7.145083932508571e-06, + "loss": 0.3301, + "step": 13036 + }, + { + "epoch": 0.38, + "grad_norm": 1.3339138452299457, + "learning_rate": 7.144659637758945e-06, + "loss": 0.3354, + "step": 13037 + }, + { + "epoch": 0.38, + "grad_norm": 1.2676153122273788, + "learning_rate": 7.144235324082675e-06, + "loss": 0.3406, + "step": 13038 + }, + { + "epoch": 0.38, + "grad_norm": 1.2254513270258107, + "learning_rate": 7.143810991483504e-06, + "loss": 0.3251, + "step": 13039 + }, + { + "epoch": 0.38, + "grad_norm": 1.4997274353442756, + "learning_rate": 7.1433866399651775e-06, + "loss": 0.3336, + "step": 13040 + }, + { + "epoch": 0.38, + "grad_norm": 1.2571114327530062, + "learning_rate": 7.142962269531439e-06, + "loss": 0.3265, + "step": 13041 + }, + { + "epoch": 0.38, + "grad_norm": 1.2814761069483078, + "learning_rate": 7.1425378801860355e-06, + "loss": 0.3317, + "step": 13042 + }, + { + "epoch": 0.38, + "grad_norm": 1.348219608705566, + "learning_rate": 7.142113471932711e-06, + "loss": 0.3207, + "step": 13043 + }, + { + "epoch": 0.38, + "grad_norm": 1.2322836292401569, + "learning_rate": 7.141689044775213e-06, + "loss": 0.3244, + "step": 13044 + }, + { + "epoch": 0.38, + "grad_norm": 1.0029254523052515, + "learning_rate": 7.141264598717284e-06, + "loss": 0.6294, + "step": 13045 + }, + { + "epoch": 0.38, + "grad_norm": 1.2487533300090303, + "learning_rate": 7.140840133762671e-06, + "loss": 0.3163, + "step": 13046 + }, + { + "epoch": 0.38, + "grad_norm": 1.2628588514595598, + "learning_rate": 7.14041564991512e-06, + "loss": 0.3381, + "step": 13047 + }, + { + "epoch": 0.38, + "grad_norm": 1.246725165343446, + "learning_rate": 7.139991147178378e-06, + "loss": 0.3368, + "step": 13048 + }, + { + "epoch": 0.38, + "grad_norm": 1.317187700105046, + "learning_rate": 7.13956662555619e-06, + "loss": 0.3363, + "step": 13049 + }, + { + "epoch": 0.38, + "grad_norm": 1.6276240201364323, + "learning_rate": 7.139142085052302e-06, + "loss": 0.3402, + "step": 13050 + }, + { + "epoch": 0.38, + "grad_norm": 1.2875395277869248, + "learning_rate": 7.138717525670462e-06, + "loss": 0.3279, + "step": 13051 + }, + { + "epoch": 0.38, + "grad_norm": 1.3590248778138687, + "learning_rate": 7.138292947414416e-06, + "loss": 0.3161, + "step": 13052 + }, + { + "epoch": 0.38, + "grad_norm": 1.6493573740358891, + "learning_rate": 7.137868350287912e-06, + "loss": 0.3348, + "step": 13053 + }, + { + "epoch": 0.38, + "grad_norm": 1.4516485508226762, + "learning_rate": 7.137443734294695e-06, + "loss": 0.3557, + "step": 13054 + }, + { + "epoch": 0.38, + "grad_norm": 1.4602151896879092, + "learning_rate": 7.1370190994385136e-06, + "loss": 0.3361, + "step": 13055 + }, + { + "epoch": 0.38, + "grad_norm": 1.2599723490465895, + "learning_rate": 7.136594445723115e-06, + "loss": 0.3322, + "step": 13056 + }, + { + "epoch": 0.38, + "grad_norm": 1.2483273529520742, + "learning_rate": 7.1361697731522464e-06, + "loss": 0.3444, + "step": 13057 + }, + { + "epoch": 0.38, + "grad_norm": 1.8353633964456746, + "learning_rate": 7.135745081729657e-06, + "loss": 0.3441, + "step": 13058 + }, + { + "epoch": 0.38, + "grad_norm": 1.2291083193783594, + "learning_rate": 7.135320371459092e-06, + "loss": 0.348, + "step": 13059 + }, + { + "epoch": 0.38, + "grad_norm": 1.210082165186734, + "learning_rate": 7.134895642344301e-06, + "loss": 0.3279, + "step": 13060 + }, + { + "epoch": 0.38, + "grad_norm": 1.8339470448792579, + "learning_rate": 7.134470894389034e-06, + "loss": 0.323, + "step": 13061 + }, + { + "epoch": 0.38, + "grad_norm": 1.35847510795129, + "learning_rate": 7.134046127597037e-06, + "loss": 0.3267, + "step": 13062 + }, + { + "epoch": 0.38, + "grad_norm": 1.8141025852608128, + "learning_rate": 7.133621341972059e-06, + "loss": 0.357, + "step": 13063 + }, + { + "epoch": 0.38, + "grad_norm": 1.2695426970594779, + "learning_rate": 7.133196537517848e-06, + "loss": 0.3184, + "step": 13064 + }, + { + "epoch": 0.38, + "grad_norm": 1.3450814216893852, + "learning_rate": 7.1327717142381546e-06, + "loss": 0.3325, + "step": 13065 + }, + { + "epoch": 0.38, + "grad_norm": 1.2321189317815373, + "learning_rate": 7.132346872136727e-06, + "loss": 0.3335, + "step": 13066 + }, + { + "epoch": 0.38, + "grad_norm": 1.8464789986642156, + "learning_rate": 7.131922011217316e-06, + "loss": 0.3428, + "step": 13067 + }, + { + "epoch": 0.38, + "grad_norm": 1.243919984925331, + "learning_rate": 7.131497131483669e-06, + "loss": 0.3114, + "step": 13068 + }, + { + "epoch": 0.38, + "grad_norm": 1.2657461072407912, + "learning_rate": 7.131072232939535e-06, + "loss": 0.3226, + "step": 13069 + }, + { + "epoch": 0.38, + "grad_norm": 1.3024479501719117, + "learning_rate": 7.130647315588666e-06, + "loss": 0.3197, + "step": 13070 + }, + { + "epoch": 0.38, + "grad_norm": 1.5938271751204043, + "learning_rate": 7.13022237943481e-06, + "loss": 0.3205, + "step": 13071 + }, + { + "epoch": 0.38, + "grad_norm": 1.3305007118543446, + "learning_rate": 7.129797424481719e-06, + "loss": 0.3458, + "step": 13072 + }, + { + "epoch": 0.38, + "grad_norm": 1.3318345331532715, + "learning_rate": 7.129372450733142e-06, + "loss": 0.3157, + "step": 13073 + }, + { + "epoch": 0.38, + "grad_norm": 1.278115601259114, + "learning_rate": 7.1289474581928275e-06, + "loss": 0.3342, + "step": 13074 + }, + { + "epoch": 0.38, + "grad_norm": 1.2956267205429657, + "learning_rate": 7.12852244686453e-06, + "loss": 0.3276, + "step": 13075 + }, + { + "epoch": 0.38, + "grad_norm": 1.6215197478253789, + "learning_rate": 7.128097416751998e-06, + "loss": 0.3383, + "step": 13076 + }, + { + "epoch": 0.38, + "grad_norm": 1.4542668891409625, + "learning_rate": 7.127672367858982e-06, + "loss": 0.324, + "step": 13077 + }, + { + "epoch": 0.38, + "grad_norm": 1.4621229491813266, + "learning_rate": 7.127247300189235e-06, + "loss": 0.3678, + "step": 13078 + }, + { + "epoch": 0.38, + "grad_norm": 1.2820877177708097, + "learning_rate": 7.126822213746505e-06, + "loss": 0.348, + "step": 13079 + }, + { + "epoch": 0.38, + "grad_norm": 1.253098353217689, + "learning_rate": 7.126397108534547e-06, + "loss": 0.317, + "step": 13080 + }, + { + "epoch": 0.38, + "grad_norm": 1.2442022538899835, + "learning_rate": 7.1259719845571106e-06, + "loss": 0.3162, + "step": 13081 + }, + { + "epoch": 0.38, + "grad_norm": 1.8110330267538794, + "learning_rate": 7.125546841817947e-06, + "loss": 0.3371, + "step": 13082 + }, + { + "epoch": 0.38, + "grad_norm": 1.44693557279172, + "learning_rate": 7.12512168032081e-06, + "loss": 0.3409, + "step": 13083 + }, + { + "epoch": 0.38, + "grad_norm": 1.1935168056865042, + "learning_rate": 7.124696500069451e-06, + "loss": 0.3299, + "step": 13084 + }, + { + "epoch": 0.38, + "grad_norm": 1.589377411969565, + "learning_rate": 7.124271301067621e-06, + "loss": 0.347, + "step": 13085 + }, + { + "epoch": 0.38, + "grad_norm": 1.6292453906486184, + "learning_rate": 7.123846083319074e-06, + "loss": 0.3503, + "step": 13086 + }, + { + "epoch": 0.38, + "grad_norm": 1.7077363742331677, + "learning_rate": 7.12342084682756e-06, + "loss": 0.3398, + "step": 13087 + }, + { + "epoch": 0.38, + "grad_norm": 1.2265365676443587, + "learning_rate": 7.1229955915968355e-06, + "loss": 0.3381, + "step": 13088 + }, + { + "epoch": 0.38, + "grad_norm": 1.2984490685707482, + "learning_rate": 7.122570317630649e-06, + "loss": 0.3373, + "step": 13089 + }, + { + "epoch": 0.38, + "grad_norm": 0.8749182606358762, + "learning_rate": 7.122145024932758e-06, + "loss": 0.5651, + "step": 13090 + }, + { + "epoch": 0.38, + "grad_norm": 1.3867061067261763, + "learning_rate": 7.121719713506913e-06, + "loss": 0.3872, + "step": 13091 + }, + { + "epoch": 0.38, + "grad_norm": 1.3399116301078797, + "learning_rate": 7.121294383356868e-06, + "loss": 0.3081, + "step": 13092 + }, + { + "epoch": 0.38, + "grad_norm": 1.2315082575623049, + "learning_rate": 7.1208690344863765e-06, + "loss": 0.3556, + "step": 13093 + }, + { + "epoch": 0.38, + "grad_norm": 1.316361005804351, + "learning_rate": 7.120443666899193e-06, + "loss": 0.3644, + "step": 13094 + }, + { + "epoch": 0.38, + "grad_norm": 1.1313287110569357, + "learning_rate": 7.120018280599069e-06, + "loss": 0.3201, + "step": 13095 + }, + { + "epoch": 0.38, + "grad_norm": 1.3152453639514838, + "learning_rate": 7.11959287558976e-06, + "loss": 0.3188, + "step": 13096 + }, + { + "epoch": 0.38, + "grad_norm": 1.486082423314219, + "learning_rate": 7.119167451875022e-06, + "loss": 0.3402, + "step": 13097 + }, + { + "epoch": 0.38, + "grad_norm": 1.2297922642017973, + "learning_rate": 7.118742009458607e-06, + "loss": 0.3307, + "step": 13098 + }, + { + "epoch": 0.38, + "grad_norm": 1.345357681963785, + "learning_rate": 7.118316548344271e-06, + "loss": 0.3304, + "step": 13099 + }, + { + "epoch": 0.38, + "grad_norm": 1.3894881461962219, + "learning_rate": 7.117891068535767e-06, + "loss": 0.3428, + "step": 13100 + }, + { + "epoch": 0.38, + "grad_norm": 1.921112661071267, + "learning_rate": 7.117465570036851e-06, + "loss": 0.3257, + "step": 13101 + }, + { + "epoch": 0.38, + "grad_norm": 1.2859649586301336, + "learning_rate": 7.117040052851277e-06, + "loss": 0.3403, + "step": 13102 + }, + { + "epoch": 0.38, + "grad_norm": 1.5589223615522085, + "learning_rate": 7.1166145169828025e-06, + "loss": 0.3573, + "step": 13103 + }, + { + "epoch": 0.38, + "grad_norm": 2.0769249166625556, + "learning_rate": 7.116188962435182e-06, + "loss": 0.3339, + "step": 13104 + }, + { + "epoch": 0.38, + "grad_norm": 1.3014806932488652, + "learning_rate": 7.115763389212168e-06, + "loss": 0.3357, + "step": 13105 + }, + { + "epoch": 0.38, + "grad_norm": 1.1924459039825948, + "learning_rate": 7.11533779731752e-06, + "loss": 0.3153, + "step": 13106 + }, + { + "epoch": 0.38, + "grad_norm": 1.7406621577529042, + "learning_rate": 7.114912186754992e-06, + "loss": 0.3176, + "step": 13107 + }, + { + "epoch": 0.38, + "grad_norm": 1.4104627015783318, + "learning_rate": 7.1144865575283415e-06, + "loss": 0.3413, + "step": 13108 + }, + { + "epoch": 0.38, + "grad_norm": 1.4466737216073562, + "learning_rate": 7.114060909641322e-06, + "loss": 0.3374, + "step": 13109 + }, + { + "epoch": 0.38, + "grad_norm": 1.330701847047944, + "learning_rate": 7.113635243097694e-06, + "loss": 0.3414, + "step": 13110 + }, + { + "epoch": 0.38, + "grad_norm": 1.32235235052037, + "learning_rate": 7.11320955790121e-06, + "loss": 0.3518, + "step": 13111 + }, + { + "epoch": 0.38, + "grad_norm": 1.575310453127615, + "learning_rate": 7.112783854055628e-06, + "loss": 0.3296, + "step": 13112 + }, + { + "epoch": 0.38, + "grad_norm": 1.2008677724854635, + "learning_rate": 7.112358131564706e-06, + "loss": 0.3066, + "step": 13113 + }, + { + "epoch": 0.38, + "grad_norm": 1.3832107468036083, + "learning_rate": 7.111932390432199e-06, + "loss": 0.333, + "step": 13114 + }, + { + "epoch": 0.38, + "grad_norm": 1.2377068460337741, + "learning_rate": 7.111506630661867e-06, + "loss": 0.3406, + "step": 13115 + }, + { + "epoch": 0.38, + "grad_norm": 1.4981650581818111, + "learning_rate": 7.111080852257462e-06, + "loss": 0.3289, + "step": 13116 + }, + { + "epoch": 0.38, + "grad_norm": 1.3242197982262263, + "learning_rate": 7.110655055222748e-06, + "loss": 0.3267, + "step": 13117 + }, + { + "epoch": 0.38, + "grad_norm": 1.1896521858914617, + "learning_rate": 7.1102292395614795e-06, + "loss": 0.3324, + "step": 13118 + }, + { + "epoch": 0.38, + "grad_norm": 1.6428666680693658, + "learning_rate": 7.109803405277413e-06, + "loss": 0.3521, + "step": 13119 + }, + { + "epoch": 0.38, + "grad_norm": 1.3192323945377071, + "learning_rate": 7.109377552374311e-06, + "loss": 0.3435, + "step": 13120 + }, + { + "epoch": 0.38, + "grad_norm": 1.372587256210232, + "learning_rate": 7.108951680855924e-06, + "loss": 0.356, + "step": 13121 + }, + { + "epoch": 0.38, + "grad_norm": 1.5106571532723676, + "learning_rate": 7.108525790726019e-06, + "loss": 0.3257, + "step": 13122 + }, + { + "epoch": 0.38, + "grad_norm": 1.318979552746858, + "learning_rate": 7.10809988198835e-06, + "loss": 0.3386, + "step": 13123 + }, + { + "epoch": 0.38, + "grad_norm": 1.7484545535550229, + "learning_rate": 7.107673954646675e-06, + "loss": 0.3486, + "step": 13124 + }, + { + "epoch": 0.38, + "grad_norm": 1.2802700095191843, + "learning_rate": 7.107248008704754e-06, + "loss": 0.3662, + "step": 13125 + }, + { + "epoch": 0.38, + "grad_norm": 1.360454238605446, + "learning_rate": 7.106822044166347e-06, + "loss": 0.3064, + "step": 13126 + }, + { + "epoch": 0.38, + "grad_norm": 1.2454173291528885, + "learning_rate": 7.106396061035212e-06, + "loss": 0.3752, + "step": 13127 + }, + { + "epoch": 0.38, + "grad_norm": 1.3660260802671753, + "learning_rate": 7.105970059315108e-06, + "loss": 0.3499, + "step": 13128 + }, + { + "epoch": 0.38, + "grad_norm": 1.1728940574388964, + "learning_rate": 7.105544039009794e-06, + "loss": 0.3338, + "step": 13129 + }, + { + "epoch": 0.38, + "grad_norm": 1.3212999711717126, + "learning_rate": 7.1051180001230316e-06, + "loss": 0.3365, + "step": 13130 + }, + { + "epoch": 0.38, + "grad_norm": 1.3043698091229181, + "learning_rate": 7.104691942658579e-06, + "loss": 0.3341, + "step": 13131 + }, + { + "epoch": 0.38, + "grad_norm": 1.2515661494521975, + "learning_rate": 7.104265866620197e-06, + "loss": 0.3406, + "step": 13132 + }, + { + "epoch": 0.38, + "grad_norm": 1.1952020820415843, + "learning_rate": 7.1038397720116445e-06, + "loss": 0.3296, + "step": 13133 + }, + { + "epoch": 0.38, + "grad_norm": 1.2454172948288165, + "learning_rate": 7.1034136588366846e-06, + "loss": 0.312, + "step": 13134 + }, + { + "epoch": 0.38, + "grad_norm": 1.2383236763016574, + "learning_rate": 7.102987527099075e-06, + "loss": 0.3308, + "step": 13135 + }, + { + "epoch": 0.38, + "grad_norm": 1.5089924984124659, + "learning_rate": 7.102561376802576e-06, + "loss": 0.3279, + "step": 13136 + }, + { + "epoch": 0.38, + "grad_norm": 2.5720951463366246, + "learning_rate": 7.10213520795095e-06, + "loss": 0.3237, + "step": 13137 + }, + { + "epoch": 0.38, + "grad_norm": 1.4602280272349726, + "learning_rate": 7.101709020547958e-06, + "loss": 0.3272, + "step": 13138 + }, + { + "epoch": 0.38, + "grad_norm": 1.5641616888882937, + "learning_rate": 7.101282814597359e-06, + "loss": 0.3204, + "step": 13139 + }, + { + "epoch": 0.38, + "grad_norm": 1.4493738985151647, + "learning_rate": 7.100856590102918e-06, + "loss": 0.3375, + "step": 13140 + }, + { + "epoch": 0.38, + "grad_norm": 1.550647675058278, + "learning_rate": 7.100430347068394e-06, + "loss": 0.3423, + "step": 13141 + }, + { + "epoch": 0.38, + "grad_norm": 1.3363178531657163, + "learning_rate": 7.100004085497548e-06, + "loss": 0.323, + "step": 13142 + }, + { + "epoch": 0.38, + "grad_norm": 2.1802053883125048, + "learning_rate": 7.099577805394142e-06, + "loss": 0.3529, + "step": 13143 + }, + { + "epoch": 0.38, + "grad_norm": 1.279966691591949, + "learning_rate": 7.09915150676194e-06, + "loss": 0.374, + "step": 13144 + }, + { + "epoch": 0.38, + "grad_norm": 1.5140108945031057, + "learning_rate": 7.098725189604703e-06, + "loss": 0.3649, + "step": 13145 + }, + { + "epoch": 0.38, + "grad_norm": 1.2883769465764026, + "learning_rate": 7.098298853926192e-06, + "loss": 0.3209, + "step": 13146 + }, + { + "epoch": 0.38, + "grad_norm": 1.221766710507419, + "learning_rate": 7.097872499730169e-06, + "loss": 0.3248, + "step": 13147 + }, + { + "epoch": 0.38, + "grad_norm": 1.4132751933116643, + "learning_rate": 7.0974461270204e-06, + "loss": 0.3617, + "step": 13148 + }, + { + "epoch": 0.38, + "grad_norm": 1.2706166400618166, + "learning_rate": 7.097019735800645e-06, + "loss": 0.3176, + "step": 13149 + }, + { + "epoch": 0.38, + "grad_norm": 1.2312185556351243, + "learning_rate": 7.096593326074668e-06, + "loss": 0.3157, + "step": 13150 + }, + { + "epoch": 0.38, + "grad_norm": 1.3432362960764601, + "learning_rate": 7.09616689784623e-06, + "loss": 0.3411, + "step": 13151 + }, + { + "epoch": 0.38, + "grad_norm": 1.4097566124234222, + "learning_rate": 7.095740451119098e-06, + "loss": 0.3517, + "step": 13152 + }, + { + "epoch": 0.38, + "grad_norm": 1.4435281070095816, + "learning_rate": 7.095313985897033e-06, + "loss": 0.3357, + "step": 13153 + }, + { + "epoch": 0.38, + "grad_norm": 1.6508385696341807, + "learning_rate": 7.094887502183798e-06, + "loss": 0.319, + "step": 13154 + }, + { + "epoch": 0.38, + "grad_norm": 1.3358342868152226, + "learning_rate": 7.09446099998316e-06, + "loss": 0.3162, + "step": 13155 + }, + { + "epoch": 0.38, + "grad_norm": 1.403753950356904, + "learning_rate": 7.094034479298877e-06, + "loss": 0.3297, + "step": 13156 + }, + { + "epoch": 0.38, + "grad_norm": 1.1966613783406268, + "learning_rate": 7.093607940134717e-06, + "loss": 0.3503, + "step": 13157 + }, + { + "epoch": 0.38, + "grad_norm": 1.547865767262918, + "learning_rate": 7.093181382494445e-06, + "loss": 0.3678, + "step": 13158 + }, + { + "epoch": 0.38, + "grad_norm": 1.4649155646442726, + "learning_rate": 7.092754806381823e-06, + "loss": 0.3518, + "step": 13159 + }, + { + "epoch": 0.38, + "grad_norm": 1.3011096082429183, + "learning_rate": 7.092328211800618e-06, + "loss": 0.3276, + "step": 13160 + }, + { + "epoch": 0.38, + "grad_norm": 1.27946531143323, + "learning_rate": 7.091901598754591e-06, + "loss": 0.3411, + "step": 13161 + }, + { + "epoch": 0.38, + "grad_norm": 1.2941652482689021, + "learning_rate": 7.09147496724751e-06, + "loss": 0.3178, + "step": 13162 + }, + { + "epoch": 0.38, + "grad_norm": 1.2858443598264324, + "learning_rate": 7.091048317283141e-06, + "loss": 0.3212, + "step": 13163 + }, + { + "epoch": 0.38, + "grad_norm": 1.3587747174604548, + "learning_rate": 7.090621648865246e-06, + "loss": 0.3337, + "step": 13164 + }, + { + "epoch": 0.38, + "grad_norm": 1.1949904697501903, + "learning_rate": 7.0901949619975915e-06, + "loss": 0.3219, + "step": 13165 + }, + { + "epoch": 0.38, + "grad_norm": 1.533750025103081, + "learning_rate": 7.089768256683945e-06, + "loss": 0.3223, + "step": 13166 + }, + { + "epoch": 0.38, + "grad_norm": 1.1986299439343502, + "learning_rate": 7.089341532928069e-06, + "loss": 0.3385, + "step": 13167 + }, + { + "epoch": 0.38, + "grad_norm": 0.9916550996040359, + "learning_rate": 7.088914790733731e-06, + "loss": 0.5828, + "step": 13168 + }, + { + "epoch": 0.38, + "grad_norm": 1.980431505231923, + "learning_rate": 7.088488030104697e-06, + "loss": 0.3388, + "step": 13169 + }, + { + "epoch": 0.38, + "grad_norm": 1.4286843545333578, + "learning_rate": 7.088061251044732e-06, + "loss": 0.3201, + "step": 13170 + }, + { + "epoch": 0.38, + "grad_norm": 1.341047197615756, + "learning_rate": 7.0876344535576035e-06, + "loss": 0.3103, + "step": 13171 + }, + { + "epoch": 0.38, + "grad_norm": 1.2594588755524798, + "learning_rate": 7.08720763764708e-06, + "loss": 0.3379, + "step": 13172 + }, + { + "epoch": 0.38, + "grad_norm": 1.306017161084974, + "learning_rate": 7.086780803316924e-06, + "loss": 0.3504, + "step": 13173 + }, + { + "epoch": 0.38, + "grad_norm": 1.4716509766714465, + "learning_rate": 7.086353950570904e-06, + "loss": 0.3526, + "step": 13174 + }, + { + "epoch": 0.38, + "grad_norm": 1.2781265306587017, + "learning_rate": 7.085927079412788e-06, + "loss": 0.3211, + "step": 13175 + }, + { + "epoch": 0.38, + "grad_norm": 1.3927786409091183, + "learning_rate": 7.0855001898463425e-06, + "loss": 0.3378, + "step": 13176 + }, + { + "epoch": 0.38, + "grad_norm": 0.9687519190444636, + "learning_rate": 7.085073281875334e-06, + "loss": 0.6022, + "step": 13177 + }, + { + "epoch": 0.38, + "grad_norm": 1.3832587550997775, + "learning_rate": 7.08464635550353e-06, + "loss": 0.3423, + "step": 13178 + }, + { + "epoch": 0.38, + "grad_norm": 1.2933384074469434, + "learning_rate": 7.084219410734701e-06, + "loss": 0.3358, + "step": 13179 + }, + { + "epoch": 0.38, + "grad_norm": 1.4902392635783641, + "learning_rate": 7.0837924475726104e-06, + "loss": 0.3248, + "step": 13180 + }, + { + "epoch": 0.38, + "grad_norm": 1.3898155648663824, + "learning_rate": 7.083365466021031e-06, + "loss": 0.3373, + "step": 13181 + }, + { + "epoch": 0.38, + "grad_norm": 1.3538469731154033, + "learning_rate": 7.082938466083725e-06, + "loss": 0.3324, + "step": 13182 + }, + { + "epoch": 0.38, + "grad_norm": 0.923642375122179, + "learning_rate": 7.082511447764466e-06, + "loss": 0.6405, + "step": 13183 + }, + { + "epoch": 0.38, + "grad_norm": 1.6985032603601506, + "learning_rate": 7.0820844110670204e-06, + "loss": 0.34, + "step": 13184 + }, + { + "epoch": 0.38, + "grad_norm": 1.3468330771034243, + "learning_rate": 7.081657355995156e-06, + "loss": 0.3193, + "step": 13185 + }, + { + "epoch": 0.38, + "grad_norm": 1.3141905099855309, + "learning_rate": 7.081230282552644e-06, + "loss": 0.3202, + "step": 13186 + }, + { + "epoch": 0.38, + "grad_norm": 1.3205052641032409, + "learning_rate": 7.080803190743249e-06, + "loss": 0.3292, + "step": 13187 + }, + { + "epoch": 0.38, + "grad_norm": 1.4980992321578794, + "learning_rate": 7.080376080570743e-06, + "loss": 0.3265, + "step": 13188 + }, + { + "epoch": 0.38, + "grad_norm": 1.3041479623808592, + "learning_rate": 7.079948952038896e-06, + "loss": 0.3299, + "step": 13189 + }, + { + "epoch": 0.38, + "grad_norm": 1.2274147176567662, + "learning_rate": 7.079521805151478e-06, + "loss": 0.3054, + "step": 13190 + }, + { + "epoch": 0.38, + "grad_norm": 1.271269211156076, + "learning_rate": 7.079094639912255e-06, + "loss": 0.3185, + "step": 13191 + }, + { + "epoch": 0.38, + "grad_norm": 16.204972115364832, + "learning_rate": 7.078667456324999e-06, + "loss": 0.3282, + "step": 13192 + }, + { + "epoch": 0.38, + "grad_norm": 1.330405261964729, + "learning_rate": 7.078240254393479e-06, + "loss": 0.3674, + "step": 13193 + }, + { + "epoch": 0.38, + "grad_norm": 1.2872664608226767, + "learning_rate": 7.077813034121468e-06, + "loss": 0.3073, + "step": 13194 + }, + { + "epoch": 0.38, + "grad_norm": 1.2642767338861631, + "learning_rate": 7.077385795512732e-06, + "loss": 0.3253, + "step": 13195 + }, + { + "epoch": 0.38, + "grad_norm": 1.6403447702656317, + "learning_rate": 7.076958538571044e-06, + "loss": 0.3288, + "step": 13196 + }, + { + "epoch": 0.38, + "grad_norm": 1.2855086878385427, + "learning_rate": 7.076531263300174e-06, + "loss": 0.3323, + "step": 13197 + }, + { + "epoch": 0.38, + "grad_norm": 1.369709122554025, + "learning_rate": 7.076103969703891e-06, + "loss": 0.3281, + "step": 13198 + }, + { + "epoch": 0.38, + "grad_norm": 1.4258357165090214, + "learning_rate": 7.075676657785969e-06, + "loss": 0.3287, + "step": 13199 + }, + { + "epoch": 0.38, + "grad_norm": 1.3835287174619102, + "learning_rate": 7.075249327550178e-06, + "loss": 0.341, + "step": 13200 + }, + { + "epoch": 0.38, + "grad_norm": 1.251837103470609, + "learning_rate": 7.074821979000287e-06, + "loss": 0.331, + "step": 13201 + }, + { + "epoch": 0.38, + "grad_norm": 1.8946516858708955, + "learning_rate": 7.0743946121400695e-06, + "loss": 0.313, + "step": 13202 + }, + { + "epoch": 0.38, + "grad_norm": 1.5671539975414208, + "learning_rate": 7.0739672269732975e-06, + "loss": 0.3414, + "step": 13203 + }, + { + "epoch": 0.38, + "grad_norm": 1.453765106432467, + "learning_rate": 7.073539823503741e-06, + "loss": 0.3241, + "step": 13204 + }, + { + "epoch": 0.38, + "grad_norm": 1.0074036465422982, + "learning_rate": 7.073112401735173e-06, + "loss": 0.6156, + "step": 13205 + }, + { + "epoch": 0.38, + "grad_norm": 1.5026198456804272, + "learning_rate": 7.0726849616713634e-06, + "loss": 0.3385, + "step": 13206 + }, + { + "epoch": 0.38, + "grad_norm": 1.2179523632915408, + "learning_rate": 7.072257503316087e-06, + "loss": 0.3259, + "step": 13207 + }, + { + "epoch": 0.38, + "grad_norm": 1.2700722392442427, + "learning_rate": 7.071830026673117e-06, + "loss": 0.3131, + "step": 13208 + }, + { + "epoch": 0.38, + "grad_norm": 1.3145353001215572, + "learning_rate": 7.071402531746221e-06, + "loss": 0.3716, + "step": 13209 + }, + { + "epoch": 0.38, + "grad_norm": 1.360457985877299, + "learning_rate": 7.070975018539177e-06, + "loss": 0.3368, + "step": 13210 + }, + { + "epoch": 0.38, + "grad_norm": 1.266445192124985, + "learning_rate": 7.0705474870557546e-06, + "loss": 0.3246, + "step": 13211 + }, + { + "epoch": 0.38, + "grad_norm": 1.264927065591425, + "learning_rate": 7.070119937299728e-06, + "loss": 0.3281, + "step": 13212 + }, + { + "epoch": 0.38, + "grad_norm": 1.5023544762687644, + "learning_rate": 7.06969236927487e-06, + "loss": 0.3673, + "step": 13213 + }, + { + "epoch": 0.38, + "grad_norm": 1.5046903650722434, + "learning_rate": 7.0692647829849525e-06, + "loss": 0.3422, + "step": 13214 + }, + { + "epoch": 0.38, + "grad_norm": 1.2565963164292075, + "learning_rate": 7.068837178433752e-06, + "loss": 0.337, + "step": 13215 + }, + { + "epoch": 0.38, + "grad_norm": 1.2558249997878823, + "learning_rate": 7.068409555625038e-06, + "loss": 0.3127, + "step": 13216 + }, + { + "epoch": 0.38, + "grad_norm": 1.8450269337616167, + "learning_rate": 7.067981914562589e-06, + "loss": 0.315, + "step": 13217 + }, + { + "epoch": 0.38, + "grad_norm": 1.3810572907888756, + "learning_rate": 7.067554255250178e-06, + "loss": 0.3568, + "step": 13218 + }, + { + "epoch": 0.38, + "grad_norm": 1.5363780204874262, + "learning_rate": 7.067126577691575e-06, + "loss": 0.3344, + "step": 13219 + }, + { + "epoch": 0.38, + "grad_norm": 1.3949793610918702, + "learning_rate": 7.066698881890558e-06, + "loss": 0.3189, + "step": 13220 + }, + { + "epoch": 0.38, + "grad_norm": 1.2744137525158408, + "learning_rate": 7.0662711678509e-06, + "loss": 0.3318, + "step": 13221 + }, + { + "epoch": 0.38, + "grad_norm": 1.5978355456059008, + "learning_rate": 7.065843435576377e-06, + "loss": 0.3374, + "step": 13222 + }, + { + "epoch": 0.38, + "grad_norm": 1.1875844157581184, + "learning_rate": 7.065415685070762e-06, + "loss": 0.3209, + "step": 13223 + }, + { + "epoch": 0.38, + "grad_norm": 1.2158832039592034, + "learning_rate": 7.064987916337831e-06, + "loss": 0.3181, + "step": 13224 + }, + { + "epoch": 0.38, + "grad_norm": 1.3927833203904962, + "learning_rate": 7.064560129381359e-06, + "loss": 0.3432, + "step": 13225 + }, + { + "epoch": 0.38, + "grad_norm": 1.5455990933824901, + "learning_rate": 7.06413232420512e-06, + "loss": 0.3326, + "step": 13226 + }, + { + "epoch": 0.38, + "grad_norm": 1.3772283777663126, + "learning_rate": 7.063704500812891e-06, + "loss": 0.3396, + "step": 13227 + }, + { + "epoch": 0.38, + "grad_norm": 1.4124226833157498, + "learning_rate": 7.063276659208447e-06, + "loss": 0.3416, + "step": 13228 + }, + { + "epoch": 0.38, + "grad_norm": 1.2622023888417258, + "learning_rate": 7.062848799395563e-06, + "loss": 0.324, + "step": 13229 + }, + { + "epoch": 0.38, + "grad_norm": 1.280994449609511, + "learning_rate": 7.062420921378016e-06, + "loss": 0.3204, + "step": 13230 + }, + { + "epoch": 0.38, + "grad_norm": 1.6684618653772954, + "learning_rate": 7.061993025159582e-06, + "loss": 0.3219, + "step": 13231 + }, + { + "epoch": 0.38, + "grad_norm": 1.467354742337034, + "learning_rate": 7.061565110744036e-06, + "loss": 0.3328, + "step": 13232 + }, + { + "epoch": 0.38, + "grad_norm": 2.8129069655694376, + "learning_rate": 7.061137178135154e-06, + "loss": 0.3298, + "step": 13233 + }, + { + "epoch": 0.38, + "grad_norm": 1.6946737331427697, + "learning_rate": 7.060709227336715e-06, + "loss": 0.3182, + "step": 13234 + }, + { + "epoch": 0.38, + "grad_norm": 0.9815043460552861, + "learning_rate": 7.060281258352492e-06, + "loss": 0.6475, + "step": 13235 + }, + { + "epoch": 0.38, + "grad_norm": 1.3540401728914986, + "learning_rate": 7.059853271186268e-06, + "loss": 0.362, + "step": 13236 + }, + { + "epoch": 0.38, + "grad_norm": 1.3966882153497961, + "learning_rate": 7.059425265841813e-06, + "loss": 0.3302, + "step": 13237 + }, + { + "epoch": 0.38, + "grad_norm": 1.2839616143053614, + "learning_rate": 7.058997242322908e-06, + "loss": 0.3309, + "step": 13238 + }, + { + "epoch": 0.38, + "grad_norm": 1.284632628715223, + "learning_rate": 7.0585692006333305e-06, + "loss": 0.3194, + "step": 13239 + }, + { + "epoch": 0.38, + "grad_norm": 1.3141089648854078, + "learning_rate": 7.058141140776855e-06, + "loss": 0.3289, + "step": 13240 + }, + { + "epoch": 0.38, + "grad_norm": 2.1814520648989206, + "learning_rate": 7.057713062757263e-06, + "loss": 0.355, + "step": 13241 + }, + { + "epoch": 0.38, + "grad_norm": 0.8948462929961819, + "learning_rate": 7.05728496657833e-06, + "loss": 0.5988, + "step": 13242 + }, + { + "epoch": 0.38, + "grad_norm": 1.2492697093363938, + "learning_rate": 7.056856852243833e-06, + "loss": 0.3289, + "step": 13243 + }, + { + "epoch": 0.38, + "grad_norm": 1.2597326851738686, + "learning_rate": 7.056428719757552e-06, + "loss": 0.3334, + "step": 13244 + }, + { + "epoch": 0.38, + "grad_norm": 6.726822035216294, + "learning_rate": 7.056000569123266e-06, + "loss": 0.3553, + "step": 13245 + }, + { + "epoch": 0.38, + "grad_norm": 1.9154049836934555, + "learning_rate": 7.055572400344751e-06, + "loss": 0.3414, + "step": 13246 + }, + { + "epoch": 0.38, + "grad_norm": 1.2313797824348587, + "learning_rate": 7.055144213425786e-06, + "loss": 0.3276, + "step": 13247 + }, + { + "epoch": 0.38, + "grad_norm": 1.477500823281729, + "learning_rate": 7.054716008370152e-06, + "loss": 0.3497, + "step": 13248 + }, + { + "epoch": 0.38, + "grad_norm": 1.2601182804767321, + "learning_rate": 7.054287785181626e-06, + "loss": 0.341, + "step": 13249 + }, + { + "epoch": 0.38, + "grad_norm": 1.430841255580063, + "learning_rate": 7.053859543863988e-06, + "loss": 0.3593, + "step": 13250 + }, + { + "epoch": 0.38, + "grad_norm": 1.39822775725628, + "learning_rate": 7.053431284421015e-06, + "loss": 0.3297, + "step": 13251 + }, + { + "epoch": 0.38, + "grad_norm": 1.3493519305831003, + "learning_rate": 7.053003006856489e-06, + "loss": 0.3541, + "step": 13252 + }, + { + "epoch": 0.38, + "grad_norm": 1.4261655472125288, + "learning_rate": 7.05257471117419e-06, + "loss": 0.324, + "step": 13253 + }, + { + "epoch": 0.38, + "grad_norm": 1.2253545770121754, + "learning_rate": 7.052146397377895e-06, + "loss": 0.3259, + "step": 13254 + }, + { + "epoch": 0.38, + "grad_norm": 1.2164509601762725, + "learning_rate": 7.051718065471387e-06, + "loss": 0.3267, + "step": 13255 + }, + { + "epoch": 0.38, + "grad_norm": 1.2157480279775221, + "learning_rate": 7.051289715458443e-06, + "loss": 0.3488, + "step": 13256 + }, + { + "epoch": 0.38, + "grad_norm": 1.4836249735356368, + "learning_rate": 7.0508613473428436e-06, + "loss": 0.3459, + "step": 13257 + }, + { + "epoch": 0.38, + "grad_norm": 1.4728033897249275, + "learning_rate": 7.05043296112837e-06, + "loss": 0.3323, + "step": 13258 + }, + { + "epoch": 0.38, + "grad_norm": 1.8169205874104655, + "learning_rate": 7.050004556818805e-06, + "loss": 0.3266, + "step": 13259 + }, + { + "epoch": 0.38, + "grad_norm": 1.6865745641647072, + "learning_rate": 7.0495761344179255e-06, + "loss": 0.3246, + "step": 13260 + }, + { + "epoch": 0.38, + "grad_norm": 1.2552759276485683, + "learning_rate": 7.0491476939295146e-06, + "loss": 0.3469, + "step": 13261 + }, + { + "epoch": 0.38, + "grad_norm": 1.4240916361958567, + "learning_rate": 7.048719235357351e-06, + "loss": 0.3271, + "step": 13262 + }, + { + "epoch": 0.38, + "grad_norm": 1.6252731759338874, + "learning_rate": 7.04829075870522e-06, + "loss": 0.3332, + "step": 13263 + }, + { + "epoch": 0.38, + "grad_norm": 1.259257116280518, + "learning_rate": 7.047862263976898e-06, + "loss": 0.2944, + "step": 13264 + }, + { + "epoch": 0.38, + "grad_norm": 1.2391184934893749, + "learning_rate": 7.047433751176169e-06, + "loss": 0.349, + "step": 13265 + }, + { + "epoch": 0.38, + "grad_norm": 2.5534704875348826, + "learning_rate": 7.047005220306815e-06, + "loss": 0.3335, + "step": 13266 + }, + { + "epoch": 0.38, + "grad_norm": 1.2313070844915577, + "learning_rate": 7.046576671372618e-06, + "loss": 0.3264, + "step": 13267 + }, + { + "epoch": 0.38, + "grad_norm": 1.4204079114373653, + "learning_rate": 7.046148104377359e-06, + "loss": 0.364, + "step": 13268 + }, + { + "epoch": 0.38, + "grad_norm": 1.3188075264194312, + "learning_rate": 7.04571951932482e-06, + "loss": 0.361, + "step": 13269 + }, + { + "epoch": 0.38, + "grad_norm": 1.2382398283143157, + "learning_rate": 7.045290916218783e-06, + "loss": 0.3117, + "step": 13270 + }, + { + "epoch": 0.38, + "grad_norm": 1.3789673457977636, + "learning_rate": 7.0448622950630305e-06, + "loss": 0.325, + "step": 13271 + }, + { + "epoch": 0.38, + "grad_norm": 1.3088060372221095, + "learning_rate": 7.044433655861347e-06, + "loss": 0.3378, + "step": 13272 + }, + { + "epoch": 0.38, + "grad_norm": 1.4382232534690422, + "learning_rate": 7.044004998617513e-06, + "loss": 0.3421, + "step": 13273 + }, + { + "epoch": 0.39, + "grad_norm": 2.779155313330528, + "learning_rate": 7.043576323335313e-06, + "loss": 0.3515, + "step": 13274 + }, + { + "epoch": 0.39, + "grad_norm": 1.3743418581222338, + "learning_rate": 7.043147630018528e-06, + "loss": 0.3249, + "step": 13275 + }, + { + "epoch": 0.39, + "grad_norm": 1.242403104782702, + "learning_rate": 7.042718918670943e-06, + "loss": 0.3319, + "step": 13276 + }, + { + "epoch": 0.39, + "grad_norm": 1.587353996897315, + "learning_rate": 7.0422901892963415e-06, + "loss": 0.3361, + "step": 13277 + }, + { + "epoch": 0.39, + "grad_norm": 1.2689466941477197, + "learning_rate": 7.041861441898505e-06, + "loss": 0.3057, + "step": 13278 + }, + { + "epoch": 0.39, + "grad_norm": 1.5348791885062336, + "learning_rate": 7.04143267648122e-06, + "loss": 0.3223, + "step": 13279 + }, + { + "epoch": 0.39, + "grad_norm": 1.465212352640685, + "learning_rate": 7.041003893048269e-06, + "loss": 0.332, + "step": 13280 + }, + { + "epoch": 0.39, + "grad_norm": 1.4625395616673116, + "learning_rate": 7.040575091603435e-06, + "loss": 0.373, + "step": 13281 + }, + { + "epoch": 0.39, + "grad_norm": 2.7019691720931585, + "learning_rate": 7.040146272150506e-06, + "loss": 0.323, + "step": 13282 + }, + { + "epoch": 0.39, + "grad_norm": 2.033478712184193, + "learning_rate": 7.039717434693261e-06, + "loss": 0.323, + "step": 13283 + }, + { + "epoch": 0.39, + "grad_norm": 1.2334292411755607, + "learning_rate": 7.039288579235486e-06, + "loss": 0.3165, + "step": 13284 + }, + { + "epoch": 0.39, + "grad_norm": 1.280458976921794, + "learning_rate": 7.038859705780967e-06, + "loss": 0.3248, + "step": 13285 + }, + { + "epoch": 0.39, + "grad_norm": 1.238610880043959, + "learning_rate": 7.038430814333491e-06, + "loss": 0.327, + "step": 13286 + }, + { + "epoch": 0.39, + "grad_norm": 1.332204576139778, + "learning_rate": 7.038001904896839e-06, + "loss": 0.34, + "step": 13287 + }, + { + "epoch": 0.39, + "grad_norm": 1.3175045625277293, + "learning_rate": 7.0375729774747966e-06, + "loss": 0.3218, + "step": 13288 + }, + { + "epoch": 0.39, + "grad_norm": 1.3609926496328257, + "learning_rate": 7.03714403207115e-06, + "loss": 0.32, + "step": 13289 + }, + { + "epoch": 0.39, + "grad_norm": 1.2123221237742945, + "learning_rate": 7.0367150686896855e-06, + "loss": 0.3558, + "step": 13290 + }, + { + "epoch": 0.39, + "grad_norm": 1.196938252408046, + "learning_rate": 7.03628608733419e-06, + "loss": 0.325, + "step": 13291 + }, + { + "epoch": 0.39, + "grad_norm": 1.302836837615347, + "learning_rate": 7.035857088008444e-06, + "loss": 0.322, + "step": 13292 + }, + { + "epoch": 0.39, + "grad_norm": 1.3556350083860556, + "learning_rate": 7.035428070716238e-06, + "loss": 0.3417, + "step": 13293 + }, + { + "epoch": 0.39, + "grad_norm": 1.2130819714271206, + "learning_rate": 7.034999035461356e-06, + "loss": 0.3429, + "step": 13294 + }, + { + "epoch": 0.39, + "grad_norm": 1.3638469164403577, + "learning_rate": 7.034569982247587e-06, + "loss": 0.354, + "step": 13295 + }, + { + "epoch": 0.39, + "grad_norm": 1.2797837572138502, + "learning_rate": 7.034140911078714e-06, + "loss": 0.3248, + "step": 13296 + }, + { + "epoch": 0.39, + "grad_norm": 1.4099654698186697, + "learning_rate": 7.033711821958524e-06, + "loss": 0.3306, + "step": 13297 + }, + { + "epoch": 0.39, + "grad_norm": 1.3311086292269896, + "learning_rate": 7.033282714890806e-06, + "loss": 0.3514, + "step": 13298 + }, + { + "epoch": 0.39, + "grad_norm": 1.4210354698282877, + "learning_rate": 7.032853589879344e-06, + "loss": 0.3243, + "step": 13299 + }, + { + "epoch": 0.39, + "grad_norm": 1.4137114015026508, + "learning_rate": 7.032424446927928e-06, + "loss": 0.3686, + "step": 13300 + }, + { + "epoch": 0.39, + "grad_norm": 1.2040528741335148, + "learning_rate": 7.031995286040343e-06, + "loss": 0.3164, + "step": 13301 + }, + { + "epoch": 0.39, + "grad_norm": 1.26484723889005, + "learning_rate": 7.0315661072203765e-06, + "loss": 0.3283, + "step": 13302 + }, + { + "epoch": 0.39, + "grad_norm": 1.542799069610494, + "learning_rate": 7.0311369104718174e-06, + "loss": 0.3307, + "step": 13303 + }, + { + "epoch": 0.39, + "grad_norm": 1.2351949727102756, + "learning_rate": 7.030707695798453e-06, + "loss": 0.3329, + "step": 13304 + }, + { + "epoch": 0.39, + "grad_norm": 1.3967950713769102, + "learning_rate": 7.03027846320407e-06, + "loss": 0.3412, + "step": 13305 + }, + { + "epoch": 0.39, + "grad_norm": 1.1736125846054302, + "learning_rate": 7.0298492126924564e-06, + "loss": 0.3111, + "step": 13306 + }, + { + "epoch": 0.39, + "grad_norm": 1.3272035168254752, + "learning_rate": 7.029419944267402e-06, + "loss": 0.3312, + "step": 13307 + }, + { + "epoch": 0.39, + "grad_norm": 1.7299398992513109, + "learning_rate": 7.028990657932693e-06, + "loss": 0.3278, + "step": 13308 + }, + { + "epoch": 0.39, + "grad_norm": 1.650075123041122, + "learning_rate": 7.028561353692121e-06, + "loss": 0.3434, + "step": 13309 + }, + { + "epoch": 0.39, + "grad_norm": 1.3699351022286546, + "learning_rate": 7.028132031549471e-06, + "loss": 0.3618, + "step": 13310 + }, + { + "epoch": 0.39, + "grad_norm": 1.2224939613872792, + "learning_rate": 7.027702691508533e-06, + "loss": 0.331, + "step": 13311 + }, + { + "epoch": 0.39, + "grad_norm": 1.2678499407847286, + "learning_rate": 7.027273333573097e-06, + "loss": 0.3302, + "step": 13312 + }, + { + "epoch": 0.39, + "grad_norm": 1.2840453456196408, + "learning_rate": 7.0268439577469495e-06, + "loss": 0.3498, + "step": 13313 + }, + { + "epoch": 0.39, + "grad_norm": 1.2980728853098704, + "learning_rate": 7.026414564033884e-06, + "loss": 0.3687, + "step": 13314 + }, + { + "epoch": 0.39, + "grad_norm": 1.9331706847756358, + "learning_rate": 7.025985152437686e-06, + "loss": 0.3671, + "step": 13315 + }, + { + "epoch": 0.39, + "grad_norm": 1.23891526557497, + "learning_rate": 7.0255557229621475e-06, + "loss": 0.3013, + "step": 13316 + }, + { + "epoch": 0.39, + "grad_norm": 1.2098255283254598, + "learning_rate": 7.025126275611058e-06, + "loss": 0.3249, + "step": 13317 + }, + { + "epoch": 0.39, + "grad_norm": 1.2352304538288381, + "learning_rate": 7.024696810388205e-06, + "loss": 0.3136, + "step": 13318 + }, + { + "epoch": 0.39, + "grad_norm": 2.4707525886906634, + "learning_rate": 7.02426732729738e-06, + "loss": 0.3138, + "step": 13319 + }, + { + "epoch": 0.39, + "grad_norm": 0.9291402546559508, + "learning_rate": 7.023837826342374e-06, + "loss": 0.5893, + "step": 13320 + }, + { + "epoch": 0.39, + "grad_norm": 1.2964777744573543, + "learning_rate": 7.0234083075269774e-06, + "loss": 0.3436, + "step": 13321 + }, + { + "epoch": 0.39, + "grad_norm": 1.3401319213861138, + "learning_rate": 7.02297877085498e-06, + "loss": 0.3688, + "step": 13322 + }, + { + "epoch": 0.39, + "grad_norm": 1.3984412656433651, + "learning_rate": 7.022549216330173e-06, + "loss": 0.3375, + "step": 13323 + }, + { + "epoch": 0.39, + "grad_norm": 1.6038081273391298, + "learning_rate": 7.022119643956345e-06, + "loss": 0.3069, + "step": 13324 + }, + { + "epoch": 0.39, + "grad_norm": 1.2808049504669325, + "learning_rate": 7.02169005373729e-06, + "loss": 0.3391, + "step": 13325 + }, + { + "epoch": 0.39, + "grad_norm": 1.2761196354792523, + "learning_rate": 7.021260445676797e-06, + "loss": 0.3442, + "step": 13326 + }, + { + "epoch": 0.39, + "grad_norm": 0.9309841551885736, + "learning_rate": 7.020830819778659e-06, + "loss": 0.6066, + "step": 13327 + }, + { + "epoch": 0.39, + "grad_norm": 1.2309292527375553, + "learning_rate": 7.020401176046666e-06, + "loss": 0.3288, + "step": 13328 + }, + { + "epoch": 0.39, + "grad_norm": 2.0740523836956766, + "learning_rate": 7.01997151448461e-06, + "loss": 0.3545, + "step": 13329 + }, + { + "epoch": 0.39, + "grad_norm": 1.4105488624276088, + "learning_rate": 7.019541835096283e-06, + "loss": 0.3407, + "step": 13330 + }, + { + "epoch": 0.39, + "grad_norm": 1.2698483188551417, + "learning_rate": 7.019112137885476e-06, + "loss": 0.313, + "step": 13331 + }, + { + "epoch": 0.39, + "grad_norm": 1.8677797063901014, + "learning_rate": 7.018682422855985e-06, + "loss": 0.3156, + "step": 13332 + }, + { + "epoch": 0.39, + "grad_norm": 1.3705180889335735, + "learning_rate": 7.018252690011596e-06, + "loss": 0.341, + "step": 13333 + }, + { + "epoch": 0.39, + "grad_norm": 1.2392578029168753, + "learning_rate": 7.017822939356107e-06, + "loss": 0.3055, + "step": 13334 + }, + { + "epoch": 0.39, + "grad_norm": 1.3153096896805858, + "learning_rate": 7.017393170893307e-06, + "loss": 0.3104, + "step": 13335 + }, + { + "epoch": 0.39, + "grad_norm": 1.605165592281588, + "learning_rate": 7.01696338462699e-06, + "loss": 0.3381, + "step": 13336 + }, + { + "epoch": 0.39, + "grad_norm": 1.2431414555127218, + "learning_rate": 7.01653358056095e-06, + "loss": 0.3251, + "step": 13337 + }, + { + "epoch": 0.39, + "grad_norm": 1.300601356136445, + "learning_rate": 7.0161037586989774e-06, + "loss": 0.3279, + "step": 13338 + }, + { + "epoch": 0.39, + "grad_norm": 1.3735584880968428, + "learning_rate": 7.015673919044866e-06, + "loss": 0.3328, + "step": 13339 + }, + { + "epoch": 0.39, + "grad_norm": 1.337996041301851, + "learning_rate": 7.0152440616024105e-06, + "loss": 0.3509, + "step": 13340 + }, + { + "epoch": 0.39, + "grad_norm": 1.724966287117529, + "learning_rate": 7.014814186375406e-06, + "loss": 0.3197, + "step": 13341 + }, + { + "epoch": 0.39, + "grad_norm": 2.043307993671407, + "learning_rate": 7.014384293367642e-06, + "loss": 0.3624, + "step": 13342 + }, + { + "epoch": 0.39, + "grad_norm": 1.342191096256237, + "learning_rate": 7.013954382582914e-06, + "loss": 0.3228, + "step": 13343 + }, + { + "epoch": 0.39, + "grad_norm": 1.2403373062316148, + "learning_rate": 7.013524454025016e-06, + "loss": 0.3219, + "step": 13344 + }, + { + "epoch": 0.39, + "grad_norm": 1.4790012980453748, + "learning_rate": 7.013094507697744e-06, + "loss": 0.3357, + "step": 13345 + }, + { + "epoch": 0.39, + "grad_norm": 1.7655032632867027, + "learning_rate": 7.012664543604889e-06, + "loss": 0.3356, + "step": 13346 + }, + { + "epoch": 0.39, + "grad_norm": 1.718778143871382, + "learning_rate": 7.0122345617502474e-06, + "loss": 0.3526, + "step": 13347 + }, + { + "epoch": 0.39, + "grad_norm": 1.6650700866435333, + "learning_rate": 7.011804562137614e-06, + "loss": 0.309, + "step": 13348 + }, + { + "epoch": 0.39, + "grad_norm": 1.371106940966489, + "learning_rate": 7.011374544770782e-06, + "loss": 0.3388, + "step": 13349 + }, + { + "epoch": 0.39, + "grad_norm": 1.4647835763138632, + "learning_rate": 7.010944509653549e-06, + "loss": 0.3345, + "step": 13350 + }, + { + "epoch": 0.39, + "grad_norm": 1.2992604287257499, + "learning_rate": 7.010514456789707e-06, + "loss": 0.3141, + "step": 13351 + }, + { + "epoch": 0.39, + "grad_norm": 1.3822653641534846, + "learning_rate": 7.010084386183054e-06, + "loss": 0.3237, + "step": 13352 + }, + { + "epoch": 0.39, + "grad_norm": 1.3148807124153794, + "learning_rate": 7.009654297837382e-06, + "loss": 0.3377, + "step": 13353 + }, + { + "epoch": 0.39, + "grad_norm": 1.3539439724081084, + "learning_rate": 7.009224191756489e-06, + "loss": 0.3257, + "step": 13354 + }, + { + "epoch": 0.39, + "grad_norm": 1.266966620865151, + "learning_rate": 7.0087940679441735e-06, + "loss": 0.34, + "step": 13355 + }, + { + "epoch": 0.39, + "grad_norm": 1.3364626032338152, + "learning_rate": 7.008363926404225e-06, + "loss": 0.3292, + "step": 13356 + }, + { + "epoch": 0.39, + "grad_norm": 1.4209466840753302, + "learning_rate": 7.007933767140443e-06, + "loss": 0.3448, + "step": 13357 + }, + { + "epoch": 0.39, + "grad_norm": 1.3090845096312704, + "learning_rate": 7.007503590156622e-06, + "loss": 0.3279, + "step": 13358 + }, + { + "epoch": 0.39, + "grad_norm": 1.352329528145009, + "learning_rate": 7.007073395456562e-06, + "loss": 0.3535, + "step": 13359 + }, + { + "epoch": 0.39, + "grad_norm": 1.4850969070165194, + "learning_rate": 7.0066431830440555e-06, + "loss": 0.3255, + "step": 13360 + }, + { + "epoch": 0.39, + "grad_norm": 1.2575713322290962, + "learning_rate": 7.0062129529229e-06, + "loss": 0.3243, + "step": 13361 + }, + { + "epoch": 0.39, + "grad_norm": 1.3350240767450656, + "learning_rate": 7.005782705096894e-06, + "loss": 0.3328, + "step": 13362 + }, + { + "epoch": 0.39, + "grad_norm": 2.0551379514741375, + "learning_rate": 7.0053524395698345e-06, + "loss": 0.307, + "step": 13363 + }, + { + "epoch": 0.39, + "grad_norm": 1.2528619861201469, + "learning_rate": 7.004922156345515e-06, + "loss": 0.3379, + "step": 13364 + }, + { + "epoch": 0.39, + "grad_norm": 1.7803973788161787, + "learning_rate": 7.0044918554277375e-06, + "loss": 0.3478, + "step": 13365 + }, + { + "epoch": 0.39, + "grad_norm": 1.502189339058045, + "learning_rate": 7.004061536820296e-06, + "loss": 0.3381, + "step": 13366 + }, + { + "epoch": 0.39, + "grad_norm": 1.3883891522732046, + "learning_rate": 7.003631200526989e-06, + "loss": 0.3473, + "step": 13367 + }, + { + "epoch": 0.39, + "grad_norm": 1.5206808949655448, + "learning_rate": 7.003200846551614e-06, + "loss": 0.3436, + "step": 13368 + }, + { + "epoch": 0.39, + "grad_norm": 1.4294540861202512, + "learning_rate": 7.0027704748979705e-06, + "loss": 0.3218, + "step": 13369 + }, + { + "epoch": 0.39, + "grad_norm": 1.4999062551127482, + "learning_rate": 7.002340085569855e-06, + "loss": 0.3166, + "step": 13370 + }, + { + "epoch": 0.39, + "grad_norm": 1.3620833345435182, + "learning_rate": 7.001909678571066e-06, + "loss": 0.3169, + "step": 13371 + }, + { + "epoch": 0.39, + "grad_norm": 1.1570743071244798, + "learning_rate": 7.001479253905401e-06, + "loss": 0.3068, + "step": 13372 + }, + { + "epoch": 0.39, + "grad_norm": 1.5750685052249496, + "learning_rate": 7.001048811576661e-06, + "loss": 0.3284, + "step": 13373 + }, + { + "epoch": 0.39, + "grad_norm": 1.355018350046697, + "learning_rate": 7.000618351588643e-06, + "loss": 0.3489, + "step": 13374 + }, + { + "epoch": 0.39, + "grad_norm": 1.2935272386062533, + "learning_rate": 7.000187873945145e-06, + "loss": 0.3007, + "step": 13375 + }, + { + "epoch": 0.39, + "grad_norm": 1.3183835057916422, + "learning_rate": 6.999757378649966e-06, + "loss": 0.3547, + "step": 13376 + }, + { + "epoch": 0.39, + "grad_norm": 1.4477728153684644, + "learning_rate": 6.999326865706908e-06, + "loss": 0.3449, + "step": 13377 + }, + { + "epoch": 0.39, + "grad_norm": 1.3298990289153987, + "learning_rate": 6.998896335119766e-06, + "loss": 0.3233, + "step": 13378 + }, + { + "epoch": 0.39, + "grad_norm": 1.4195165858930974, + "learning_rate": 6.9984657868923435e-06, + "loss": 0.3247, + "step": 13379 + }, + { + "epoch": 0.39, + "grad_norm": 1.3740865122908907, + "learning_rate": 6.998035221028438e-06, + "loss": 0.3327, + "step": 13380 + }, + { + "epoch": 0.39, + "grad_norm": 1.2032004739260314, + "learning_rate": 6.997604637531848e-06, + "loss": 0.3245, + "step": 13381 + }, + { + "epoch": 0.39, + "grad_norm": 1.4244569870871915, + "learning_rate": 6.997174036406377e-06, + "loss": 0.3694, + "step": 13382 + }, + { + "epoch": 0.39, + "grad_norm": 1.351641843331726, + "learning_rate": 6.996743417655822e-06, + "loss": 0.3411, + "step": 13383 + }, + { + "epoch": 0.39, + "grad_norm": 1.4091022114659224, + "learning_rate": 6.996312781283983e-06, + "loss": 0.3505, + "step": 13384 + }, + { + "epoch": 0.39, + "grad_norm": 1.3911839990920052, + "learning_rate": 6.995882127294662e-06, + "loss": 0.3423, + "step": 13385 + }, + { + "epoch": 0.39, + "grad_norm": 1.4673822352478343, + "learning_rate": 6.99545145569166e-06, + "loss": 0.3341, + "step": 13386 + }, + { + "epoch": 0.39, + "grad_norm": 1.841445769157124, + "learning_rate": 6.995020766478777e-06, + "loss": 0.3474, + "step": 13387 + }, + { + "epoch": 0.39, + "grad_norm": 1.3317319660269926, + "learning_rate": 6.994590059659814e-06, + "loss": 0.3236, + "step": 13388 + }, + { + "epoch": 0.39, + "grad_norm": 1.1647443956432773, + "learning_rate": 6.99415933523857e-06, + "loss": 0.3256, + "step": 13389 + }, + { + "epoch": 0.39, + "grad_norm": 1.304497653096875, + "learning_rate": 6.993728593218848e-06, + "loss": 0.3493, + "step": 13390 + }, + { + "epoch": 0.39, + "grad_norm": 1.4314134608597822, + "learning_rate": 6.99329783360445e-06, + "loss": 0.3179, + "step": 13391 + }, + { + "epoch": 0.39, + "grad_norm": 1.3913786078223305, + "learning_rate": 6.992867056399175e-06, + "loss": 0.3154, + "step": 13392 + }, + { + "epoch": 0.39, + "grad_norm": 1.7674278731551698, + "learning_rate": 6.992436261606827e-06, + "loss": 0.3273, + "step": 13393 + }, + { + "epoch": 0.39, + "grad_norm": 1.327080230455568, + "learning_rate": 6.9920054492312086e-06, + "loss": 0.324, + "step": 13394 + }, + { + "epoch": 0.39, + "grad_norm": 1.2286374904194854, + "learning_rate": 6.991574619276118e-06, + "loss": 0.3054, + "step": 13395 + }, + { + "epoch": 0.39, + "grad_norm": 3.807561672799075, + "learning_rate": 6.99114377174536e-06, + "loss": 0.324, + "step": 13396 + }, + { + "epoch": 0.39, + "grad_norm": 1.3319564021372021, + "learning_rate": 6.990712906642736e-06, + "loss": 0.3368, + "step": 13397 + }, + { + "epoch": 0.39, + "grad_norm": 1.226496604015785, + "learning_rate": 6.990282023972048e-06, + "loss": 0.3336, + "step": 13398 + }, + { + "epoch": 0.39, + "grad_norm": 1.31757955636774, + "learning_rate": 6.9898511237370995e-06, + "loss": 0.3213, + "step": 13399 + }, + { + "epoch": 0.39, + "grad_norm": 1.359823512280184, + "learning_rate": 6.989420205941693e-06, + "loss": 0.355, + "step": 13400 + }, + { + "epoch": 0.39, + "grad_norm": 1.3457582513289448, + "learning_rate": 6.98898927058963e-06, + "loss": 0.3207, + "step": 13401 + }, + { + "epoch": 0.39, + "grad_norm": 1.3766883970743216, + "learning_rate": 6.988558317684715e-06, + "loss": 0.3475, + "step": 13402 + }, + { + "epoch": 0.39, + "grad_norm": 1.2245661913086372, + "learning_rate": 6.988127347230751e-06, + "loss": 0.33, + "step": 13403 + }, + { + "epoch": 0.39, + "grad_norm": 2.1583600816520225, + "learning_rate": 6.987696359231542e-06, + "loss": 0.3167, + "step": 13404 + }, + { + "epoch": 0.39, + "grad_norm": 1.4850013931902075, + "learning_rate": 6.987265353690891e-06, + "loss": 0.3224, + "step": 13405 + }, + { + "epoch": 0.39, + "grad_norm": 1.351461509628944, + "learning_rate": 6.9868343306126e-06, + "loss": 0.3403, + "step": 13406 + }, + { + "epoch": 0.39, + "grad_norm": 1.484035964799602, + "learning_rate": 6.986403290000473e-06, + "loss": 0.3421, + "step": 13407 + }, + { + "epoch": 0.39, + "grad_norm": 1.4551838252432943, + "learning_rate": 6.9859722318583166e-06, + "loss": 0.3129, + "step": 13408 + }, + { + "epoch": 0.39, + "grad_norm": 1.3584735884198833, + "learning_rate": 6.985541156189932e-06, + "loss": 0.3288, + "step": 13409 + }, + { + "epoch": 0.39, + "grad_norm": 1.9795845471122315, + "learning_rate": 6.985110062999127e-06, + "loss": 0.3613, + "step": 13410 + }, + { + "epoch": 0.39, + "grad_norm": 1.1824845166003533, + "learning_rate": 6.984678952289701e-06, + "loss": 0.3073, + "step": 13411 + }, + { + "epoch": 0.39, + "grad_norm": 1.5374096677693336, + "learning_rate": 6.984247824065462e-06, + "loss": 0.3209, + "step": 13412 + }, + { + "epoch": 0.39, + "grad_norm": 1.4995124385529797, + "learning_rate": 6.983816678330214e-06, + "loss": 0.3679, + "step": 13413 + }, + { + "epoch": 0.39, + "grad_norm": 1.6477285468701401, + "learning_rate": 6.983385515087763e-06, + "loss": 0.3356, + "step": 13414 + }, + { + "epoch": 0.39, + "grad_norm": 1.3550842795214206, + "learning_rate": 6.982954334341911e-06, + "loss": 0.3254, + "step": 13415 + }, + { + "epoch": 0.39, + "grad_norm": 1.4168591962654244, + "learning_rate": 6.982523136096465e-06, + "loss": 0.3183, + "step": 13416 + }, + { + "epoch": 0.39, + "grad_norm": 1.2693387165261198, + "learning_rate": 6.982091920355232e-06, + "loss": 0.3358, + "step": 13417 + }, + { + "epoch": 0.39, + "grad_norm": 1.4055037602737621, + "learning_rate": 6.981660687122015e-06, + "loss": 0.3387, + "step": 13418 + }, + { + "epoch": 0.39, + "grad_norm": 1.4176724408109513, + "learning_rate": 6.981229436400621e-06, + "loss": 0.3186, + "step": 13419 + }, + { + "epoch": 0.39, + "grad_norm": 1.2285040075366243, + "learning_rate": 6.9807981681948534e-06, + "loss": 0.3258, + "step": 13420 + }, + { + "epoch": 0.39, + "grad_norm": 1.375353850780877, + "learning_rate": 6.9803668825085205e-06, + "loss": 0.3193, + "step": 13421 + }, + { + "epoch": 0.39, + "grad_norm": 1.3555919280254771, + "learning_rate": 6.97993557934543e-06, + "loss": 0.314, + "step": 13422 + }, + { + "epoch": 0.39, + "grad_norm": 1.346401866518799, + "learning_rate": 6.979504258709384e-06, + "loss": 0.3292, + "step": 13423 + }, + { + "epoch": 0.39, + "grad_norm": 2.293453457555433, + "learning_rate": 6.979072920604193e-06, + "loss": 0.3665, + "step": 13424 + }, + { + "epoch": 0.39, + "grad_norm": 1.3824998852426171, + "learning_rate": 6.978641565033659e-06, + "loss": 0.3432, + "step": 13425 + }, + { + "epoch": 0.39, + "grad_norm": 1.2634847452614737, + "learning_rate": 6.9782101920015925e-06, + "loss": 0.3084, + "step": 13426 + }, + { + "epoch": 0.39, + "grad_norm": 1.3322720161730277, + "learning_rate": 6.9777788015117985e-06, + "loss": 0.3201, + "step": 13427 + }, + { + "epoch": 0.39, + "grad_norm": 1.2758758200326292, + "learning_rate": 6.977347393568086e-06, + "loss": 0.328, + "step": 13428 + }, + { + "epoch": 0.39, + "grad_norm": 1.4031214622314991, + "learning_rate": 6.97691596817426e-06, + "loss": 0.3527, + "step": 13429 + }, + { + "epoch": 0.39, + "grad_norm": 1.4583042801875805, + "learning_rate": 6.976484525334129e-06, + "loss": 0.327, + "step": 13430 + }, + { + "epoch": 0.39, + "grad_norm": 1.5990920009632015, + "learning_rate": 6.976053065051498e-06, + "loss": 0.326, + "step": 13431 + }, + { + "epoch": 0.39, + "grad_norm": 1.385672463524331, + "learning_rate": 6.975621587330179e-06, + "loss": 0.3503, + "step": 13432 + }, + { + "epoch": 0.39, + "grad_norm": 1.3128223157062084, + "learning_rate": 6.975190092173978e-06, + "loss": 0.3057, + "step": 13433 + }, + { + "epoch": 0.39, + "grad_norm": 1.3473621577277668, + "learning_rate": 6.9747585795867e-06, + "loss": 0.3625, + "step": 13434 + }, + { + "epoch": 0.39, + "grad_norm": 1.50613600434843, + "learning_rate": 6.9743270495721585e-06, + "loss": 0.3533, + "step": 13435 + }, + { + "epoch": 0.39, + "grad_norm": 1.3764173715392591, + "learning_rate": 6.973895502134158e-06, + "loss": 0.3321, + "step": 13436 + }, + { + "epoch": 0.39, + "grad_norm": 1.3973815267131338, + "learning_rate": 6.973463937276508e-06, + "loss": 0.3496, + "step": 13437 + }, + { + "epoch": 0.39, + "grad_norm": 1.2381522516016281, + "learning_rate": 6.973032355003015e-06, + "loss": 0.3446, + "step": 13438 + }, + { + "epoch": 0.39, + "grad_norm": 1.4178442774289097, + "learning_rate": 6.972600755317491e-06, + "loss": 0.3446, + "step": 13439 + }, + { + "epoch": 0.39, + "grad_norm": 1.4037659567144785, + "learning_rate": 6.972169138223743e-06, + "loss": 0.3328, + "step": 13440 + }, + { + "epoch": 0.39, + "grad_norm": 1.8288594222317878, + "learning_rate": 6.971737503725581e-06, + "loss": 0.3259, + "step": 13441 + }, + { + "epoch": 0.39, + "grad_norm": 1.4047669884441831, + "learning_rate": 6.971305851826814e-06, + "loss": 0.3252, + "step": 13442 + }, + { + "epoch": 0.39, + "grad_norm": 1.2804794402600657, + "learning_rate": 6.97087418253125e-06, + "loss": 0.3296, + "step": 13443 + }, + { + "epoch": 0.39, + "grad_norm": 1.834842858907147, + "learning_rate": 6.970442495842699e-06, + "loss": 0.3799, + "step": 13444 + }, + { + "epoch": 0.39, + "grad_norm": 1.1750887267332066, + "learning_rate": 6.9700107917649724e-06, + "loss": 0.3232, + "step": 13445 + }, + { + "epoch": 0.39, + "grad_norm": 1.348493313300619, + "learning_rate": 6.96957907030188e-06, + "loss": 0.3604, + "step": 13446 + }, + { + "epoch": 0.39, + "grad_norm": 1.1777948177371724, + "learning_rate": 6.969147331457228e-06, + "loss": 0.3046, + "step": 13447 + }, + { + "epoch": 0.39, + "grad_norm": 1.2695738014710698, + "learning_rate": 6.968715575234829e-06, + "loss": 0.3231, + "step": 13448 + }, + { + "epoch": 0.39, + "grad_norm": 1.2634429240308773, + "learning_rate": 6.968283801638493e-06, + "loss": 0.3296, + "step": 13449 + }, + { + "epoch": 0.39, + "grad_norm": 1.3746128110898699, + "learning_rate": 6.9678520106720326e-06, + "loss": 0.3203, + "step": 13450 + }, + { + "epoch": 0.39, + "grad_norm": 1.7007803717958023, + "learning_rate": 6.967420202339257e-06, + "loss": 0.3241, + "step": 13451 + }, + { + "epoch": 0.39, + "grad_norm": 1.2343799726449034, + "learning_rate": 6.9669883766439735e-06, + "loss": 0.3607, + "step": 13452 + }, + { + "epoch": 0.39, + "grad_norm": 1.4117801327397068, + "learning_rate": 6.966556533589997e-06, + "loss": 0.3249, + "step": 13453 + }, + { + "epoch": 0.39, + "grad_norm": 1.2816465754741229, + "learning_rate": 6.966124673181137e-06, + "loss": 0.3252, + "step": 13454 + }, + { + "epoch": 0.39, + "grad_norm": 1.252884311054459, + "learning_rate": 6.965692795421206e-06, + "loss": 0.3252, + "step": 13455 + }, + { + "epoch": 0.39, + "grad_norm": 1.4316202341190027, + "learning_rate": 6.965260900314013e-06, + "loss": 0.3081, + "step": 13456 + }, + { + "epoch": 0.39, + "grad_norm": 1.2170705627607068, + "learning_rate": 6.9648289878633705e-06, + "loss": 0.3245, + "step": 13457 + }, + { + "epoch": 0.39, + "grad_norm": 1.202936033676801, + "learning_rate": 6.964397058073091e-06, + "loss": 0.3261, + "step": 13458 + }, + { + "epoch": 0.39, + "grad_norm": 1.1877442982624573, + "learning_rate": 6.963965110946985e-06, + "loss": 0.3244, + "step": 13459 + }, + { + "epoch": 0.39, + "grad_norm": 1.3794677314975528, + "learning_rate": 6.963533146488867e-06, + "loss": 0.3454, + "step": 13460 + }, + { + "epoch": 0.39, + "grad_norm": 1.2908149529722526, + "learning_rate": 6.963101164702546e-06, + "loss": 0.3149, + "step": 13461 + }, + { + "epoch": 0.39, + "grad_norm": 1.3222942628454701, + "learning_rate": 6.962669165591835e-06, + "loss": 0.3411, + "step": 13462 + }, + { + "epoch": 0.39, + "grad_norm": 1.3331029513352344, + "learning_rate": 6.962237149160547e-06, + "loss": 0.3495, + "step": 13463 + }, + { + "epoch": 0.39, + "grad_norm": 1.217285810323708, + "learning_rate": 6.961805115412496e-06, + "loss": 0.3465, + "step": 13464 + }, + { + "epoch": 0.39, + "grad_norm": 1.349795133727918, + "learning_rate": 6.961373064351491e-06, + "loss": 0.3251, + "step": 13465 + }, + { + "epoch": 0.39, + "grad_norm": 1.3502964618638809, + "learning_rate": 6.960940995981348e-06, + "loss": 0.351, + "step": 13466 + }, + { + "epoch": 0.39, + "grad_norm": 1.4015192960437453, + "learning_rate": 6.960508910305879e-06, + "loss": 0.3503, + "step": 13467 + }, + { + "epoch": 0.39, + "grad_norm": 1.4867110710700415, + "learning_rate": 6.960076807328896e-06, + "loss": 0.3589, + "step": 13468 + }, + { + "epoch": 0.39, + "grad_norm": 1.2838342249638615, + "learning_rate": 6.959644687054215e-06, + "loss": 0.3112, + "step": 13469 + }, + { + "epoch": 0.39, + "grad_norm": 1.4601423464052172, + "learning_rate": 6.959212549485646e-06, + "loss": 0.2944, + "step": 13470 + }, + { + "epoch": 0.39, + "grad_norm": 1.4838050785148127, + "learning_rate": 6.958780394627006e-06, + "loss": 0.3326, + "step": 13471 + }, + { + "epoch": 0.39, + "grad_norm": 1.2198856852688011, + "learning_rate": 6.958348222482106e-06, + "loss": 0.3054, + "step": 13472 + }, + { + "epoch": 0.39, + "grad_norm": 1.2072265452982904, + "learning_rate": 6.957916033054763e-06, + "loss": 0.3267, + "step": 13473 + }, + { + "epoch": 0.39, + "grad_norm": 1.2573799534659351, + "learning_rate": 6.957483826348789e-06, + "loss": 0.334, + "step": 13474 + }, + { + "epoch": 0.39, + "grad_norm": 1.2704295469562183, + "learning_rate": 6.957051602367997e-06, + "loss": 0.3248, + "step": 13475 + }, + { + "epoch": 0.39, + "grad_norm": 1.6106621851517051, + "learning_rate": 6.9566193611162034e-06, + "loss": 0.3325, + "step": 13476 + }, + { + "epoch": 0.39, + "grad_norm": 1.5060715317560842, + "learning_rate": 6.956187102597223e-06, + "loss": 0.3881, + "step": 13477 + }, + { + "epoch": 0.39, + "grad_norm": 1.2667743523775394, + "learning_rate": 6.955754826814871e-06, + "loss": 0.3311, + "step": 13478 + }, + { + "epoch": 0.39, + "grad_norm": 1.3953348093920477, + "learning_rate": 6.955322533772959e-06, + "loss": 0.3354, + "step": 13479 + }, + { + "epoch": 0.39, + "grad_norm": 1.2578818170419714, + "learning_rate": 6.954890223475303e-06, + "loss": 0.3312, + "step": 13480 + }, + { + "epoch": 0.39, + "grad_norm": 1.1745182173282969, + "learning_rate": 6.954457895925722e-06, + "loss": 0.3218, + "step": 13481 + }, + { + "epoch": 0.39, + "grad_norm": 1.4350038797691838, + "learning_rate": 6.9540255511280255e-06, + "loss": 0.3333, + "step": 13482 + }, + { + "epoch": 0.39, + "grad_norm": 1.3292993030453912, + "learning_rate": 6.953593189086034e-06, + "loss": 0.3258, + "step": 13483 + }, + { + "epoch": 0.39, + "grad_norm": 1.3115174413050148, + "learning_rate": 6.95316080980356e-06, + "loss": 0.3229, + "step": 13484 + }, + { + "epoch": 0.39, + "grad_norm": 1.2928114854718082, + "learning_rate": 6.95272841328442e-06, + "loss": 0.3521, + "step": 13485 + }, + { + "epoch": 0.39, + "grad_norm": 1.298700847971467, + "learning_rate": 6.95229599953243e-06, + "loss": 0.3243, + "step": 13486 + }, + { + "epoch": 0.39, + "grad_norm": 1.470043597925442, + "learning_rate": 6.951863568551407e-06, + "loss": 0.3514, + "step": 13487 + }, + { + "epoch": 0.39, + "grad_norm": 1.3252977065148142, + "learning_rate": 6.951431120345165e-06, + "loss": 0.3431, + "step": 13488 + }, + { + "epoch": 0.39, + "grad_norm": 1.2683101464644368, + "learning_rate": 6.950998654917523e-06, + "loss": 0.323, + "step": 13489 + }, + { + "epoch": 0.39, + "grad_norm": 1.264699521978798, + "learning_rate": 6.950566172272295e-06, + "loss": 0.329, + "step": 13490 + }, + { + "epoch": 0.39, + "grad_norm": 1.6584589254404634, + "learning_rate": 6.9501336724133e-06, + "loss": 0.365, + "step": 13491 + }, + { + "epoch": 0.39, + "grad_norm": 1.281328085373386, + "learning_rate": 6.949701155344355e-06, + "loss": 0.3557, + "step": 13492 + }, + { + "epoch": 0.39, + "grad_norm": 2.5100988458734856, + "learning_rate": 6.949268621069274e-06, + "loss": 0.3237, + "step": 13493 + }, + { + "epoch": 0.39, + "grad_norm": 1.339180480264056, + "learning_rate": 6.948836069591876e-06, + "loss": 0.322, + "step": 13494 + }, + { + "epoch": 0.39, + "grad_norm": 1.2730370991229183, + "learning_rate": 6.948403500915977e-06, + "loss": 0.3057, + "step": 13495 + }, + { + "epoch": 0.39, + "grad_norm": 1.7567926801155174, + "learning_rate": 6.947970915045397e-06, + "loss": 0.368, + "step": 13496 + }, + { + "epoch": 0.39, + "grad_norm": 1.384166257133318, + "learning_rate": 6.947538311983951e-06, + "loss": 0.358, + "step": 13497 + }, + { + "epoch": 0.39, + "grad_norm": 1.4391397688205165, + "learning_rate": 6.947105691735459e-06, + "loss": 0.3154, + "step": 13498 + }, + { + "epoch": 0.39, + "grad_norm": 0.8823207359728731, + "learning_rate": 6.946673054303736e-06, + "loss": 0.5643, + "step": 13499 + }, + { + "epoch": 0.39, + "grad_norm": 1.3354597506649029, + "learning_rate": 6.946240399692602e-06, + "loss": 0.3236, + "step": 13500 + }, + { + "epoch": 0.39, + "grad_norm": 1.336188949765494, + "learning_rate": 6.945807727905876e-06, + "loss": 0.3223, + "step": 13501 + }, + { + "epoch": 0.39, + "grad_norm": 1.3165582028506306, + "learning_rate": 6.945375038947375e-06, + "loss": 0.3153, + "step": 13502 + }, + { + "epoch": 0.39, + "grad_norm": 1.4576821430031837, + "learning_rate": 6.944942332820916e-06, + "loss": 0.3507, + "step": 13503 + }, + { + "epoch": 0.39, + "grad_norm": 1.3845568862744864, + "learning_rate": 6.944509609530321e-06, + "loss": 0.3269, + "step": 13504 + }, + { + "epoch": 0.39, + "grad_norm": 1.440069862193451, + "learning_rate": 6.944076869079408e-06, + "loss": 0.3247, + "step": 13505 + }, + { + "epoch": 0.39, + "grad_norm": 1.9005772368739693, + "learning_rate": 6.943644111471994e-06, + "loss": 0.3156, + "step": 13506 + }, + { + "epoch": 0.39, + "grad_norm": 1.5760291381284188, + "learning_rate": 6.943211336711899e-06, + "loss": 0.3258, + "step": 13507 + }, + { + "epoch": 0.39, + "grad_norm": 1.36654115079744, + "learning_rate": 6.942778544802943e-06, + "loss": 0.337, + "step": 13508 + }, + { + "epoch": 0.39, + "grad_norm": 1.9588048502431756, + "learning_rate": 6.942345735748943e-06, + "loss": 0.3353, + "step": 13509 + }, + { + "epoch": 0.39, + "grad_norm": 1.2699541988090135, + "learning_rate": 6.941912909553723e-06, + "loss": 0.3168, + "step": 13510 + }, + { + "epoch": 0.39, + "grad_norm": 1.3293527552705446, + "learning_rate": 6.941480066221098e-06, + "loss": 0.3337, + "step": 13511 + }, + { + "epoch": 0.39, + "grad_norm": 1.2788678310719777, + "learning_rate": 6.94104720575489e-06, + "loss": 0.3327, + "step": 13512 + }, + { + "epoch": 0.39, + "grad_norm": 1.235345006663642, + "learning_rate": 6.940614328158919e-06, + "loss": 0.3426, + "step": 13513 + }, + { + "epoch": 0.39, + "grad_norm": 1.4254215570431326, + "learning_rate": 6.940181433437006e-06, + "loss": 0.3202, + "step": 13514 + }, + { + "epoch": 0.39, + "grad_norm": 1.3092769930405783, + "learning_rate": 6.9397485215929705e-06, + "loss": 0.3196, + "step": 13515 + }, + { + "epoch": 0.39, + "grad_norm": 1.3346249530469394, + "learning_rate": 6.939315592630631e-06, + "loss": 0.3281, + "step": 13516 + }, + { + "epoch": 0.39, + "grad_norm": 1.677520973772958, + "learning_rate": 6.938882646553811e-06, + "loss": 0.3416, + "step": 13517 + }, + { + "epoch": 0.39, + "grad_norm": 1.375964692744205, + "learning_rate": 6.938449683366329e-06, + "loss": 0.3225, + "step": 13518 + }, + { + "epoch": 0.39, + "grad_norm": 1.4284411802692514, + "learning_rate": 6.93801670307201e-06, + "loss": 0.3408, + "step": 13519 + }, + { + "epoch": 0.39, + "grad_norm": 1.2394814846695328, + "learning_rate": 6.937583705674671e-06, + "loss": 0.3226, + "step": 13520 + }, + { + "epoch": 0.39, + "grad_norm": 1.5483630731216245, + "learning_rate": 6.937150691178133e-06, + "loss": 0.3232, + "step": 13521 + }, + { + "epoch": 0.39, + "grad_norm": 1.4359709630320303, + "learning_rate": 6.936717659586219e-06, + "loss": 0.3148, + "step": 13522 + }, + { + "epoch": 0.39, + "grad_norm": 1.486642733936377, + "learning_rate": 6.936284610902749e-06, + "loss": 0.3283, + "step": 13523 + }, + { + "epoch": 0.39, + "grad_norm": 4.402054306098445, + "learning_rate": 6.935851545131549e-06, + "loss": 0.3454, + "step": 13524 + }, + { + "epoch": 0.39, + "grad_norm": 1.4802882827404462, + "learning_rate": 6.935418462276435e-06, + "loss": 0.3456, + "step": 13525 + }, + { + "epoch": 0.39, + "grad_norm": 1.4436168468841801, + "learning_rate": 6.934985362341232e-06, + "loss": 0.3254, + "step": 13526 + }, + { + "epoch": 0.39, + "grad_norm": 0.9217860969341543, + "learning_rate": 6.934552245329762e-06, + "loss": 0.577, + "step": 13527 + }, + { + "epoch": 0.39, + "grad_norm": 1.3887221590350407, + "learning_rate": 6.934119111245847e-06, + "loss": 0.3279, + "step": 13528 + }, + { + "epoch": 0.39, + "grad_norm": 1.5391265339027855, + "learning_rate": 6.93368596009331e-06, + "loss": 0.3796, + "step": 13529 + }, + { + "epoch": 0.39, + "grad_norm": 1.7790054162936761, + "learning_rate": 6.9332527918759715e-06, + "loss": 0.334, + "step": 13530 + }, + { + "epoch": 0.39, + "grad_norm": 1.9326751599288048, + "learning_rate": 6.932819606597657e-06, + "loss": 0.332, + "step": 13531 + }, + { + "epoch": 0.39, + "grad_norm": 1.3149628756166896, + "learning_rate": 6.932386404262188e-06, + "loss": 0.3261, + "step": 13532 + }, + { + "epoch": 0.39, + "grad_norm": 1.222480158671387, + "learning_rate": 6.931953184873387e-06, + "loss": 0.3143, + "step": 13533 + }, + { + "epoch": 0.39, + "grad_norm": 1.2875066519598348, + "learning_rate": 6.931519948435079e-06, + "loss": 0.3293, + "step": 13534 + }, + { + "epoch": 0.39, + "grad_norm": 1.3196610681244747, + "learning_rate": 6.931086694951085e-06, + "loss": 0.3594, + "step": 13535 + }, + { + "epoch": 0.39, + "grad_norm": 1.2786247001046254, + "learning_rate": 6.930653424425229e-06, + "loss": 0.3282, + "step": 13536 + }, + { + "epoch": 0.39, + "grad_norm": 3.6799765509008036, + "learning_rate": 6.930220136861335e-06, + "loss": 0.3346, + "step": 13537 + }, + { + "epoch": 0.39, + "grad_norm": 1.343315315612682, + "learning_rate": 6.929786832263228e-06, + "loss": 0.3398, + "step": 13538 + }, + { + "epoch": 0.39, + "grad_norm": 1.2743532419537555, + "learning_rate": 6.9293535106347294e-06, + "loss": 0.3345, + "step": 13539 + }, + { + "epoch": 0.39, + "grad_norm": 1.5345223842964757, + "learning_rate": 6.928920171979665e-06, + "loss": 0.339, + "step": 13540 + }, + { + "epoch": 0.39, + "grad_norm": 1.400853876043545, + "learning_rate": 6.928486816301859e-06, + "loss": 0.3466, + "step": 13541 + }, + { + "epoch": 0.39, + "grad_norm": 1.308353336829039, + "learning_rate": 6.928053443605136e-06, + "loss": 0.3222, + "step": 13542 + }, + { + "epoch": 0.39, + "grad_norm": 1.2867268157522984, + "learning_rate": 6.92762005389332e-06, + "loss": 0.324, + "step": 13543 + }, + { + "epoch": 0.39, + "grad_norm": 1.2747867683530356, + "learning_rate": 6.927186647170234e-06, + "loss": 0.3387, + "step": 13544 + }, + { + "epoch": 0.39, + "grad_norm": 1.2372395449963862, + "learning_rate": 6.926753223439706e-06, + "loss": 0.3381, + "step": 13545 + }, + { + "epoch": 0.39, + "grad_norm": 1.2539883703451753, + "learning_rate": 6.92631978270556e-06, + "loss": 0.3061, + "step": 13546 + }, + { + "epoch": 0.39, + "grad_norm": 1.29229677436676, + "learning_rate": 6.925886324971619e-06, + "loss": 0.3256, + "step": 13547 + }, + { + "epoch": 0.39, + "grad_norm": 1.3266674793674236, + "learning_rate": 6.925452850241712e-06, + "loss": 0.3562, + "step": 13548 + }, + { + "epoch": 0.39, + "grad_norm": 1.4057955343603916, + "learning_rate": 6.9250193585196605e-06, + "loss": 0.3314, + "step": 13549 + }, + { + "epoch": 0.39, + "grad_norm": 1.4527501548921398, + "learning_rate": 6.924585849809291e-06, + "loss": 0.3231, + "step": 13550 + }, + { + "epoch": 0.39, + "grad_norm": 1.3190441795318457, + "learning_rate": 6.924152324114432e-06, + "loss": 0.3334, + "step": 13551 + }, + { + "epoch": 0.39, + "grad_norm": 1.2444276665807545, + "learning_rate": 6.923718781438906e-06, + "loss": 0.342, + "step": 13552 + }, + { + "epoch": 0.39, + "grad_norm": 1.4485776905896681, + "learning_rate": 6.9232852217865395e-06, + "loss": 0.3432, + "step": 13553 + }, + { + "epoch": 0.39, + "grad_norm": 1.259028023870225, + "learning_rate": 6.922851645161162e-06, + "loss": 0.3432, + "step": 13554 + }, + { + "epoch": 0.39, + "grad_norm": 1.3342846333487373, + "learning_rate": 6.922418051566595e-06, + "loss": 0.3081, + "step": 13555 + }, + { + "epoch": 0.39, + "grad_norm": 1.0093804873359404, + "learning_rate": 6.92198444100667e-06, + "loss": 0.6152, + "step": 13556 + }, + { + "epoch": 0.39, + "grad_norm": 1.2535644203080736, + "learning_rate": 6.921550813485209e-06, + "loss": 0.303, + "step": 13557 + }, + { + "epoch": 0.39, + "grad_norm": 1.2319461937813272, + "learning_rate": 6.921117169006042e-06, + "loss": 0.3287, + "step": 13558 + }, + { + "epoch": 0.39, + "grad_norm": 1.3093450421495878, + "learning_rate": 6.920683507572994e-06, + "loss": 0.3447, + "step": 13559 + }, + { + "epoch": 0.39, + "grad_norm": 1.210625654618854, + "learning_rate": 6.920249829189893e-06, + "loss": 0.3073, + "step": 13560 + }, + { + "epoch": 0.39, + "grad_norm": 1.4505126586449324, + "learning_rate": 6.919816133860565e-06, + "loss": 0.37, + "step": 13561 + }, + { + "epoch": 0.39, + "grad_norm": 1.236977565766117, + "learning_rate": 6.91938242158884e-06, + "loss": 0.347, + "step": 13562 + }, + { + "epoch": 0.39, + "grad_norm": 1.2729806190713917, + "learning_rate": 6.918948692378542e-06, + "loss": 0.3252, + "step": 13563 + }, + { + "epoch": 0.39, + "grad_norm": 1.224331578111774, + "learning_rate": 6.918514946233502e-06, + "loss": 0.3122, + "step": 13564 + }, + { + "epoch": 0.39, + "grad_norm": 1.3453487446290777, + "learning_rate": 6.918081183157546e-06, + "loss": 0.3313, + "step": 13565 + }, + { + "epoch": 0.39, + "grad_norm": 1.5140540186729656, + "learning_rate": 6.917647403154502e-06, + "loss": 0.3316, + "step": 13566 + }, + { + "epoch": 0.39, + "grad_norm": 1.225491696103074, + "learning_rate": 6.917213606228198e-06, + "loss": 0.3292, + "step": 13567 + }, + { + "epoch": 0.39, + "grad_norm": 1.5286900828854912, + "learning_rate": 6.916779792382463e-06, + "loss": 0.326, + "step": 13568 + }, + { + "epoch": 0.39, + "grad_norm": 1.53953135336558, + "learning_rate": 6.916345961621126e-06, + "loss": 0.3291, + "step": 13569 + }, + { + "epoch": 0.39, + "grad_norm": 1.2856471445405646, + "learning_rate": 6.915912113948013e-06, + "loss": 0.3438, + "step": 13570 + }, + { + "epoch": 0.39, + "grad_norm": 1.2481059724180539, + "learning_rate": 6.915478249366956e-06, + "loss": 0.3397, + "step": 13571 + }, + { + "epoch": 0.39, + "grad_norm": 1.4768848489861304, + "learning_rate": 6.91504436788178e-06, + "loss": 0.3237, + "step": 13572 + }, + { + "epoch": 0.39, + "grad_norm": 1.2366819236797701, + "learning_rate": 6.914610469496318e-06, + "loss": 0.3791, + "step": 13573 + }, + { + "epoch": 0.39, + "grad_norm": 1.5580305922688797, + "learning_rate": 6.914176554214397e-06, + "loss": 0.3468, + "step": 13574 + }, + { + "epoch": 0.39, + "grad_norm": 1.2303389717418147, + "learning_rate": 6.913742622039846e-06, + "loss": 0.3314, + "step": 13575 + }, + { + "epoch": 0.39, + "grad_norm": 1.3175223099155833, + "learning_rate": 6.913308672976497e-06, + "loss": 0.3275, + "step": 13576 + }, + { + "epoch": 0.39, + "grad_norm": 1.2792073288036636, + "learning_rate": 6.912874707028177e-06, + "loss": 0.3361, + "step": 13577 + }, + { + "epoch": 0.39, + "grad_norm": 1.7891311508404266, + "learning_rate": 6.912440724198715e-06, + "loss": 0.3235, + "step": 13578 + }, + { + "epoch": 0.39, + "grad_norm": 1.3720452388048674, + "learning_rate": 6.912006724491944e-06, + "loss": 0.3162, + "step": 13579 + }, + { + "epoch": 0.39, + "grad_norm": 1.2608406101528125, + "learning_rate": 6.9115727079116914e-06, + "loss": 0.3137, + "step": 13580 + }, + { + "epoch": 0.39, + "grad_norm": 1.9935854902104477, + "learning_rate": 6.911138674461788e-06, + "loss": 0.3505, + "step": 13581 + }, + { + "epoch": 0.39, + "grad_norm": 1.608978414092131, + "learning_rate": 6.910704624146065e-06, + "loss": 0.3227, + "step": 13582 + }, + { + "epoch": 0.39, + "grad_norm": 1.3284325762612976, + "learning_rate": 6.910270556968354e-06, + "loss": 0.3307, + "step": 13583 + }, + { + "epoch": 0.39, + "grad_norm": 1.425517362554223, + "learning_rate": 6.909836472932483e-06, + "loss": 0.3099, + "step": 13584 + }, + { + "epoch": 0.39, + "grad_norm": 1.4616168199174229, + "learning_rate": 6.909402372042284e-06, + "loss": 0.3153, + "step": 13585 + }, + { + "epoch": 0.39, + "grad_norm": 1.653494271162223, + "learning_rate": 6.908968254301587e-06, + "loss": 0.3305, + "step": 13586 + }, + { + "epoch": 0.39, + "grad_norm": 1.4654985453217628, + "learning_rate": 6.908534119714226e-06, + "loss": 0.3425, + "step": 13587 + }, + { + "epoch": 0.39, + "grad_norm": 1.353939103684593, + "learning_rate": 6.908099968284029e-06, + "loss": 0.327, + "step": 13588 + }, + { + "epoch": 0.39, + "grad_norm": 1.4046348961243693, + "learning_rate": 6.907665800014828e-06, + "loss": 0.3228, + "step": 13589 + }, + { + "epoch": 0.39, + "grad_norm": 1.2548462040716233, + "learning_rate": 6.9072316149104565e-06, + "loss": 0.3204, + "step": 13590 + }, + { + "epoch": 0.39, + "grad_norm": 1.5583394462903746, + "learning_rate": 6.9067974129747435e-06, + "loss": 0.3353, + "step": 13591 + }, + { + "epoch": 0.39, + "grad_norm": 1.605443135543801, + "learning_rate": 6.9063631942115225e-06, + "loss": 0.3325, + "step": 13592 + }, + { + "epoch": 0.39, + "grad_norm": 1.2731095641759387, + "learning_rate": 6.905928958624627e-06, + "loss": 0.3339, + "step": 13593 + }, + { + "epoch": 0.39, + "grad_norm": 1.3916704870785916, + "learning_rate": 6.905494706217886e-06, + "loss": 0.3062, + "step": 13594 + }, + { + "epoch": 0.39, + "grad_norm": 1.279858213781156, + "learning_rate": 6.905060436995133e-06, + "loss": 0.3187, + "step": 13595 + }, + { + "epoch": 0.39, + "grad_norm": 1.3989450227044875, + "learning_rate": 6.904626150960198e-06, + "loss": 0.3334, + "step": 13596 + }, + { + "epoch": 0.39, + "grad_norm": 1.3952838353923482, + "learning_rate": 6.90419184811692e-06, + "loss": 0.3231, + "step": 13597 + }, + { + "epoch": 0.39, + "grad_norm": 1.3441985503877685, + "learning_rate": 6.903757528469126e-06, + "loss": 0.3218, + "step": 13598 + }, + { + "epoch": 0.39, + "grad_norm": 1.331746529613389, + "learning_rate": 6.903323192020651e-06, + "loss": 0.3262, + "step": 13599 + }, + { + "epoch": 0.39, + "grad_norm": 1.4203299782763907, + "learning_rate": 6.902888838775328e-06, + "loss": 0.3862, + "step": 13600 + }, + { + "epoch": 0.39, + "grad_norm": 1.2269356766072697, + "learning_rate": 6.90245446873699e-06, + "loss": 0.2973, + "step": 13601 + }, + { + "epoch": 0.39, + "grad_norm": 1.3062311308390238, + "learning_rate": 6.90202008190947e-06, + "loss": 0.3197, + "step": 13602 + }, + { + "epoch": 0.39, + "grad_norm": 1.2923734774288784, + "learning_rate": 6.9015856782966005e-06, + "loss": 0.3177, + "step": 13603 + }, + { + "epoch": 0.39, + "grad_norm": 1.4434325787712774, + "learning_rate": 6.901151257902218e-06, + "loss": 0.3244, + "step": 13604 + }, + { + "epoch": 0.39, + "grad_norm": 1.402418385526066, + "learning_rate": 6.900716820730154e-06, + "loss": 0.3332, + "step": 13605 + }, + { + "epoch": 0.39, + "grad_norm": 1.2584857933315396, + "learning_rate": 6.900282366784244e-06, + "loss": 0.3258, + "step": 13606 + }, + { + "epoch": 0.39, + "grad_norm": 1.2879636795390088, + "learning_rate": 6.899847896068319e-06, + "loss": 0.3193, + "step": 13607 + }, + { + "epoch": 0.39, + "grad_norm": 1.4599221119579235, + "learning_rate": 6.899413408586215e-06, + "loss": 0.3196, + "step": 13608 + }, + { + "epoch": 0.39, + "grad_norm": 1.3012001251722427, + "learning_rate": 6.898978904341768e-06, + "loss": 0.3363, + "step": 13609 + }, + { + "epoch": 0.39, + "grad_norm": 1.3414014310293758, + "learning_rate": 6.89854438333881e-06, + "loss": 0.3111, + "step": 13610 + }, + { + "epoch": 0.39, + "grad_norm": 1.3823847005781973, + "learning_rate": 6.898109845581179e-06, + "loss": 0.3454, + "step": 13611 + }, + { + "epoch": 0.39, + "grad_norm": 1.675885992398155, + "learning_rate": 6.897675291072705e-06, + "loss": 0.3352, + "step": 13612 + }, + { + "epoch": 0.39, + "grad_norm": 1.390190885853241, + "learning_rate": 6.8972407198172265e-06, + "loss": 0.3333, + "step": 13613 + }, + { + "epoch": 0.39, + "grad_norm": 1.5990586423094197, + "learning_rate": 6.896806131818578e-06, + "loss": 0.3551, + "step": 13614 + }, + { + "epoch": 0.39, + "grad_norm": 1.389048010158293, + "learning_rate": 6.896371527080594e-06, + "loss": 0.3265, + "step": 13615 + }, + { + "epoch": 0.39, + "grad_norm": 1.349101987227028, + "learning_rate": 6.89593690560711e-06, + "loss": 0.3246, + "step": 13616 + }, + { + "epoch": 0.39, + "grad_norm": 1.6079043314611812, + "learning_rate": 6.895502267401962e-06, + "loss": 0.3109, + "step": 13617 + }, + { + "epoch": 0.39, + "grad_norm": 1.6718177748518663, + "learning_rate": 6.895067612468986e-06, + "loss": 0.3344, + "step": 13618 + }, + { + "epoch": 0.4, + "grad_norm": 1.1802651166235452, + "learning_rate": 6.894632940812017e-06, + "loss": 0.3367, + "step": 13619 + }, + { + "epoch": 0.4, + "grad_norm": 1.3024246612996224, + "learning_rate": 6.894198252434891e-06, + "loss": 0.3093, + "step": 13620 + }, + { + "epoch": 0.4, + "grad_norm": 1.725860328438124, + "learning_rate": 6.893763547341446e-06, + "loss": 0.3184, + "step": 13621 + }, + { + "epoch": 0.4, + "grad_norm": 1.2685033428566894, + "learning_rate": 6.893328825535515e-06, + "loss": 0.319, + "step": 13622 + }, + { + "epoch": 0.4, + "grad_norm": 1.3426735848587295, + "learning_rate": 6.892894087020936e-06, + "loss": 0.3375, + "step": 13623 + }, + { + "epoch": 0.4, + "grad_norm": 1.2526667561715334, + "learning_rate": 6.892459331801547e-06, + "loss": 0.3367, + "step": 13624 + }, + { + "epoch": 0.4, + "grad_norm": 1.206093271769056, + "learning_rate": 6.892024559881181e-06, + "loss": 0.3064, + "step": 13625 + }, + { + "epoch": 0.4, + "grad_norm": 2.3515661569393287, + "learning_rate": 6.8915897712636785e-06, + "loss": 0.3266, + "step": 13626 + }, + { + "epoch": 0.4, + "grad_norm": 0.9880120282472092, + "learning_rate": 6.891154965952875e-06, + "loss": 0.5924, + "step": 13627 + }, + { + "epoch": 0.4, + "grad_norm": 1.948081150332825, + "learning_rate": 6.890720143952609e-06, + "loss": 0.3122, + "step": 13628 + }, + { + "epoch": 0.4, + "grad_norm": 1.5350192680332277, + "learning_rate": 6.8902853052667166e-06, + "loss": 0.3355, + "step": 13629 + }, + { + "epoch": 0.4, + "grad_norm": 0.9294249643997211, + "learning_rate": 6.889850449899034e-06, + "loss": 0.6395, + "step": 13630 + }, + { + "epoch": 0.4, + "grad_norm": 1.2634907186866013, + "learning_rate": 6.8894155778534015e-06, + "loss": 0.3422, + "step": 13631 + }, + { + "epoch": 0.4, + "grad_norm": 1.4569831507873674, + "learning_rate": 6.888980689133655e-06, + "loss": 0.3198, + "step": 13632 + }, + { + "epoch": 0.4, + "grad_norm": 1.4181035822345536, + "learning_rate": 6.888545783743634e-06, + "loss": 0.3171, + "step": 13633 + }, + { + "epoch": 0.4, + "grad_norm": 1.334506052026389, + "learning_rate": 6.888110861687175e-06, + "loss": 0.3706, + "step": 13634 + }, + { + "epoch": 0.4, + "grad_norm": 1.5247669292622885, + "learning_rate": 6.887675922968115e-06, + "loss": 0.3346, + "step": 13635 + }, + { + "epoch": 0.4, + "grad_norm": 1.3733074659232634, + "learning_rate": 6.887240967590296e-06, + "loss": 0.3207, + "step": 13636 + }, + { + "epoch": 0.4, + "grad_norm": 1.2173485399381485, + "learning_rate": 6.886805995557554e-06, + "loss": 0.325, + "step": 13637 + }, + { + "epoch": 0.4, + "grad_norm": 1.3423288965168174, + "learning_rate": 6.886371006873729e-06, + "loss": 0.3353, + "step": 13638 + }, + { + "epoch": 0.4, + "grad_norm": 1.196130391500703, + "learning_rate": 6.885936001542658e-06, + "loss": 0.3033, + "step": 13639 + }, + { + "epoch": 0.4, + "grad_norm": 1.566061432187357, + "learning_rate": 6.88550097956818e-06, + "loss": 0.3231, + "step": 13640 + }, + { + "epoch": 0.4, + "grad_norm": 1.3015438542269198, + "learning_rate": 6.885065940954136e-06, + "loss": 0.3293, + "step": 13641 + }, + { + "epoch": 0.4, + "grad_norm": 1.2791507507281166, + "learning_rate": 6.884630885704364e-06, + "loss": 0.3141, + "step": 13642 + }, + { + "epoch": 0.4, + "grad_norm": 1.423681080862129, + "learning_rate": 6.884195813822704e-06, + "loss": 0.3379, + "step": 13643 + }, + { + "epoch": 0.4, + "grad_norm": 1.2002679996053294, + "learning_rate": 6.8837607253129946e-06, + "loss": 0.3033, + "step": 13644 + }, + { + "epoch": 0.4, + "grad_norm": 1.2150160874333213, + "learning_rate": 6.8833256201790755e-06, + "loss": 0.3127, + "step": 13645 + }, + { + "epoch": 0.4, + "grad_norm": 1.265626446660865, + "learning_rate": 6.882890498424787e-06, + "loss": 0.3392, + "step": 13646 + }, + { + "epoch": 0.4, + "grad_norm": 1.2858686788170621, + "learning_rate": 6.882455360053971e-06, + "loss": 0.3104, + "step": 13647 + }, + { + "epoch": 0.4, + "grad_norm": 1.3106715835655078, + "learning_rate": 6.882020205070463e-06, + "loss": 0.3178, + "step": 13648 + }, + { + "epoch": 0.4, + "grad_norm": 1.3081748728500728, + "learning_rate": 6.881585033478107e-06, + "loss": 0.3312, + "step": 13649 + }, + { + "epoch": 0.4, + "grad_norm": 1.7314582542312513, + "learning_rate": 6.881149845280741e-06, + "loss": 0.3295, + "step": 13650 + }, + { + "epoch": 0.4, + "grad_norm": 1.0267556363153518, + "learning_rate": 6.880714640482208e-06, + "loss": 0.6572, + "step": 13651 + }, + { + "epoch": 0.4, + "grad_norm": 1.3171167864851951, + "learning_rate": 6.880279419086347e-06, + "loss": 0.327, + "step": 13652 + }, + { + "epoch": 0.4, + "grad_norm": 1.431608732461933, + "learning_rate": 6.8798441810969995e-06, + "loss": 0.3319, + "step": 13653 + }, + { + "epoch": 0.4, + "grad_norm": 1.3026767270276416, + "learning_rate": 6.879408926518006e-06, + "loss": 0.3392, + "step": 13654 + }, + { + "epoch": 0.4, + "grad_norm": 1.4876060022983209, + "learning_rate": 6.878973655353207e-06, + "loss": 0.3348, + "step": 13655 + }, + { + "epoch": 0.4, + "grad_norm": 1.471194032913394, + "learning_rate": 6.878538367606446e-06, + "loss": 0.3505, + "step": 13656 + }, + { + "epoch": 0.4, + "grad_norm": 2.0552907252119628, + "learning_rate": 6.878103063281562e-06, + "loss": 0.3353, + "step": 13657 + }, + { + "epoch": 0.4, + "grad_norm": 1.2675934826622417, + "learning_rate": 6.877667742382399e-06, + "loss": 0.312, + "step": 13658 + }, + { + "epoch": 0.4, + "grad_norm": 1.6079571706832438, + "learning_rate": 6.877232404912797e-06, + "loss": 0.3466, + "step": 13659 + }, + { + "epoch": 0.4, + "grad_norm": 1.453038293697026, + "learning_rate": 6.876797050876598e-06, + "loss": 0.3366, + "step": 13660 + }, + { + "epoch": 0.4, + "grad_norm": 1.6819229772654887, + "learning_rate": 6.876361680277645e-06, + "loss": 0.3662, + "step": 13661 + }, + { + "epoch": 0.4, + "grad_norm": 3.2885585968978113, + "learning_rate": 6.875926293119778e-06, + "loss": 0.3236, + "step": 13662 + }, + { + "epoch": 0.4, + "grad_norm": 1.3785730903702496, + "learning_rate": 6.875490889406841e-06, + "loss": 0.3865, + "step": 13663 + }, + { + "epoch": 0.4, + "grad_norm": 1.3652720841493775, + "learning_rate": 6.875055469142675e-06, + "loss": 0.3306, + "step": 13664 + }, + { + "epoch": 0.4, + "grad_norm": 1.3462444995012557, + "learning_rate": 6.874620032331126e-06, + "loss": 0.3354, + "step": 13665 + }, + { + "epoch": 0.4, + "grad_norm": 1.2679546733662463, + "learning_rate": 6.8741845789760335e-06, + "loss": 0.3259, + "step": 13666 + }, + { + "epoch": 0.4, + "grad_norm": 1.3693316737041357, + "learning_rate": 6.87374910908124e-06, + "loss": 0.3605, + "step": 13667 + }, + { + "epoch": 0.4, + "grad_norm": 1.6151880556578406, + "learning_rate": 6.87331362265059e-06, + "loss": 0.3204, + "step": 13668 + }, + { + "epoch": 0.4, + "grad_norm": 1.5040717151251977, + "learning_rate": 6.872878119687928e-06, + "loss": 0.3306, + "step": 13669 + }, + { + "epoch": 0.4, + "grad_norm": 1.5811289425617627, + "learning_rate": 6.872442600197095e-06, + "loss": 0.3548, + "step": 13670 + }, + { + "epoch": 0.4, + "grad_norm": 2.2834666942219255, + "learning_rate": 6.872007064181936e-06, + "loss": 0.3189, + "step": 13671 + }, + { + "epoch": 0.4, + "grad_norm": 1.2605552465489436, + "learning_rate": 6.871571511646293e-06, + "loss": 0.3356, + "step": 13672 + }, + { + "epoch": 0.4, + "grad_norm": 1.5156206276646165, + "learning_rate": 6.871135942594012e-06, + "loss": 0.3317, + "step": 13673 + }, + { + "epoch": 0.4, + "grad_norm": 1.4594738569808443, + "learning_rate": 6.870700357028934e-06, + "loss": 0.3135, + "step": 13674 + }, + { + "epoch": 0.4, + "grad_norm": 1.3205394263253238, + "learning_rate": 6.870264754954906e-06, + "loss": 0.3435, + "step": 13675 + }, + { + "epoch": 0.4, + "grad_norm": 1.2938607887904745, + "learning_rate": 6.86982913637577e-06, + "loss": 0.3159, + "step": 13676 + }, + { + "epoch": 0.4, + "grad_norm": 1.3361114295002714, + "learning_rate": 6.86939350129537e-06, + "loss": 0.3599, + "step": 13677 + }, + { + "epoch": 0.4, + "grad_norm": 1.141412507087944, + "learning_rate": 6.868957849717553e-06, + "loss": 0.3131, + "step": 13678 + }, + { + "epoch": 0.4, + "grad_norm": 1.2578564868243243, + "learning_rate": 6.868522181646162e-06, + "loss": 0.3317, + "step": 13679 + }, + { + "epoch": 0.4, + "grad_norm": 1.3169613374363598, + "learning_rate": 6.868086497085042e-06, + "loss": 0.3261, + "step": 13680 + }, + { + "epoch": 0.4, + "grad_norm": 1.6128029886888904, + "learning_rate": 6.867650796038038e-06, + "loss": 0.3392, + "step": 13681 + }, + { + "epoch": 0.4, + "grad_norm": 1.2021023060150993, + "learning_rate": 6.867215078508994e-06, + "loss": 0.3197, + "step": 13682 + }, + { + "epoch": 0.4, + "grad_norm": 1.3672281329564038, + "learning_rate": 6.866779344501758e-06, + "loss": 0.311, + "step": 13683 + }, + { + "epoch": 0.4, + "grad_norm": 1.3090116962672604, + "learning_rate": 6.866343594020173e-06, + "loss": 0.3344, + "step": 13684 + }, + { + "epoch": 0.4, + "grad_norm": 1.2978462045352819, + "learning_rate": 6.865907827068085e-06, + "loss": 0.3179, + "step": 13685 + }, + { + "epoch": 0.4, + "grad_norm": 1.33749229757046, + "learning_rate": 6.865472043649338e-06, + "loss": 0.3335, + "step": 13686 + }, + { + "epoch": 0.4, + "grad_norm": 1.3964447670125368, + "learning_rate": 6.865036243767781e-06, + "loss": 0.3212, + "step": 13687 + }, + { + "epoch": 0.4, + "grad_norm": 0.9541060531760392, + "learning_rate": 6.864600427427259e-06, + "loss": 0.6246, + "step": 13688 + }, + { + "epoch": 0.4, + "grad_norm": 1.240541515162284, + "learning_rate": 6.8641645946316185e-06, + "loss": 0.3487, + "step": 13689 + }, + { + "epoch": 0.4, + "grad_norm": 1.2961936192642372, + "learning_rate": 6.8637287453847025e-06, + "loss": 0.3293, + "step": 13690 + }, + { + "epoch": 0.4, + "grad_norm": 1.4850713375411773, + "learning_rate": 6.86329287969036e-06, + "loss": 0.3293, + "step": 13691 + }, + { + "epoch": 0.4, + "grad_norm": 1.4913872836422146, + "learning_rate": 6.862856997552437e-06, + "loss": 0.3092, + "step": 13692 + }, + { + "epoch": 0.4, + "grad_norm": 1.2541602924746609, + "learning_rate": 6.862421098974782e-06, + "loss": 0.3241, + "step": 13693 + }, + { + "epoch": 0.4, + "grad_norm": 1.337500721096827, + "learning_rate": 6.861985183961239e-06, + "loss": 0.3263, + "step": 13694 + }, + { + "epoch": 0.4, + "grad_norm": 1.3814885398774133, + "learning_rate": 6.861549252515656e-06, + "loss": 0.3509, + "step": 13695 + }, + { + "epoch": 0.4, + "grad_norm": 1.4084097078320221, + "learning_rate": 6.86111330464188e-06, + "loss": 0.3157, + "step": 13696 + }, + { + "epoch": 0.4, + "grad_norm": 1.3062549550822342, + "learning_rate": 6.86067734034376e-06, + "loss": 0.3334, + "step": 13697 + }, + { + "epoch": 0.4, + "grad_norm": 1.4386104266123168, + "learning_rate": 6.86024135962514e-06, + "loss": 0.3201, + "step": 13698 + }, + { + "epoch": 0.4, + "grad_norm": 1.3001098232564212, + "learning_rate": 6.859805362489869e-06, + "loss": 0.3305, + "step": 13699 + }, + { + "epoch": 0.4, + "grad_norm": 1.6373720825162885, + "learning_rate": 6.859369348941796e-06, + "loss": 0.3406, + "step": 13700 + }, + { + "epoch": 0.4, + "grad_norm": 1.2570392500658383, + "learning_rate": 6.858933318984769e-06, + "loss": 0.3173, + "step": 13701 + }, + { + "epoch": 0.4, + "grad_norm": 1.4697775595816256, + "learning_rate": 6.858497272622633e-06, + "loss": 0.3551, + "step": 13702 + }, + { + "epoch": 0.4, + "grad_norm": 1.2244714670510517, + "learning_rate": 6.858061209859239e-06, + "loss": 0.3077, + "step": 13703 + }, + { + "epoch": 0.4, + "grad_norm": 1.2591062033606475, + "learning_rate": 6.857625130698434e-06, + "loss": 0.3269, + "step": 13704 + }, + { + "epoch": 0.4, + "grad_norm": 1.2208330171569608, + "learning_rate": 6.857189035144067e-06, + "loss": 0.3208, + "step": 13705 + }, + { + "epoch": 0.4, + "grad_norm": 1.6662889276655057, + "learning_rate": 6.856752923199985e-06, + "loss": 0.3441, + "step": 13706 + }, + { + "epoch": 0.4, + "grad_norm": 1.244177274837838, + "learning_rate": 6.85631679487004e-06, + "loss": 0.3222, + "step": 13707 + }, + { + "epoch": 0.4, + "grad_norm": 1.557062180223225, + "learning_rate": 6.8558806501580764e-06, + "loss": 0.3379, + "step": 13708 + }, + { + "epoch": 0.4, + "grad_norm": 1.4683101928157918, + "learning_rate": 6.855444489067946e-06, + "loss": 0.3425, + "step": 13709 + }, + { + "epoch": 0.4, + "grad_norm": 1.328968597489244, + "learning_rate": 6.855008311603497e-06, + "loss": 0.3167, + "step": 13710 + }, + { + "epoch": 0.4, + "grad_norm": 1.2898892842777854, + "learning_rate": 6.854572117768581e-06, + "loss": 0.3278, + "step": 13711 + }, + { + "epoch": 0.4, + "grad_norm": 1.3108253015389815, + "learning_rate": 6.854135907567043e-06, + "loss": 0.3314, + "step": 13712 + }, + { + "epoch": 0.4, + "grad_norm": 1.3323969624589664, + "learning_rate": 6.853699681002737e-06, + "loss": 0.3264, + "step": 13713 + }, + { + "epoch": 0.4, + "grad_norm": 3.649656938821621, + "learning_rate": 6.853263438079509e-06, + "loss": 0.34, + "step": 13714 + }, + { + "epoch": 0.4, + "grad_norm": 1.4388134728833362, + "learning_rate": 6.8528271788012135e-06, + "loss": 0.3327, + "step": 13715 + }, + { + "epoch": 0.4, + "grad_norm": 1.5192268723609195, + "learning_rate": 6.852390903171695e-06, + "loss": 0.3076, + "step": 13716 + }, + { + "epoch": 0.4, + "grad_norm": 1.2766148120651675, + "learning_rate": 6.851954611194808e-06, + "loss": 0.3193, + "step": 13717 + }, + { + "epoch": 0.4, + "grad_norm": 1.3520733752353193, + "learning_rate": 6.851518302874399e-06, + "loss": 0.3239, + "step": 13718 + }, + { + "epoch": 0.4, + "grad_norm": 1.2142388825387216, + "learning_rate": 6.851081978214321e-06, + "loss": 0.3267, + "step": 13719 + }, + { + "epoch": 0.4, + "grad_norm": 1.29751028724869, + "learning_rate": 6.850645637218426e-06, + "loss": 0.2981, + "step": 13720 + }, + { + "epoch": 0.4, + "grad_norm": 1.5967013631035785, + "learning_rate": 6.850209279890561e-06, + "loss": 0.3316, + "step": 13721 + }, + { + "epoch": 0.4, + "grad_norm": 1.3301770633306818, + "learning_rate": 6.849772906234577e-06, + "loss": 0.3307, + "step": 13722 + }, + { + "epoch": 0.4, + "grad_norm": 1.312824072110114, + "learning_rate": 6.849336516254329e-06, + "loss": 0.3324, + "step": 13723 + }, + { + "epoch": 0.4, + "grad_norm": 1.8379971730305065, + "learning_rate": 6.848900109953664e-06, + "loss": 0.2954, + "step": 13724 + }, + { + "epoch": 0.4, + "grad_norm": 2.4004094829557143, + "learning_rate": 6.848463687336437e-06, + "loss": 0.3606, + "step": 13725 + }, + { + "epoch": 0.4, + "grad_norm": 1.2497926623171929, + "learning_rate": 6.848027248406497e-06, + "loss": 0.2942, + "step": 13726 + }, + { + "epoch": 0.4, + "grad_norm": 1.3243482573640308, + "learning_rate": 6.847590793167696e-06, + "loss": 0.3146, + "step": 13727 + }, + { + "epoch": 0.4, + "grad_norm": 1.3400336034614624, + "learning_rate": 6.847154321623885e-06, + "loss": 0.3303, + "step": 13728 + }, + { + "epoch": 0.4, + "grad_norm": 1.3198704994180066, + "learning_rate": 6.846717833778918e-06, + "loss": 0.358, + "step": 13729 + }, + { + "epoch": 0.4, + "grad_norm": 1.2274565554384154, + "learning_rate": 6.846281329636644e-06, + "loss": 0.3057, + "step": 13730 + }, + { + "epoch": 0.4, + "grad_norm": 1.886057992082962, + "learning_rate": 6.845844809200918e-06, + "loss": 0.3333, + "step": 13731 + }, + { + "epoch": 0.4, + "grad_norm": 1.4478292232746621, + "learning_rate": 6.845408272475589e-06, + "loss": 0.3164, + "step": 13732 + }, + { + "epoch": 0.4, + "grad_norm": 3.0126393175693877, + "learning_rate": 6.844971719464513e-06, + "loss": 0.3231, + "step": 13733 + }, + { + "epoch": 0.4, + "grad_norm": 1.5389584348545584, + "learning_rate": 6.844535150171542e-06, + "loss": 0.3543, + "step": 13734 + }, + { + "epoch": 0.4, + "grad_norm": 1.4006330371404818, + "learning_rate": 6.844098564600527e-06, + "loss": 0.3565, + "step": 13735 + }, + { + "epoch": 0.4, + "grad_norm": 1.3196395776647818, + "learning_rate": 6.843661962755321e-06, + "loss": 0.3108, + "step": 13736 + }, + { + "epoch": 0.4, + "grad_norm": 1.306065009818164, + "learning_rate": 6.843225344639778e-06, + "loss": 0.3291, + "step": 13737 + }, + { + "epoch": 0.4, + "grad_norm": 1.24286656837916, + "learning_rate": 6.842788710257752e-06, + "loss": 0.3304, + "step": 13738 + }, + { + "epoch": 0.4, + "grad_norm": 1.4157700565994766, + "learning_rate": 6.842352059613094e-06, + "loss": 0.3454, + "step": 13739 + }, + { + "epoch": 0.4, + "grad_norm": 1.3212845485049183, + "learning_rate": 6.841915392709659e-06, + "loss": 0.3327, + "step": 13740 + }, + { + "epoch": 0.4, + "grad_norm": 1.2793006063100392, + "learning_rate": 6.841478709551299e-06, + "loss": 0.3121, + "step": 13741 + }, + { + "epoch": 0.4, + "grad_norm": 1.2670147660496485, + "learning_rate": 6.84104201014187e-06, + "loss": 0.3231, + "step": 13742 + }, + { + "epoch": 0.4, + "grad_norm": 1.263085425797356, + "learning_rate": 6.840605294485226e-06, + "loss": 0.2953, + "step": 13743 + }, + { + "epoch": 0.4, + "grad_norm": 1.263591158796601, + "learning_rate": 6.840168562585219e-06, + "loss": 0.3673, + "step": 13744 + }, + { + "epoch": 0.4, + "grad_norm": 1.3286850265347894, + "learning_rate": 6.839731814445705e-06, + "loss": 0.3156, + "step": 13745 + }, + { + "epoch": 0.4, + "grad_norm": 1.247996949220141, + "learning_rate": 6.839295050070535e-06, + "loss": 0.3313, + "step": 13746 + }, + { + "epoch": 0.4, + "grad_norm": 1.5140778745322592, + "learning_rate": 6.8388582694635676e-06, + "loss": 0.3576, + "step": 13747 + }, + { + "epoch": 0.4, + "grad_norm": 1.3831241096031601, + "learning_rate": 6.8384214726286556e-06, + "loss": 0.3299, + "step": 13748 + }, + { + "epoch": 0.4, + "grad_norm": 1.330075539554998, + "learning_rate": 6.837984659569653e-06, + "loss": 0.3431, + "step": 13749 + }, + { + "epoch": 0.4, + "grad_norm": 1.2255876222104332, + "learning_rate": 6.837547830290416e-06, + "loss": 0.3228, + "step": 13750 + }, + { + "epoch": 0.4, + "grad_norm": 1.498446404493393, + "learning_rate": 6.837110984794797e-06, + "loss": 0.2979, + "step": 13751 + }, + { + "epoch": 0.4, + "grad_norm": 1.445189563347458, + "learning_rate": 6.836674123086656e-06, + "loss": 0.3281, + "step": 13752 + }, + { + "epoch": 0.4, + "grad_norm": 1.2024008797439394, + "learning_rate": 6.836237245169843e-06, + "loss": 0.3363, + "step": 13753 + }, + { + "epoch": 0.4, + "grad_norm": 1.5636066187761646, + "learning_rate": 6.835800351048218e-06, + "loss": 0.3323, + "step": 13754 + }, + { + "epoch": 0.4, + "grad_norm": 1.2476960271036759, + "learning_rate": 6.835363440725633e-06, + "loss": 0.308, + "step": 13755 + }, + { + "epoch": 0.4, + "grad_norm": 1.8026466626818984, + "learning_rate": 6.834926514205947e-06, + "loss": 0.337, + "step": 13756 + }, + { + "epoch": 0.4, + "grad_norm": 1.334737216839428, + "learning_rate": 6.834489571493013e-06, + "loss": 0.3377, + "step": 13757 + }, + { + "epoch": 0.4, + "grad_norm": 2.617995048416887, + "learning_rate": 6.834052612590688e-06, + "loss": 0.3363, + "step": 13758 + }, + { + "epoch": 0.4, + "grad_norm": 1.4406557848715529, + "learning_rate": 6.833615637502829e-06, + "loss": 0.3263, + "step": 13759 + }, + { + "epoch": 0.4, + "grad_norm": 1.45039580170667, + "learning_rate": 6.833178646233291e-06, + "loss": 0.3462, + "step": 13760 + }, + { + "epoch": 0.4, + "grad_norm": 1.2905190744486195, + "learning_rate": 6.832741638785932e-06, + "loss": 0.3428, + "step": 13761 + }, + { + "epoch": 0.4, + "grad_norm": 1.3010384964648452, + "learning_rate": 6.8323046151646066e-06, + "loss": 0.3296, + "step": 13762 + }, + { + "epoch": 0.4, + "grad_norm": 1.3552919643977914, + "learning_rate": 6.8318675753731735e-06, + "loss": 0.3182, + "step": 13763 + }, + { + "epoch": 0.4, + "grad_norm": 0.9461071540814587, + "learning_rate": 6.831430519415488e-06, + "loss": 0.6269, + "step": 13764 + }, + { + "epoch": 0.4, + "grad_norm": 1.313283876882832, + "learning_rate": 6.830993447295409e-06, + "loss": 0.326, + "step": 13765 + }, + { + "epoch": 0.4, + "grad_norm": 1.3360987953191026, + "learning_rate": 6.830556359016791e-06, + "loss": 0.3401, + "step": 13766 + }, + { + "epoch": 0.4, + "grad_norm": 1.2154380891842795, + "learning_rate": 6.830119254583494e-06, + "loss": 0.3199, + "step": 13767 + }, + { + "epoch": 0.4, + "grad_norm": 1.6888767275420937, + "learning_rate": 6.829682133999374e-06, + "loss": 0.3289, + "step": 13768 + }, + { + "epoch": 0.4, + "grad_norm": 1.368001229364775, + "learning_rate": 6.829244997268287e-06, + "loss": 0.3334, + "step": 13769 + }, + { + "epoch": 0.4, + "grad_norm": 1.3644775061506411, + "learning_rate": 6.828807844394096e-06, + "loss": 0.3436, + "step": 13770 + }, + { + "epoch": 0.4, + "grad_norm": 1.3673238648879849, + "learning_rate": 6.828370675380653e-06, + "loss": 0.3293, + "step": 13771 + }, + { + "epoch": 0.4, + "grad_norm": 1.2604897942238866, + "learning_rate": 6.82793349023182e-06, + "loss": 0.3065, + "step": 13772 + }, + { + "epoch": 0.4, + "grad_norm": 1.9691947118680817, + "learning_rate": 6.827496288951453e-06, + "loss": 0.3324, + "step": 13773 + }, + { + "epoch": 0.4, + "grad_norm": 1.5357340082053892, + "learning_rate": 6.827059071543411e-06, + "loss": 0.3311, + "step": 13774 + }, + { + "epoch": 0.4, + "grad_norm": 1.3479329662775217, + "learning_rate": 6.826621838011553e-06, + "loss": 0.3535, + "step": 13775 + }, + { + "epoch": 0.4, + "grad_norm": 1.35052366369445, + "learning_rate": 6.826184588359737e-06, + "loss": 0.315, + "step": 13776 + }, + { + "epoch": 0.4, + "grad_norm": 1.2156244679469315, + "learning_rate": 6.82574732259182e-06, + "loss": 0.3319, + "step": 13777 + }, + { + "epoch": 0.4, + "grad_norm": 1.3052930003395502, + "learning_rate": 6.825310040711663e-06, + "loss": 0.3163, + "step": 13778 + }, + { + "epoch": 0.4, + "grad_norm": 1.6883630523385673, + "learning_rate": 6.824872742723126e-06, + "loss": 0.3047, + "step": 13779 + }, + { + "epoch": 0.4, + "grad_norm": 2.7693951189157833, + "learning_rate": 6.824435428630068e-06, + "loss": 0.3205, + "step": 13780 + }, + { + "epoch": 0.4, + "grad_norm": 1.3168556481608298, + "learning_rate": 6.823998098436344e-06, + "loss": 0.3259, + "step": 13781 + }, + { + "epoch": 0.4, + "grad_norm": 1.3882303539969538, + "learning_rate": 6.823560752145817e-06, + "loss": 0.3256, + "step": 13782 + }, + { + "epoch": 0.4, + "grad_norm": 1.4547999452567768, + "learning_rate": 6.823123389762347e-06, + "loss": 0.338, + "step": 13783 + }, + { + "epoch": 0.4, + "grad_norm": 1.39098270469068, + "learning_rate": 6.822686011289794e-06, + "loss": 0.3324, + "step": 13784 + }, + { + "epoch": 0.4, + "grad_norm": 1.4308164918407404, + "learning_rate": 6.822248616732014e-06, + "loss": 0.3543, + "step": 13785 + }, + { + "epoch": 0.4, + "grad_norm": 1.3811800747753762, + "learning_rate": 6.821811206092872e-06, + "loss": 0.3111, + "step": 13786 + }, + { + "epoch": 0.4, + "grad_norm": 1.2748717500055533, + "learning_rate": 6.821373779376226e-06, + "loss": 0.3226, + "step": 13787 + }, + { + "epoch": 0.4, + "grad_norm": 1.3065049583341926, + "learning_rate": 6.820936336585935e-06, + "loss": 0.3514, + "step": 13788 + }, + { + "epoch": 0.4, + "grad_norm": 1.2960609438067092, + "learning_rate": 6.820498877725862e-06, + "loss": 0.3403, + "step": 13789 + }, + { + "epoch": 0.4, + "grad_norm": 1.3455484823191686, + "learning_rate": 6.820061402799865e-06, + "loss": 0.3125, + "step": 13790 + }, + { + "epoch": 0.4, + "grad_norm": 1.3805284088048373, + "learning_rate": 6.819623911811806e-06, + "loss": 0.3448, + "step": 13791 + }, + { + "epoch": 0.4, + "grad_norm": 1.3714034257282512, + "learning_rate": 6.819186404765545e-06, + "loss": 0.3241, + "step": 13792 + }, + { + "epoch": 0.4, + "grad_norm": 1.4271268498466863, + "learning_rate": 6.8187488816649465e-06, + "loss": 0.3464, + "step": 13793 + }, + { + "epoch": 0.4, + "grad_norm": 1.4642127536562801, + "learning_rate": 6.818311342513867e-06, + "loss": 0.329, + "step": 13794 + }, + { + "epoch": 0.4, + "grad_norm": 1.587440345308455, + "learning_rate": 6.81787378731617e-06, + "loss": 0.3211, + "step": 13795 + }, + { + "epoch": 0.4, + "grad_norm": 1.9123243653142277, + "learning_rate": 6.817436216075716e-06, + "loss": 0.3147, + "step": 13796 + }, + { + "epoch": 0.4, + "grad_norm": 1.2462988641338546, + "learning_rate": 6.8169986287963675e-06, + "loss": 0.3205, + "step": 13797 + }, + { + "epoch": 0.4, + "grad_norm": 1.3630638752156854, + "learning_rate": 6.816561025481987e-06, + "loss": 0.3184, + "step": 13798 + }, + { + "epoch": 0.4, + "grad_norm": 1.2918648489936695, + "learning_rate": 6.816123406136434e-06, + "loss": 0.3296, + "step": 13799 + }, + { + "epoch": 0.4, + "grad_norm": 1.2768384425906363, + "learning_rate": 6.815685770763573e-06, + "loss": 0.3329, + "step": 13800 + }, + { + "epoch": 0.4, + "grad_norm": 1.2691300370053766, + "learning_rate": 6.8152481193672635e-06, + "loss": 0.3188, + "step": 13801 + }, + { + "epoch": 0.4, + "grad_norm": 1.2349515440969978, + "learning_rate": 6.81481045195137e-06, + "loss": 0.3273, + "step": 13802 + }, + { + "epoch": 0.4, + "grad_norm": 1.4086708071088765, + "learning_rate": 6.814372768519756e-06, + "loss": 0.3358, + "step": 13803 + }, + { + "epoch": 0.4, + "grad_norm": 1.75988806736069, + "learning_rate": 6.81393506907628e-06, + "loss": 0.3108, + "step": 13804 + }, + { + "epoch": 0.4, + "grad_norm": 1.2801584915166757, + "learning_rate": 6.813497353624806e-06, + "loss": 0.3283, + "step": 13805 + }, + { + "epoch": 0.4, + "grad_norm": 1.8378770848648995, + "learning_rate": 6.813059622169199e-06, + "loss": 0.3416, + "step": 13806 + }, + { + "epoch": 0.4, + "grad_norm": 1.308550092313714, + "learning_rate": 6.812621874713321e-06, + "loss": 0.3146, + "step": 13807 + }, + { + "epoch": 0.4, + "grad_norm": 1.349780467157861, + "learning_rate": 6.8121841112610355e-06, + "loss": 0.3249, + "step": 13808 + }, + { + "epoch": 0.4, + "grad_norm": 1.3828063416973544, + "learning_rate": 6.8117463318162035e-06, + "loss": 0.3259, + "step": 13809 + }, + { + "epoch": 0.4, + "grad_norm": 1.2905339842372299, + "learning_rate": 6.81130853638269e-06, + "loss": 0.3265, + "step": 13810 + }, + { + "epoch": 0.4, + "grad_norm": 1.427995725984426, + "learning_rate": 6.81087072496436e-06, + "loss": 0.3622, + "step": 13811 + }, + { + "epoch": 0.4, + "grad_norm": 1.4093716279248878, + "learning_rate": 6.810432897565075e-06, + "loss": 0.3342, + "step": 13812 + }, + { + "epoch": 0.4, + "grad_norm": 1.4857222123336928, + "learning_rate": 6.8099950541886995e-06, + "loss": 0.3695, + "step": 13813 + }, + { + "epoch": 0.4, + "grad_norm": 1.4332538070663563, + "learning_rate": 6.8095571948390974e-06, + "loss": 0.3557, + "step": 13814 + }, + { + "epoch": 0.4, + "grad_norm": 1.718117504340187, + "learning_rate": 6.809119319520134e-06, + "loss": 0.3092, + "step": 13815 + }, + { + "epoch": 0.4, + "grad_norm": 1.7442963416822368, + "learning_rate": 6.808681428235673e-06, + "loss": 0.3335, + "step": 13816 + }, + { + "epoch": 0.4, + "grad_norm": 1.234875083771654, + "learning_rate": 6.808243520989576e-06, + "loss": 0.3191, + "step": 13817 + }, + { + "epoch": 0.4, + "grad_norm": 1.4314015372720288, + "learning_rate": 6.8078055977857104e-06, + "loss": 0.3203, + "step": 13818 + }, + { + "epoch": 0.4, + "grad_norm": 1.3780138921481697, + "learning_rate": 6.8073676586279416e-06, + "loss": 0.324, + "step": 13819 + }, + { + "epoch": 0.4, + "grad_norm": 2.1671679787247724, + "learning_rate": 6.806929703520132e-06, + "loss": 0.3479, + "step": 13820 + }, + { + "epoch": 0.4, + "grad_norm": 1.2952657194068338, + "learning_rate": 6.806491732466149e-06, + "loss": 0.3029, + "step": 13821 + }, + { + "epoch": 0.4, + "grad_norm": 1.3446364593139737, + "learning_rate": 6.806053745469855e-06, + "loss": 0.3305, + "step": 13822 + }, + { + "epoch": 0.4, + "grad_norm": 1.4046986449694898, + "learning_rate": 6.805615742535117e-06, + "loss": 0.3372, + "step": 13823 + }, + { + "epoch": 0.4, + "grad_norm": 1.3302063858718838, + "learning_rate": 6.8051777236658e-06, + "loss": 0.3072, + "step": 13824 + }, + { + "epoch": 0.4, + "grad_norm": 1.4562904729419657, + "learning_rate": 6.804739688865771e-06, + "loss": 0.3443, + "step": 13825 + }, + { + "epoch": 0.4, + "grad_norm": 1.3943851375213132, + "learning_rate": 6.804301638138893e-06, + "loss": 0.3215, + "step": 13826 + }, + { + "epoch": 0.4, + "grad_norm": 1.6223945752708113, + "learning_rate": 6.803863571489033e-06, + "loss": 0.2982, + "step": 13827 + }, + { + "epoch": 0.4, + "grad_norm": 1.3242933155268204, + "learning_rate": 6.803425488920057e-06, + "loss": 0.3451, + "step": 13828 + }, + { + "epoch": 0.4, + "grad_norm": 1.4086164954298621, + "learning_rate": 6.802987390435832e-06, + "loss": 0.3449, + "step": 13829 + }, + { + "epoch": 0.4, + "grad_norm": 1.2641460374506324, + "learning_rate": 6.802549276040222e-06, + "loss": 0.3235, + "step": 13830 + }, + { + "epoch": 0.4, + "grad_norm": 1.3311970175485879, + "learning_rate": 6.802111145737096e-06, + "loss": 0.3074, + "step": 13831 + }, + { + "epoch": 0.4, + "grad_norm": 2.0216379097181947, + "learning_rate": 6.801672999530318e-06, + "loss": 0.3349, + "step": 13832 + }, + { + "epoch": 0.4, + "grad_norm": 1.2975201938117713, + "learning_rate": 6.801234837423756e-06, + "loss": 0.3232, + "step": 13833 + }, + { + "epoch": 0.4, + "grad_norm": 1.2246140263640717, + "learning_rate": 6.800796659421277e-06, + "loss": 0.308, + "step": 13834 + }, + { + "epoch": 0.4, + "grad_norm": 1.886108587248647, + "learning_rate": 6.800358465526747e-06, + "loss": 0.356, + "step": 13835 + }, + { + "epoch": 0.4, + "grad_norm": 1.6395061491008773, + "learning_rate": 6.799920255744032e-06, + "loss": 0.3507, + "step": 13836 + }, + { + "epoch": 0.4, + "grad_norm": 1.3826354198000734, + "learning_rate": 6.799482030077002e-06, + "loss": 0.3438, + "step": 13837 + }, + { + "epoch": 0.4, + "grad_norm": 1.4964939525302978, + "learning_rate": 6.799043788529525e-06, + "loss": 0.3228, + "step": 13838 + }, + { + "epoch": 0.4, + "grad_norm": 1.361257515711021, + "learning_rate": 6.798605531105465e-06, + "loss": 0.3352, + "step": 13839 + }, + { + "epoch": 0.4, + "grad_norm": 1.4444949470497186, + "learning_rate": 6.7981672578086925e-06, + "loss": 0.3147, + "step": 13840 + }, + { + "epoch": 0.4, + "grad_norm": 1.423129365749333, + "learning_rate": 6.797728968643071e-06, + "loss": 0.3344, + "step": 13841 + }, + { + "epoch": 0.4, + "grad_norm": 1.1778390908015326, + "learning_rate": 6.797290663612473e-06, + "loss": 0.3029, + "step": 13842 + }, + { + "epoch": 0.4, + "grad_norm": 1.3952897185583395, + "learning_rate": 6.796852342720767e-06, + "loss": 0.3208, + "step": 13843 + }, + { + "epoch": 0.4, + "grad_norm": 1.7197413313625851, + "learning_rate": 6.796414005971818e-06, + "loss": 0.3269, + "step": 13844 + }, + { + "epoch": 0.4, + "grad_norm": 1.2959920882793388, + "learning_rate": 6.795975653369496e-06, + "loss": 0.3378, + "step": 13845 + }, + { + "epoch": 0.4, + "grad_norm": 1.259369855559555, + "learning_rate": 6.795537284917666e-06, + "loss": 0.3134, + "step": 13846 + }, + { + "epoch": 0.4, + "grad_norm": 1.7712281334998163, + "learning_rate": 6.795098900620202e-06, + "loss": 0.3385, + "step": 13847 + }, + { + "epoch": 0.4, + "grad_norm": 1.0518552313153142, + "learning_rate": 6.79466050048097e-06, + "loss": 0.5566, + "step": 13848 + }, + { + "epoch": 0.4, + "grad_norm": 1.3494741139747997, + "learning_rate": 6.794222084503839e-06, + "loss": 0.3261, + "step": 13849 + }, + { + "epoch": 0.4, + "grad_norm": 1.3507008704499852, + "learning_rate": 6.793783652692677e-06, + "loss": 0.3222, + "step": 13850 + }, + { + "epoch": 0.4, + "grad_norm": 1.2367452635497629, + "learning_rate": 6.793345205051356e-06, + "loss": 0.3296, + "step": 13851 + }, + { + "epoch": 0.4, + "grad_norm": 1.4521003532418593, + "learning_rate": 6.792906741583743e-06, + "loss": 0.3845, + "step": 13852 + }, + { + "epoch": 0.4, + "grad_norm": 1.3336532353868578, + "learning_rate": 6.792468262293708e-06, + "loss": 0.3639, + "step": 13853 + }, + { + "epoch": 0.4, + "grad_norm": 1.435816895029908, + "learning_rate": 6.792029767185121e-06, + "loss": 0.3255, + "step": 13854 + }, + { + "epoch": 0.4, + "grad_norm": 1.2532143485128886, + "learning_rate": 6.791591256261852e-06, + "loss": 0.3297, + "step": 13855 + }, + { + "epoch": 0.4, + "grad_norm": 1.1839410970890354, + "learning_rate": 6.791152729527768e-06, + "loss": 0.3115, + "step": 13856 + }, + { + "epoch": 0.4, + "grad_norm": 1.2783350153205217, + "learning_rate": 6.790714186986745e-06, + "loss": 0.3278, + "step": 13857 + }, + { + "epoch": 0.4, + "grad_norm": 1.5409138816052872, + "learning_rate": 6.790275628642647e-06, + "loss": 0.3126, + "step": 13858 + }, + { + "epoch": 0.4, + "grad_norm": 1.3164181790043232, + "learning_rate": 6.789837054499346e-06, + "loss": 0.2981, + "step": 13859 + }, + { + "epoch": 0.4, + "grad_norm": 1.2971017977682926, + "learning_rate": 6.789398464560715e-06, + "loss": 0.3259, + "step": 13860 + }, + { + "epoch": 0.4, + "grad_norm": 1.2397373139654042, + "learning_rate": 6.788959858830621e-06, + "loss": 0.3394, + "step": 13861 + }, + { + "epoch": 0.4, + "grad_norm": 1.3731705846487259, + "learning_rate": 6.788521237312938e-06, + "loss": 0.3264, + "step": 13862 + }, + { + "epoch": 0.4, + "grad_norm": 1.2215361792725627, + "learning_rate": 6.788082600011534e-06, + "loss": 0.2944, + "step": 13863 + }, + { + "epoch": 0.4, + "grad_norm": 1.1558269311742198, + "learning_rate": 6.787643946930283e-06, + "loss": 0.3106, + "step": 13864 + }, + { + "epoch": 0.4, + "grad_norm": 0.9984116277994887, + "learning_rate": 6.787205278073052e-06, + "loss": 0.6125, + "step": 13865 + }, + { + "epoch": 0.4, + "grad_norm": 1.5585221957823552, + "learning_rate": 6.786766593443717e-06, + "loss": 0.3209, + "step": 13866 + }, + { + "epoch": 0.4, + "grad_norm": 1.2952984914354648, + "learning_rate": 6.7863278930461455e-06, + "loss": 0.3343, + "step": 13867 + }, + { + "epoch": 0.4, + "grad_norm": 5.221165134894715, + "learning_rate": 6.78588917688421e-06, + "loss": 0.3465, + "step": 13868 + }, + { + "epoch": 0.4, + "grad_norm": 1.3873107332072452, + "learning_rate": 6.785450444961783e-06, + "loss": 0.3053, + "step": 13869 + }, + { + "epoch": 0.4, + "grad_norm": 1.3580632727719613, + "learning_rate": 6.785011697282738e-06, + "loss": 0.3249, + "step": 13870 + }, + { + "epoch": 0.4, + "grad_norm": 1.2851666014710779, + "learning_rate": 6.784572933850944e-06, + "loss": 0.3209, + "step": 13871 + }, + { + "epoch": 0.4, + "grad_norm": 1.9508491592100716, + "learning_rate": 6.784134154670272e-06, + "loss": 0.3189, + "step": 13872 + }, + { + "epoch": 0.4, + "grad_norm": 1.590031471077187, + "learning_rate": 6.783695359744598e-06, + "loss": 0.3235, + "step": 13873 + }, + { + "epoch": 0.4, + "grad_norm": 1.3211373338951147, + "learning_rate": 6.783256549077792e-06, + "loss": 0.3285, + "step": 13874 + }, + { + "epoch": 0.4, + "grad_norm": 1.302566286720317, + "learning_rate": 6.782817722673727e-06, + "loss": 0.3105, + "step": 13875 + }, + { + "epoch": 0.4, + "grad_norm": 1.2865071077816657, + "learning_rate": 6.782378880536278e-06, + "loss": 0.3209, + "step": 13876 + }, + { + "epoch": 0.4, + "grad_norm": 1.2788291716545557, + "learning_rate": 6.7819400226693135e-06, + "loss": 0.3162, + "step": 13877 + }, + { + "epoch": 0.4, + "grad_norm": 1.2834355581662829, + "learning_rate": 6.781501149076709e-06, + "loss": 0.3214, + "step": 13878 + }, + { + "epoch": 0.4, + "grad_norm": 1.3127310448925056, + "learning_rate": 6.781062259762336e-06, + "loss": 0.339, + "step": 13879 + }, + { + "epoch": 0.4, + "grad_norm": 1.267787745805551, + "learning_rate": 6.780623354730072e-06, + "loss": 0.318, + "step": 13880 + }, + { + "epoch": 0.4, + "grad_norm": 1.32776234791358, + "learning_rate": 6.780184433983784e-06, + "loss": 0.3334, + "step": 13881 + }, + { + "epoch": 0.4, + "grad_norm": 1.3891420931872163, + "learning_rate": 6.77974549752735e-06, + "loss": 0.3348, + "step": 13882 + }, + { + "epoch": 0.4, + "grad_norm": 1.3638228291945609, + "learning_rate": 6.779306545364641e-06, + "loss": 0.3352, + "step": 13883 + }, + { + "epoch": 0.4, + "grad_norm": 1.350937460468643, + "learning_rate": 6.7788675774995335e-06, + "loss": 0.3153, + "step": 13884 + }, + { + "epoch": 0.4, + "grad_norm": 5.371799364373341, + "learning_rate": 6.7784285939359e-06, + "loss": 0.3259, + "step": 13885 + }, + { + "epoch": 0.4, + "grad_norm": 1.3605066493209945, + "learning_rate": 6.777989594677614e-06, + "loss": 0.3301, + "step": 13886 + }, + { + "epoch": 0.4, + "grad_norm": 1.3586751958137397, + "learning_rate": 6.777550579728549e-06, + "loss": 0.3483, + "step": 13887 + }, + { + "epoch": 0.4, + "grad_norm": 1.3400185215810505, + "learning_rate": 6.777111549092582e-06, + "loss": 0.3197, + "step": 13888 + }, + { + "epoch": 0.4, + "grad_norm": 1.2604655176384822, + "learning_rate": 6.776672502773586e-06, + "loss": 0.3177, + "step": 13889 + }, + { + "epoch": 0.4, + "grad_norm": 1.4166885972427197, + "learning_rate": 6.776233440775434e-06, + "loss": 0.3451, + "step": 13890 + }, + { + "epoch": 0.4, + "grad_norm": 1.2727859084794564, + "learning_rate": 6.775794363102002e-06, + "loss": 0.3089, + "step": 13891 + }, + { + "epoch": 0.4, + "grad_norm": 1.2118647994969727, + "learning_rate": 6.775355269757166e-06, + "loss": 0.3256, + "step": 13892 + }, + { + "epoch": 0.4, + "grad_norm": 1.3472071965958323, + "learning_rate": 6.7749161607448e-06, + "loss": 0.3247, + "step": 13893 + }, + { + "epoch": 0.4, + "grad_norm": 1.4042160547577491, + "learning_rate": 6.774477036068779e-06, + "loss": 0.3486, + "step": 13894 + }, + { + "epoch": 0.4, + "grad_norm": 1.4653948920976245, + "learning_rate": 6.774037895732979e-06, + "loss": 0.3142, + "step": 13895 + }, + { + "epoch": 0.4, + "grad_norm": 1.1583720245009763, + "learning_rate": 6.773598739741274e-06, + "loss": 0.3039, + "step": 13896 + }, + { + "epoch": 0.4, + "grad_norm": 1.4459541854181346, + "learning_rate": 6.773159568097541e-06, + "loss": 0.354, + "step": 13897 + }, + { + "epoch": 0.4, + "grad_norm": 1.5082707091576606, + "learning_rate": 6.772720380805656e-06, + "loss": 0.3351, + "step": 13898 + }, + { + "epoch": 0.4, + "grad_norm": 1.192802323106162, + "learning_rate": 6.772281177869493e-06, + "loss": 0.3177, + "step": 13899 + }, + { + "epoch": 0.4, + "grad_norm": 3.0266663084031498, + "learning_rate": 6.771841959292929e-06, + "loss": 0.3349, + "step": 13900 + }, + { + "epoch": 0.4, + "grad_norm": 1.2990580986499725, + "learning_rate": 6.77140272507984e-06, + "loss": 0.314, + "step": 13901 + }, + { + "epoch": 0.4, + "grad_norm": 1.2111734200040505, + "learning_rate": 6.770963475234101e-06, + "loss": 0.3125, + "step": 13902 + }, + { + "epoch": 0.4, + "grad_norm": 1.405492473419879, + "learning_rate": 6.770524209759593e-06, + "loss": 0.3424, + "step": 13903 + }, + { + "epoch": 0.4, + "grad_norm": 1.4972357582949634, + "learning_rate": 6.770084928660185e-06, + "loss": 0.3349, + "step": 13904 + }, + { + "epoch": 0.4, + "grad_norm": 1.3796985041664553, + "learning_rate": 6.769645631939761e-06, + "loss": 0.3206, + "step": 13905 + }, + { + "epoch": 0.4, + "grad_norm": 1.378731174621844, + "learning_rate": 6.769206319602193e-06, + "loss": 0.3556, + "step": 13906 + }, + { + "epoch": 0.4, + "grad_norm": 1.3677000733688756, + "learning_rate": 6.768766991651361e-06, + "loss": 0.3093, + "step": 13907 + }, + { + "epoch": 0.4, + "grad_norm": 1.362806097552817, + "learning_rate": 6.7683276480911395e-06, + "loss": 0.3424, + "step": 13908 + }, + { + "epoch": 0.4, + "grad_norm": 1.3596295083401608, + "learning_rate": 6.767888288925408e-06, + "loss": 0.3171, + "step": 13909 + }, + { + "epoch": 0.4, + "grad_norm": 1.2686007663256513, + "learning_rate": 6.767448914158041e-06, + "loss": 0.3163, + "step": 13910 + }, + { + "epoch": 0.4, + "grad_norm": 1.2979495810247097, + "learning_rate": 6.767009523792919e-06, + "loss": 0.3033, + "step": 13911 + }, + { + "epoch": 0.4, + "grad_norm": 1.6107172342571354, + "learning_rate": 6.76657011783392e-06, + "loss": 0.372, + "step": 13912 + }, + { + "epoch": 0.4, + "grad_norm": 1.4044264371616049, + "learning_rate": 6.766130696284918e-06, + "loss": 0.3604, + "step": 13913 + }, + { + "epoch": 0.4, + "grad_norm": 1.2833179297896058, + "learning_rate": 6.7656912591497936e-06, + "loss": 0.3176, + "step": 13914 + }, + { + "epoch": 0.4, + "grad_norm": 1.174421882227038, + "learning_rate": 6.765251806432423e-06, + "loss": 0.3107, + "step": 13915 + }, + { + "epoch": 0.4, + "grad_norm": 1.4650170934676359, + "learning_rate": 6.764812338136687e-06, + "loss": 0.3326, + "step": 13916 + }, + { + "epoch": 0.4, + "grad_norm": 1.2675782882063908, + "learning_rate": 6.764372854266463e-06, + "loss": 0.3433, + "step": 13917 + }, + { + "epoch": 0.4, + "grad_norm": 1.2985705160909515, + "learning_rate": 6.763933354825627e-06, + "loss": 0.3203, + "step": 13918 + }, + { + "epoch": 0.4, + "grad_norm": 1.2400071926761027, + "learning_rate": 6.76349383981806e-06, + "loss": 0.332, + "step": 13919 + }, + { + "epoch": 0.4, + "grad_norm": 1.2787504631507862, + "learning_rate": 6.763054309247642e-06, + "loss": 0.3242, + "step": 13920 + }, + { + "epoch": 0.4, + "grad_norm": 1.5330095863029267, + "learning_rate": 6.762614763118249e-06, + "loss": 0.3374, + "step": 13921 + }, + { + "epoch": 0.4, + "grad_norm": 1.3708189983231893, + "learning_rate": 6.762175201433762e-06, + "loss": 0.3229, + "step": 13922 + }, + { + "epoch": 0.4, + "grad_norm": 1.2311475438356005, + "learning_rate": 6.761735624198058e-06, + "loss": 0.3127, + "step": 13923 + }, + { + "epoch": 0.4, + "grad_norm": 1.4043369890205464, + "learning_rate": 6.761296031415018e-06, + "loss": 0.3308, + "step": 13924 + }, + { + "epoch": 0.4, + "grad_norm": 1.4745769497864756, + "learning_rate": 6.7608564230885225e-06, + "loss": 0.328, + "step": 13925 + }, + { + "epoch": 0.4, + "grad_norm": 1.2002790390589186, + "learning_rate": 6.760416799222448e-06, + "loss": 0.3111, + "step": 13926 + }, + { + "epoch": 0.4, + "grad_norm": 1.5372805518329518, + "learning_rate": 6.759977159820675e-06, + "loss": 0.332, + "step": 13927 + }, + { + "epoch": 0.4, + "grad_norm": 1.3505766566428985, + "learning_rate": 6.759537504887086e-06, + "loss": 0.3397, + "step": 13928 + }, + { + "epoch": 0.4, + "grad_norm": 1.3506581164519218, + "learning_rate": 6.759097834425558e-06, + "loss": 0.3174, + "step": 13929 + }, + { + "epoch": 0.4, + "grad_norm": 1.2848940063195189, + "learning_rate": 6.758658148439971e-06, + "loss": 0.3113, + "step": 13930 + }, + { + "epoch": 0.4, + "grad_norm": 1.5186312843406953, + "learning_rate": 6.758218446934208e-06, + "loss": 0.323, + "step": 13931 + }, + { + "epoch": 0.4, + "grad_norm": 1.2766268901094955, + "learning_rate": 6.757778729912148e-06, + "loss": 0.3361, + "step": 13932 + }, + { + "epoch": 0.4, + "grad_norm": 1.4974677993757721, + "learning_rate": 6.7573389973776695e-06, + "loss": 0.3296, + "step": 13933 + }, + { + "epoch": 0.4, + "grad_norm": 1.2838790282683639, + "learning_rate": 6.756899249334655e-06, + "loss": 0.3374, + "step": 13934 + }, + { + "epoch": 0.4, + "grad_norm": 1.2045164268368571, + "learning_rate": 6.756459485786987e-06, + "loss": 0.323, + "step": 13935 + }, + { + "epoch": 0.4, + "grad_norm": 1.303333579439112, + "learning_rate": 6.756019706738544e-06, + "loss": 0.3218, + "step": 13936 + }, + { + "epoch": 0.4, + "grad_norm": 1.2033780797744873, + "learning_rate": 6.755579912193207e-06, + "loss": 0.3291, + "step": 13937 + }, + { + "epoch": 0.4, + "grad_norm": 1.413468791481254, + "learning_rate": 6.755140102154855e-06, + "loss": 0.3648, + "step": 13938 + }, + { + "epoch": 0.4, + "grad_norm": 1.2755008197096032, + "learning_rate": 6.7547002766273774e-06, + "loss": 0.3398, + "step": 13939 + }, + { + "epoch": 0.4, + "grad_norm": 1.2380301105314424, + "learning_rate": 6.7542604356146465e-06, + "loss": 0.3179, + "step": 13940 + }, + { + "epoch": 0.4, + "grad_norm": 1.2921761670296472, + "learning_rate": 6.753820579120549e-06, + "loss": 0.3345, + "step": 13941 + }, + { + "epoch": 0.4, + "grad_norm": 1.5257636144568296, + "learning_rate": 6.753380707148967e-06, + "loss": 0.3314, + "step": 13942 + }, + { + "epoch": 0.4, + "grad_norm": 1.2312348484199163, + "learning_rate": 6.7529408197037795e-06, + "loss": 0.3217, + "step": 13943 + }, + { + "epoch": 0.4, + "grad_norm": 1.3580833027592083, + "learning_rate": 6.75250091678887e-06, + "loss": 0.328, + "step": 13944 + }, + { + "epoch": 0.4, + "grad_norm": 1.2327578826793684, + "learning_rate": 6.752060998408119e-06, + "loss": 0.3378, + "step": 13945 + }, + { + "epoch": 0.4, + "grad_norm": 1.2546820473182683, + "learning_rate": 6.751621064565412e-06, + "loss": 0.325, + "step": 13946 + }, + { + "epoch": 0.4, + "grad_norm": 1.2095482624715792, + "learning_rate": 6.751181115264629e-06, + "loss": 0.3234, + "step": 13947 + }, + { + "epoch": 0.4, + "grad_norm": 1.3153939889705588, + "learning_rate": 6.750741150509653e-06, + "loss": 0.3319, + "step": 13948 + }, + { + "epoch": 0.4, + "grad_norm": 1.3142772829576024, + "learning_rate": 6.750301170304369e-06, + "loss": 0.3573, + "step": 13949 + }, + { + "epoch": 0.4, + "grad_norm": 1.5355298453477686, + "learning_rate": 6.7498611746526545e-06, + "loss": 0.3533, + "step": 13950 + }, + { + "epoch": 0.4, + "grad_norm": 1.3737835833603396, + "learning_rate": 6.749421163558397e-06, + "loss": 0.3627, + "step": 13951 + }, + { + "epoch": 0.4, + "grad_norm": 1.323541137602936, + "learning_rate": 6.7489811370254785e-06, + "loss": 0.343, + "step": 13952 + }, + { + "epoch": 0.4, + "grad_norm": 1.1842165524951003, + "learning_rate": 6.748541095057783e-06, + "loss": 0.3216, + "step": 13953 + }, + { + "epoch": 0.4, + "grad_norm": 1.2528146569653944, + "learning_rate": 6.748101037659192e-06, + "loss": 0.3028, + "step": 13954 + }, + { + "epoch": 0.4, + "grad_norm": 1.2802374108951502, + "learning_rate": 6.74766096483359e-06, + "loss": 0.3359, + "step": 13955 + }, + { + "epoch": 0.4, + "grad_norm": 1.3193192846658202, + "learning_rate": 6.747220876584861e-06, + "loss": 0.3215, + "step": 13956 + }, + { + "epoch": 0.4, + "grad_norm": 1.3239138081491704, + "learning_rate": 6.746780772916887e-06, + "loss": 0.3196, + "step": 13957 + }, + { + "epoch": 0.4, + "grad_norm": 1.4211156430502214, + "learning_rate": 6.746340653833556e-06, + "loss": 0.3645, + "step": 13958 + }, + { + "epoch": 0.4, + "grad_norm": 1.3778967610941197, + "learning_rate": 6.745900519338747e-06, + "loss": 0.34, + "step": 13959 + }, + { + "epoch": 0.4, + "grad_norm": 1.2315422910267315, + "learning_rate": 6.745460369436347e-06, + "loss": 0.3196, + "step": 13960 + }, + { + "epoch": 0.4, + "grad_norm": 1.4690018869856059, + "learning_rate": 6.7450202041302404e-06, + "loss": 0.3263, + "step": 13961 + }, + { + "epoch": 0.4, + "grad_norm": 1.390286239573588, + "learning_rate": 6.744580023424312e-06, + "loss": 0.3294, + "step": 13962 + }, + { + "epoch": 0.4, + "grad_norm": 1.3398720750273534, + "learning_rate": 6.7441398273224435e-06, + "loss": 0.3334, + "step": 13963 + }, + { + "epoch": 0.41, + "grad_norm": 1.337581752903143, + "learning_rate": 6.743699615828523e-06, + "loss": 0.3121, + "step": 13964 + }, + { + "epoch": 0.41, + "grad_norm": 1.4311280638663324, + "learning_rate": 6.743259388946432e-06, + "loss": 0.3593, + "step": 13965 + }, + { + "epoch": 0.41, + "grad_norm": 1.653424623587686, + "learning_rate": 6.74281914668006e-06, + "loss": 0.3204, + "step": 13966 + }, + { + "epoch": 0.41, + "grad_norm": 1.2719304653127101, + "learning_rate": 6.74237888903329e-06, + "loss": 0.3135, + "step": 13967 + }, + { + "epoch": 0.41, + "grad_norm": 1.2006690085556628, + "learning_rate": 6.741938616010004e-06, + "loss": 0.3316, + "step": 13968 + }, + { + "epoch": 0.41, + "grad_norm": 1.3133002122968425, + "learning_rate": 6.741498327614093e-06, + "loss": 0.2955, + "step": 13969 + }, + { + "epoch": 0.41, + "grad_norm": 1.3388344521902351, + "learning_rate": 6.74105802384944e-06, + "loss": 0.3349, + "step": 13970 + }, + { + "epoch": 0.41, + "grad_norm": 1.2609488299194818, + "learning_rate": 6.74061770471993e-06, + "loss": 0.3157, + "step": 13971 + }, + { + "epoch": 0.41, + "grad_norm": 1.4097119285933504, + "learning_rate": 6.7401773702294495e-06, + "loss": 0.3269, + "step": 13972 + }, + { + "epoch": 0.41, + "grad_norm": 1.2652851368567406, + "learning_rate": 6.739737020381883e-06, + "loss": 0.3166, + "step": 13973 + }, + { + "epoch": 0.41, + "grad_norm": 1.2659542397363621, + "learning_rate": 6.7392966551811205e-06, + "loss": 0.3198, + "step": 13974 + }, + { + "epoch": 0.41, + "grad_norm": 1.226016348917496, + "learning_rate": 6.7388562746310425e-06, + "loss": 0.3251, + "step": 13975 + }, + { + "epoch": 0.41, + "grad_norm": 1.3227012007566026, + "learning_rate": 6.738415878735542e-06, + "loss": 0.3525, + "step": 13976 + }, + { + "epoch": 0.41, + "grad_norm": 1.6181412737477083, + "learning_rate": 6.7379754674985e-06, + "loss": 0.3503, + "step": 13977 + }, + { + "epoch": 0.41, + "grad_norm": 1.5916542611652649, + "learning_rate": 6.737535040923806e-06, + "loss": 0.3356, + "step": 13978 + }, + { + "epoch": 0.41, + "grad_norm": 1.1589871947538186, + "learning_rate": 6.737094599015345e-06, + "loss": 0.3096, + "step": 13979 + }, + { + "epoch": 0.41, + "grad_norm": 1.282270626607946, + "learning_rate": 6.736654141777006e-06, + "loss": 0.3468, + "step": 13980 + }, + { + "epoch": 0.41, + "grad_norm": 1.4230318107194306, + "learning_rate": 6.736213669212674e-06, + "loss": 0.3396, + "step": 13981 + }, + { + "epoch": 0.41, + "grad_norm": 1.3712543891515554, + "learning_rate": 6.735773181326238e-06, + "loss": 0.32, + "step": 13982 + }, + { + "epoch": 0.41, + "grad_norm": 1.1637862724057866, + "learning_rate": 6.735332678121585e-06, + "loss": 0.3086, + "step": 13983 + }, + { + "epoch": 0.41, + "grad_norm": 1.191690675683362, + "learning_rate": 6.734892159602601e-06, + "loss": 0.3057, + "step": 13984 + }, + { + "epoch": 0.41, + "grad_norm": 1.215129577421744, + "learning_rate": 6.734451625773174e-06, + "loss": 0.3327, + "step": 13985 + }, + { + "epoch": 0.41, + "grad_norm": 2.027044247815883, + "learning_rate": 6.7340110766371934e-06, + "loss": 0.3426, + "step": 13986 + }, + { + "epoch": 0.41, + "grad_norm": 1.7665158517105688, + "learning_rate": 6.733570512198544e-06, + "loss": 0.3317, + "step": 13987 + }, + { + "epoch": 0.41, + "grad_norm": 1.3318148158028087, + "learning_rate": 6.733129932461117e-06, + "loss": 0.3219, + "step": 13988 + }, + { + "epoch": 0.41, + "grad_norm": 1.560942751069663, + "learning_rate": 6.732689337428798e-06, + "loss": 0.343, + "step": 13989 + }, + { + "epoch": 0.41, + "grad_norm": 1.1884577850718063, + "learning_rate": 6.732248727105478e-06, + "loss": 0.3241, + "step": 13990 + }, + { + "epoch": 0.41, + "grad_norm": 1.294169701848885, + "learning_rate": 6.7318081014950435e-06, + "loss": 0.3177, + "step": 13991 + }, + { + "epoch": 0.41, + "grad_norm": 1.3197975302901945, + "learning_rate": 6.731367460601381e-06, + "loss": 0.3284, + "step": 13992 + }, + { + "epoch": 0.41, + "grad_norm": 1.5465088407618974, + "learning_rate": 6.730926804428384e-06, + "loss": 0.343, + "step": 13993 + }, + { + "epoch": 0.41, + "grad_norm": 1.2316728852394627, + "learning_rate": 6.730486132979938e-06, + "loss": 0.3067, + "step": 13994 + }, + { + "epoch": 0.41, + "grad_norm": 1.2694179490246154, + "learning_rate": 6.730045446259932e-06, + "loss": 0.2972, + "step": 13995 + }, + { + "epoch": 0.41, + "grad_norm": 1.2735383697692018, + "learning_rate": 6.729604744272256e-06, + "loss": 0.3572, + "step": 13996 + }, + { + "epoch": 0.41, + "grad_norm": 1.3736171575113494, + "learning_rate": 6.729164027020799e-06, + "loss": 0.3287, + "step": 13997 + }, + { + "epoch": 0.41, + "grad_norm": 1.7030078574825789, + "learning_rate": 6.728723294509452e-06, + "loss": 0.3233, + "step": 13998 + }, + { + "epoch": 0.41, + "grad_norm": 1.319728048681391, + "learning_rate": 6.728282546742102e-06, + "loss": 0.3338, + "step": 13999 + }, + { + "epoch": 0.41, + "grad_norm": 1.2739487649704837, + "learning_rate": 6.7278417837226375e-06, + "loss": 0.3255, + "step": 14000 + }, + { + "epoch": 0.41, + "grad_norm": 1.2596728593036837, + "learning_rate": 6.727401005454951e-06, + "loss": 0.3228, + "step": 14001 + }, + { + "epoch": 0.41, + "grad_norm": 1.2673275149611285, + "learning_rate": 6.726960211942931e-06, + "loss": 0.3232, + "step": 14002 + }, + { + "epoch": 0.41, + "grad_norm": 0.9534881645258617, + "learning_rate": 6.726519403190469e-06, + "loss": 0.6005, + "step": 14003 + }, + { + "epoch": 0.41, + "grad_norm": 1.4080827001887768, + "learning_rate": 6.726078579201453e-06, + "loss": 0.3111, + "step": 14004 + }, + { + "epoch": 0.41, + "grad_norm": 1.366622977082082, + "learning_rate": 6.725637739979776e-06, + "loss": 0.3192, + "step": 14005 + }, + { + "epoch": 0.41, + "grad_norm": 1.4560848320021729, + "learning_rate": 6.725196885529326e-06, + "loss": 0.3137, + "step": 14006 + }, + { + "epoch": 0.41, + "grad_norm": 1.3509021398760828, + "learning_rate": 6.724756015853994e-06, + "loss": 0.3232, + "step": 14007 + }, + { + "epoch": 0.41, + "grad_norm": 1.3553371224408581, + "learning_rate": 6.724315130957671e-06, + "loss": 0.3149, + "step": 14008 + }, + { + "epoch": 0.41, + "grad_norm": 0.9210717788450895, + "learning_rate": 6.723874230844249e-06, + "loss": 0.6303, + "step": 14009 + }, + { + "epoch": 0.41, + "grad_norm": 1.509305517118, + "learning_rate": 6.723433315517616e-06, + "loss": 0.342, + "step": 14010 + }, + { + "epoch": 0.41, + "grad_norm": 1.2964873578635498, + "learning_rate": 6.722992384981666e-06, + "loss": 0.3309, + "step": 14011 + }, + { + "epoch": 0.41, + "grad_norm": 1.3203028910400196, + "learning_rate": 6.722551439240289e-06, + "loss": 0.3125, + "step": 14012 + }, + { + "epoch": 0.41, + "grad_norm": 1.3048194050682858, + "learning_rate": 6.722110478297378e-06, + "loss": 0.3398, + "step": 14013 + }, + { + "epoch": 0.41, + "grad_norm": 1.2586595879663345, + "learning_rate": 6.72166950215682e-06, + "loss": 0.3149, + "step": 14014 + }, + { + "epoch": 0.41, + "grad_norm": 1.2167827005530292, + "learning_rate": 6.721228510822511e-06, + "loss": 0.3367, + "step": 14015 + }, + { + "epoch": 0.41, + "grad_norm": 1.4430785653828508, + "learning_rate": 6.720787504298341e-06, + "loss": 0.3533, + "step": 14016 + }, + { + "epoch": 0.41, + "grad_norm": 1.3345459217263187, + "learning_rate": 6.720346482588202e-06, + "loss": 0.3332, + "step": 14017 + }, + { + "epoch": 0.41, + "grad_norm": 1.2810488079291271, + "learning_rate": 6.719905445695987e-06, + "loss": 0.3139, + "step": 14018 + }, + { + "epoch": 0.41, + "grad_norm": 1.2607728869431374, + "learning_rate": 6.719464393625587e-06, + "loss": 0.3339, + "step": 14019 + }, + { + "epoch": 0.41, + "grad_norm": 1.359695690559472, + "learning_rate": 6.7190233263808934e-06, + "loss": 0.3311, + "step": 14020 + }, + { + "epoch": 0.41, + "grad_norm": 1.3910777597520008, + "learning_rate": 6.718582243965801e-06, + "loss": 0.3174, + "step": 14021 + }, + { + "epoch": 0.41, + "grad_norm": 1.3018282190065995, + "learning_rate": 6.718141146384203e-06, + "loss": 0.329, + "step": 14022 + }, + { + "epoch": 0.41, + "grad_norm": 1.2994148325004304, + "learning_rate": 6.717700033639988e-06, + "loss": 0.3194, + "step": 14023 + }, + { + "epoch": 0.41, + "grad_norm": 1.2577694317815646, + "learning_rate": 6.717258905737051e-06, + "loss": 0.3169, + "step": 14024 + }, + { + "epoch": 0.41, + "grad_norm": 1.3372131459397172, + "learning_rate": 6.716817762679285e-06, + "loss": 0.3324, + "step": 14025 + }, + { + "epoch": 0.41, + "grad_norm": 1.6413342795751098, + "learning_rate": 6.716376604470584e-06, + "loss": 0.3191, + "step": 14026 + }, + { + "epoch": 0.41, + "grad_norm": 1.28853504554195, + "learning_rate": 6.715935431114841e-06, + "loss": 0.3299, + "step": 14027 + }, + { + "epoch": 0.41, + "grad_norm": 1.2473455610415987, + "learning_rate": 6.71549424261595e-06, + "loss": 0.3189, + "step": 14028 + }, + { + "epoch": 0.41, + "grad_norm": 1.2257609179839792, + "learning_rate": 6.7150530389778e-06, + "loss": 0.3191, + "step": 14029 + }, + { + "epoch": 0.41, + "grad_norm": 1.373250194749538, + "learning_rate": 6.71461182020429e-06, + "loss": 0.3336, + "step": 14030 + }, + { + "epoch": 0.41, + "grad_norm": 1.3113047994638665, + "learning_rate": 6.714170586299311e-06, + "loss": 0.3279, + "step": 14031 + }, + { + "epoch": 0.41, + "grad_norm": 1.2656242933753523, + "learning_rate": 6.7137293372667576e-06, + "loss": 0.3034, + "step": 14032 + }, + { + "epoch": 0.41, + "grad_norm": 1.2660631177413137, + "learning_rate": 6.713288073110523e-06, + "loss": 0.3177, + "step": 14033 + }, + { + "epoch": 0.41, + "grad_norm": 1.2993701550224437, + "learning_rate": 6.712846793834503e-06, + "loss": 0.3164, + "step": 14034 + }, + { + "epoch": 0.41, + "grad_norm": 1.3468729715768624, + "learning_rate": 6.712405499442592e-06, + "loss": 0.3575, + "step": 14035 + }, + { + "epoch": 0.41, + "grad_norm": 1.4435917507526552, + "learning_rate": 6.711964189938683e-06, + "loss": 0.3345, + "step": 14036 + }, + { + "epoch": 0.41, + "grad_norm": 0.9556403755993481, + "learning_rate": 6.7115228653266695e-06, + "loss": 0.6367, + "step": 14037 + }, + { + "epoch": 0.41, + "grad_norm": 1.3713885166804893, + "learning_rate": 6.711081525610448e-06, + "loss": 0.3416, + "step": 14038 + }, + { + "epoch": 0.41, + "grad_norm": 1.5328648381385361, + "learning_rate": 6.710640170793914e-06, + "loss": 0.3171, + "step": 14039 + }, + { + "epoch": 0.41, + "grad_norm": 1.576835161317724, + "learning_rate": 6.710198800880963e-06, + "loss": 0.3253, + "step": 14040 + }, + { + "epoch": 0.41, + "grad_norm": 1.3003412831232042, + "learning_rate": 6.709757415875486e-06, + "loss": 0.3404, + "step": 14041 + }, + { + "epoch": 0.41, + "grad_norm": 1.4286659156037729, + "learning_rate": 6.709316015781382e-06, + "loss": 0.3358, + "step": 14042 + }, + { + "epoch": 0.41, + "grad_norm": 1.3588178838309242, + "learning_rate": 6.708874600602544e-06, + "loss": 0.3382, + "step": 14043 + }, + { + "epoch": 0.41, + "grad_norm": 1.24982659136008, + "learning_rate": 6.7084331703428695e-06, + "loss": 0.3197, + "step": 14044 + }, + { + "epoch": 0.41, + "grad_norm": 1.4620701156139146, + "learning_rate": 6.707991725006254e-06, + "loss": 0.3328, + "step": 14045 + }, + { + "epoch": 0.41, + "grad_norm": 1.3892827222525768, + "learning_rate": 6.7075502645965915e-06, + "loss": 0.3411, + "step": 14046 + }, + { + "epoch": 0.41, + "grad_norm": 1.4583662089182585, + "learning_rate": 6.70710878911778e-06, + "loss": 0.3266, + "step": 14047 + }, + { + "epoch": 0.41, + "grad_norm": 0.9087349227208964, + "learning_rate": 6.7066672985737135e-06, + "loss": 0.6006, + "step": 14048 + }, + { + "epoch": 0.41, + "grad_norm": 1.537359390799506, + "learning_rate": 6.706225792968291e-06, + "loss": 0.3287, + "step": 14049 + }, + { + "epoch": 0.41, + "grad_norm": 1.4542853995044105, + "learning_rate": 6.7057842723054044e-06, + "loss": 0.3607, + "step": 14050 + }, + { + "epoch": 0.41, + "grad_norm": 1.301033319567803, + "learning_rate": 6.705342736588955e-06, + "loss": 0.3347, + "step": 14051 + }, + { + "epoch": 0.41, + "grad_norm": 0.9814111659193374, + "learning_rate": 6.704901185822834e-06, + "loss": 0.5876, + "step": 14052 + }, + { + "epoch": 0.41, + "grad_norm": 1.2500980803561528, + "learning_rate": 6.704459620010945e-06, + "loss": 0.3122, + "step": 14053 + }, + { + "epoch": 0.41, + "grad_norm": 0.9465488844634016, + "learning_rate": 6.704018039157179e-06, + "loss": 0.6607, + "step": 14054 + }, + { + "epoch": 0.41, + "grad_norm": 1.1949614697444242, + "learning_rate": 6.7035764432654346e-06, + "loss": 0.3172, + "step": 14055 + }, + { + "epoch": 0.41, + "grad_norm": 1.2497323050133278, + "learning_rate": 6.70313483233961e-06, + "loss": 0.3037, + "step": 14056 + }, + { + "epoch": 0.41, + "grad_norm": 1.2730385536946192, + "learning_rate": 6.7026932063836e-06, + "loss": 0.3113, + "step": 14057 + }, + { + "epoch": 0.41, + "grad_norm": 1.365267580915418, + "learning_rate": 6.702251565401305e-06, + "loss": 0.3297, + "step": 14058 + }, + { + "epoch": 0.41, + "grad_norm": 2.351021887623404, + "learning_rate": 6.701809909396622e-06, + "loss": 0.3481, + "step": 14059 + }, + { + "epoch": 0.41, + "grad_norm": 1.5596899729067626, + "learning_rate": 6.701368238373444e-06, + "loss": 0.3575, + "step": 14060 + }, + { + "epoch": 0.41, + "grad_norm": 2.133199156575233, + "learning_rate": 6.700926552335676e-06, + "loss": 0.3288, + "step": 14061 + }, + { + "epoch": 0.41, + "grad_norm": 1.6172493630043867, + "learning_rate": 6.70048485128721e-06, + "loss": 0.3444, + "step": 14062 + }, + { + "epoch": 0.41, + "grad_norm": 2.0835026377442856, + "learning_rate": 6.700043135231949e-06, + "loss": 0.3262, + "step": 14063 + }, + { + "epoch": 0.41, + "grad_norm": 1.2670512445368547, + "learning_rate": 6.6996014041737855e-06, + "loss": 0.3225, + "step": 14064 + }, + { + "epoch": 0.41, + "grad_norm": 1.3887310093173777, + "learning_rate": 6.699159658116623e-06, + "loss": 0.3305, + "step": 14065 + }, + { + "epoch": 0.41, + "grad_norm": 1.3367254982094818, + "learning_rate": 6.698717897064357e-06, + "loss": 0.33, + "step": 14066 + }, + { + "epoch": 0.41, + "grad_norm": 1.7209528637201048, + "learning_rate": 6.698276121020886e-06, + "loss": 0.3794, + "step": 14067 + }, + { + "epoch": 0.41, + "grad_norm": 1.4039667275869934, + "learning_rate": 6.697834329990112e-06, + "loss": 0.3402, + "step": 14068 + }, + { + "epoch": 0.41, + "grad_norm": 2.0878901979082647, + "learning_rate": 6.697392523975927e-06, + "loss": 0.3341, + "step": 14069 + }, + { + "epoch": 0.41, + "grad_norm": 1.410935751613692, + "learning_rate": 6.696950702982237e-06, + "loss": 0.3375, + "step": 14070 + }, + { + "epoch": 0.41, + "grad_norm": 1.2118038009910772, + "learning_rate": 6.696508867012936e-06, + "loss": 0.3307, + "step": 14071 + }, + { + "epoch": 0.41, + "grad_norm": 1.305675925436131, + "learning_rate": 6.696067016071929e-06, + "loss": 0.3332, + "step": 14072 + }, + { + "epoch": 0.41, + "grad_norm": 1.297010799103168, + "learning_rate": 6.69562515016311e-06, + "loss": 0.3172, + "step": 14073 + }, + { + "epoch": 0.41, + "grad_norm": 1.3305074865517035, + "learning_rate": 6.695183269290378e-06, + "loss": 0.333, + "step": 14074 + }, + { + "epoch": 0.41, + "grad_norm": 1.6012558613773185, + "learning_rate": 6.694741373457637e-06, + "loss": 0.3393, + "step": 14075 + }, + { + "epoch": 0.41, + "grad_norm": 1.2698447840625742, + "learning_rate": 6.694299462668785e-06, + "loss": 0.3272, + "step": 14076 + }, + { + "epoch": 0.41, + "grad_norm": 1.3491852658315193, + "learning_rate": 6.693857536927722e-06, + "loss": 0.3145, + "step": 14077 + }, + { + "epoch": 0.41, + "grad_norm": 1.304232508580911, + "learning_rate": 6.693415596238345e-06, + "loss": 0.3564, + "step": 14078 + }, + { + "epoch": 0.41, + "grad_norm": 1.2644188730056545, + "learning_rate": 6.692973640604557e-06, + "loss": 0.3315, + "step": 14079 + }, + { + "epoch": 0.41, + "grad_norm": 1.3694026550375418, + "learning_rate": 6.692531670030259e-06, + "loss": 0.3402, + "step": 14080 + }, + { + "epoch": 0.41, + "grad_norm": 1.5316448687751631, + "learning_rate": 6.692089684519351e-06, + "loss": 0.3135, + "step": 14081 + }, + { + "epoch": 0.41, + "grad_norm": 1.3158755168116834, + "learning_rate": 6.691647684075732e-06, + "loss": 0.3284, + "step": 14082 + }, + { + "epoch": 0.41, + "grad_norm": 0.9759153076297964, + "learning_rate": 6.691205668703302e-06, + "loss": 0.6069, + "step": 14083 + }, + { + "epoch": 0.41, + "grad_norm": 1.4445863018191751, + "learning_rate": 6.690763638405965e-06, + "loss": 0.325, + "step": 14084 + }, + { + "epoch": 0.41, + "grad_norm": 1.5051031328455293, + "learning_rate": 6.690321593187619e-06, + "loss": 0.333, + "step": 14085 + }, + { + "epoch": 0.41, + "grad_norm": 1.6312243704320284, + "learning_rate": 6.689879533052167e-06, + "loss": 0.3254, + "step": 14086 + }, + { + "epoch": 0.41, + "grad_norm": 1.2321176080081615, + "learning_rate": 6.689437458003509e-06, + "loss": 0.3016, + "step": 14087 + }, + { + "epoch": 0.41, + "grad_norm": 1.2619875228298565, + "learning_rate": 6.688995368045546e-06, + "loss": 0.3551, + "step": 14088 + }, + { + "epoch": 0.41, + "grad_norm": 1.2009139213352698, + "learning_rate": 6.68855326318218e-06, + "loss": 0.3128, + "step": 14089 + }, + { + "epoch": 0.41, + "grad_norm": 1.212760128789386, + "learning_rate": 6.688111143417315e-06, + "loss": 0.3267, + "step": 14090 + }, + { + "epoch": 0.41, + "grad_norm": 2.068150521738996, + "learning_rate": 6.687669008754849e-06, + "loss": 0.3674, + "step": 14091 + }, + { + "epoch": 0.41, + "grad_norm": 1.2144512080638203, + "learning_rate": 6.6872268591986845e-06, + "loss": 0.3087, + "step": 14092 + }, + { + "epoch": 0.41, + "grad_norm": 1.4354932325596548, + "learning_rate": 6.686784694752725e-06, + "loss": 0.3644, + "step": 14093 + }, + { + "epoch": 0.41, + "grad_norm": 1.3479559017106517, + "learning_rate": 6.686342515420873e-06, + "loss": 0.3525, + "step": 14094 + }, + { + "epoch": 0.41, + "grad_norm": 1.2925738976084726, + "learning_rate": 6.685900321207027e-06, + "loss": 0.3496, + "step": 14095 + }, + { + "epoch": 0.41, + "grad_norm": 1.3772186552998504, + "learning_rate": 6.685458112115095e-06, + "loss": 0.3342, + "step": 14096 + }, + { + "epoch": 0.41, + "grad_norm": 1.2530321550605859, + "learning_rate": 6.685015888148975e-06, + "loss": 0.3136, + "step": 14097 + }, + { + "epoch": 0.41, + "grad_norm": 1.337720525492816, + "learning_rate": 6.68457364931257e-06, + "loss": 0.3185, + "step": 14098 + }, + { + "epoch": 0.41, + "grad_norm": 1.2766632208802426, + "learning_rate": 6.684131395609784e-06, + "loss": 0.3136, + "step": 14099 + }, + { + "epoch": 0.41, + "grad_norm": 1.3508149238738878, + "learning_rate": 6.683689127044522e-06, + "loss": 0.3085, + "step": 14100 + }, + { + "epoch": 0.41, + "grad_norm": 1.6355690346685605, + "learning_rate": 6.683246843620683e-06, + "loss": 0.3381, + "step": 14101 + }, + { + "epoch": 0.41, + "grad_norm": 1.3761360427162717, + "learning_rate": 6.682804545342172e-06, + "loss": 0.3091, + "step": 14102 + }, + { + "epoch": 0.41, + "grad_norm": 1.5161228035076868, + "learning_rate": 6.682362232212892e-06, + "loss": 0.3207, + "step": 14103 + }, + { + "epoch": 0.41, + "grad_norm": 1.237242236441166, + "learning_rate": 6.681919904236748e-06, + "loss": 0.3193, + "step": 14104 + }, + { + "epoch": 0.41, + "grad_norm": 1.2304651838342655, + "learning_rate": 6.681477561417641e-06, + "loss": 0.3192, + "step": 14105 + }, + { + "epoch": 0.41, + "grad_norm": 1.2670610561695153, + "learning_rate": 6.681035203759476e-06, + "loss": 0.3143, + "step": 14106 + }, + { + "epoch": 0.41, + "grad_norm": 1.4483488122479105, + "learning_rate": 6.680592831266157e-06, + "loss": 0.32, + "step": 14107 + }, + { + "epoch": 0.41, + "grad_norm": 2.1887531827930498, + "learning_rate": 6.680150443941588e-06, + "loss": 0.3208, + "step": 14108 + }, + { + "epoch": 0.41, + "grad_norm": 1.3261030960027675, + "learning_rate": 6.679708041789672e-06, + "loss": 0.323, + "step": 14109 + }, + { + "epoch": 0.41, + "grad_norm": 1.4083387788783204, + "learning_rate": 6.679265624814315e-06, + "loss": 0.318, + "step": 14110 + }, + { + "epoch": 0.41, + "grad_norm": 2.4220931472145932, + "learning_rate": 6.678823193019419e-06, + "loss": 0.3239, + "step": 14111 + }, + { + "epoch": 0.41, + "grad_norm": 1.3078872978751, + "learning_rate": 6.678380746408889e-06, + "loss": 0.3144, + "step": 14112 + }, + { + "epoch": 0.41, + "grad_norm": 1.2495389993933284, + "learning_rate": 6.677938284986633e-06, + "loss": 0.3397, + "step": 14113 + }, + { + "epoch": 0.41, + "grad_norm": 1.2406272940155298, + "learning_rate": 6.67749580875655e-06, + "loss": 0.3027, + "step": 14114 + }, + { + "epoch": 0.41, + "grad_norm": 1.338808429306711, + "learning_rate": 6.6770533177225495e-06, + "loss": 0.3128, + "step": 14115 + }, + { + "epoch": 0.41, + "grad_norm": 2.3312066981354893, + "learning_rate": 6.676610811888534e-06, + "loss": 0.3239, + "step": 14116 + }, + { + "epoch": 0.41, + "grad_norm": 1.2038678439353627, + "learning_rate": 6.67616829125841e-06, + "loss": 0.3211, + "step": 14117 + }, + { + "epoch": 0.41, + "grad_norm": 1.2514908689176978, + "learning_rate": 6.675725755836083e-06, + "loss": 0.3247, + "step": 14118 + }, + { + "epoch": 0.41, + "grad_norm": 1.197475737499131, + "learning_rate": 6.675283205625457e-06, + "loss": 0.3025, + "step": 14119 + }, + { + "epoch": 0.41, + "grad_norm": 1.435177220297325, + "learning_rate": 6.674840640630438e-06, + "loss": 0.3168, + "step": 14120 + }, + { + "epoch": 0.41, + "grad_norm": 1.24858211199899, + "learning_rate": 6.674398060854931e-06, + "loss": 0.312, + "step": 14121 + }, + { + "epoch": 0.41, + "grad_norm": 1.8030314047318692, + "learning_rate": 6.673955466302844e-06, + "loss": 0.3194, + "step": 14122 + }, + { + "epoch": 0.41, + "grad_norm": 1.3417488717961148, + "learning_rate": 6.6735128569780806e-06, + "loss": 0.3796, + "step": 14123 + }, + { + "epoch": 0.41, + "grad_norm": 1.2735247743764093, + "learning_rate": 6.673070232884549e-06, + "loss": 0.3176, + "step": 14124 + }, + { + "epoch": 0.41, + "grad_norm": 1.2280924267372777, + "learning_rate": 6.672627594026152e-06, + "loss": 0.3246, + "step": 14125 + }, + { + "epoch": 0.41, + "grad_norm": 2.2323813856990684, + "learning_rate": 6.672184940406799e-06, + "loss": 0.303, + "step": 14126 + }, + { + "epoch": 0.41, + "grad_norm": 1.331123291074458, + "learning_rate": 6.671742272030395e-06, + "loss": 0.3279, + "step": 14127 + }, + { + "epoch": 0.41, + "grad_norm": 1.4173944273586834, + "learning_rate": 6.671299588900847e-06, + "loss": 0.3456, + "step": 14128 + }, + { + "epoch": 0.41, + "grad_norm": 1.2434667815905038, + "learning_rate": 6.670856891022061e-06, + "loss": 0.3048, + "step": 14129 + }, + { + "epoch": 0.41, + "grad_norm": 1.39093734499072, + "learning_rate": 6.670414178397946e-06, + "loss": 0.3663, + "step": 14130 + }, + { + "epoch": 0.41, + "grad_norm": 1.4098013451160563, + "learning_rate": 6.669971451032406e-06, + "loss": 0.342, + "step": 14131 + }, + { + "epoch": 0.41, + "grad_norm": 1.4115509428811834, + "learning_rate": 6.66952870892935e-06, + "loss": 0.3394, + "step": 14132 + }, + { + "epoch": 0.41, + "grad_norm": 1.2660961402287731, + "learning_rate": 6.669085952092684e-06, + "loss": 0.327, + "step": 14133 + }, + { + "epoch": 0.41, + "grad_norm": 1.3399321102994841, + "learning_rate": 6.668643180526316e-06, + "loss": 0.3393, + "step": 14134 + }, + { + "epoch": 0.41, + "grad_norm": 1.6846845677243178, + "learning_rate": 6.668200394234154e-06, + "loss": 0.3305, + "step": 14135 + }, + { + "epoch": 0.41, + "grad_norm": 1.4120788392277694, + "learning_rate": 6.6677575932201055e-06, + "loss": 0.3274, + "step": 14136 + }, + { + "epoch": 0.41, + "grad_norm": 1.3315385307432828, + "learning_rate": 6.667314777488077e-06, + "loss": 0.3155, + "step": 14137 + }, + { + "epoch": 0.41, + "grad_norm": 1.4586072995229538, + "learning_rate": 6.666871947041978e-06, + "loss": 0.316, + "step": 14138 + }, + { + "epoch": 0.41, + "grad_norm": 1.561280358795157, + "learning_rate": 6.666429101885714e-06, + "loss": 0.3203, + "step": 14139 + }, + { + "epoch": 0.41, + "grad_norm": 1.289340728929407, + "learning_rate": 6.665986242023196e-06, + "loss": 0.2998, + "step": 14140 + }, + { + "epoch": 0.41, + "grad_norm": 1.317783925904263, + "learning_rate": 6.665543367458331e-06, + "loss": 0.3296, + "step": 14141 + }, + { + "epoch": 0.41, + "grad_norm": 1.297631688240405, + "learning_rate": 6.665100478195025e-06, + "loss": 0.3105, + "step": 14142 + }, + { + "epoch": 0.41, + "grad_norm": 1.900611248228855, + "learning_rate": 6.664657574237191e-06, + "loss": 0.3368, + "step": 14143 + }, + { + "epoch": 0.41, + "grad_norm": 1.2243852117636325, + "learning_rate": 6.6642146555887345e-06, + "loss": 0.3341, + "step": 14144 + }, + { + "epoch": 0.41, + "grad_norm": 1.0221122247558272, + "learning_rate": 6.663771722253567e-06, + "loss": 0.6338, + "step": 14145 + }, + { + "epoch": 0.41, + "grad_norm": 0.9945109926389069, + "learning_rate": 6.6633287742355935e-06, + "loss": 0.5933, + "step": 14146 + }, + { + "epoch": 0.41, + "grad_norm": 1.3170160503406931, + "learning_rate": 6.662885811538725e-06, + "loss": 0.3154, + "step": 14147 + }, + { + "epoch": 0.41, + "grad_norm": 1.5267995204693479, + "learning_rate": 6.6624428341668715e-06, + "loss": 0.3605, + "step": 14148 + }, + { + "epoch": 0.41, + "grad_norm": 1.8317462105510507, + "learning_rate": 6.661999842123943e-06, + "loss": 0.3478, + "step": 14149 + }, + { + "epoch": 0.41, + "grad_norm": 1.9728091896780786, + "learning_rate": 6.661556835413846e-06, + "loss": 0.3348, + "step": 14150 + }, + { + "epoch": 0.41, + "grad_norm": 1.263841958596659, + "learning_rate": 6.661113814040491e-06, + "loss": 0.3057, + "step": 14151 + }, + { + "epoch": 0.41, + "grad_norm": 1.3159354054536223, + "learning_rate": 6.6606707780077894e-06, + "loss": 0.3274, + "step": 14152 + }, + { + "epoch": 0.41, + "grad_norm": 3.152640457759312, + "learning_rate": 6.6602277273196496e-06, + "loss": 0.3397, + "step": 14153 + }, + { + "epoch": 0.41, + "grad_norm": 1.4447671480028683, + "learning_rate": 6.65978466197998e-06, + "loss": 0.3245, + "step": 14154 + }, + { + "epoch": 0.41, + "grad_norm": 2.3495615493654842, + "learning_rate": 6.6593415819926945e-06, + "loss": 0.3041, + "step": 14155 + }, + { + "epoch": 0.41, + "grad_norm": 1.4314143615446082, + "learning_rate": 6.6588984873617e-06, + "loss": 0.3385, + "step": 14156 + }, + { + "epoch": 0.41, + "grad_norm": 1.3265954754534461, + "learning_rate": 6.658455378090908e-06, + "loss": 0.3414, + "step": 14157 + }, + { + "epoch": 0.41, + "grad_norm": 1.2234547016806736, + "learning_rate": 6.658012254184228e-06, + "loss": 0.3105, + "step": 14158 + }, + { + "epoch": 0.41, + "grad_norm": 2.0680080017873887, + "learning_rate": 6.657569115645573e-06, + "loss": 0.3176, + "step": 14159 + }, + { + "epoch": 0.41, + "grad_norm": 1.4770843872595192, + "learning_rate": 6.657125962478852e-06, + "loss": 0.3407, + "step": 14160 + }, + { + "epoch": 0.41, + "grad_norm": 1.238266873283718, + "learning_rate": 6.656682794687976e-06, + "loss": 0.3086, + "step": 14161 + }, + { + "epoch": 0.41, + "grad_norm": 1.3642319840031014, + "learning_rate": 6.6562396122768544e-06, + "loss": 0.3314, + "step": 14162 + }, + { + "epoch": 0.41, + "grad_norm": 1.561096423836218, + "learning_rate": 6.655796415249402e-06, + "loss": 0.3126, + "step": 14163 + }, + { + "epoch": 0.41, + "grad_norm": 1.3478286451085848, + "learning_rate": 6.655353203609527e-06, + "loss": 0.3178, + "step": 14164 + }, + { + "epoch": 0.41, + "grad_norm": 1.2277688763942083, + "learning_rate": 6.6549099773611415e-06, + "loss": 0.3167, + "step": 14165 + }, + { + "epoch": 0.41, + "grad_norm": 1.508951368882355, + "learning_rate": 6.654466736508157e-06, + "loss": 0.3411, + "step": 14166 + }, + { + "epoch": 0.41, + "grad_norm": 1.2208217662071834, + "learning_rate": 6.6540234810544855e-06, + "loss": 0.3222, + "step": 14167 + }, + { + "epoch": 0.41, + "grad_norm": 1.3211335833328266, + "learning_rate": 6.653580211004039e-06, + "loss": 0.3407, + "step": 14168 + }, + { + "epoch": 0.41, + "grad_norm": 1.4224096478982295, + "learning_rate": 6.653136926360728e-06, + "loss": 0.3081, + "step": 14169 + }, + { + "epoch": 0.41, + "grad_norm": 1.2734058485789743, + "learning_rate": 6.652693627128465e-06, + "loss": 0.3233, + "step": 14170 + }, + { + "epoch": 0.41, + "grad_norm": 1.2720756447614976, + "learning_rate": 6.652250313311163e-06, + "loss": 0.3172, + "step": 14171 + }, + { + "epoch": 0.41, + "grad_norm": 1.5551004916613118, + "learning_rate": 6.651806984912732e-06, + "loss": 0.3474, + "step": 14172 + }, + { + "epoch": 0.41, + "grad_norm": 1.4755112982311214, + "learning_rate": 6.651363641937089e-06, + "loss": 0.3304, + "step": 14173 + }, + { + "epoch": 0.41, + "grad_norm": 1.203855804968325, + "learning_rate": 6.6509202843881414e-06, + "loss": 0.3136, + "step": 14174 + }, + { + "epoch": 0.41, + "grad_norm": 1.2515305092530888, + "learning_rate": 6.650476912269804e-06, + "loss": 0.3364, + "step": 14175 + }, + { + "epoch": 0.41, + "grad_norm": 1.28042644180458, + "learning_rate": 6.65003352558599e-06, + "loss": 0.3087, + "step": 14176 + }, + { + "epoch": 0.41, + "grad_norm": 1.2710070928681219, + "learning_rate": 6.649590124340612e-06, + "loss": 0.2973, + "step": 14177 + }, + { + "epoch": 0.41, + "grad_norm": 1.3721235731929773, + "learning_rate": 6.649146708537584e-06, + "loss": 0.3422, + "step": 14178 + }, + { + "epoch": 0.41, + "grad_norm": 1.5230599055159313, + "learning_rate": 6.648703278180816e-06, + "loss": 0.3327, + "step": 14179 + }, + { + "epoch": 0.41, + "grad_norm": 1.2926991406861958, + "learning_rate": 6.648259833274224e-06, + "loss": 0.3626, + "step": 14180 + }, + { + "epoch": 0.41, + "grad_norm": 1.3114792511828302, + "learning_rate": 6.64781637382172e-06, + "loss": 0.3409, + "step": 14181 + }, + { + "epoch": 0.41, + "grad_norm": 1.2515690637171464, + "learning_rate": 6.64737289982722e-06, + "loss": 0.2967, + "step": 14182 + }, + { + "epoch": 0.41, + "grad_norm": 1.3502270516110217, + "learning_rate": 6.646929411294636e-06, + "loss": 0.3115, + "step": 14183 + }, + { + "epoch": 0.41, + "grad_norm": 1.1827311928086301, + "learning_rate": 6.64648590822788e-06, + "loss": 0.3284, + "step": 14184 + }, + { + "epoch": 0.41, + "grad_norm": 1.3712316157223368, + "learning_rate": 6.646042390630867e-06, + "loss": 0.3064, + "step": 14185 + }, + { + "epoch": 0.41, + "grad_norm": 1.4735828469907883, + "learning_rate": 6.645598858507514e-06, + "loss": 0.3297, + "step": 14186 + }, + { + "epoch": 0.41, + "grad_norm": 1.2606728535144407, + "learning_rate": 6.64515531186173e-06, + "loss": 0.3137, + "step": 14187 + }, + { + "epoch": 0.41, + "grad_norm": 1.301773113729369, + "learning_rate": 6.644711750697435e-06, + "loss": 0.3208, + "step": 14188 + }, + { + "epoch": 0.41, + "grad_norm": 1.5726402251765663, + "learning_rate": 6.644268175018538e-06, + "loss": 0.3167, + "step": 14189 + }, + { + "epoch": 0.41, + "grad_norm": 1.2590747559028679, + "learning_rate": 6.643824584828957e-06, + "loss": 0.3621, + "step": 14190 + }, + { + "epoch": 0.41, + "grad_norm": 1.374977879695618, + "learning_rate": 6.643380980132608e-06, + "loss": 0.3688, + "step": 14191 + }, + { + "epoch": 0.41, + "grad_norm": 1.233437186993263, + "learning_rate": 6.642937360933401e-06, + "loss": 0.3082, + "step": 14192 + }, + { + "epoch": 0.41, + "grad_norm": 1.4599755540388828, + "learning_rate": 6.642493727235254e-06, + "loss": 0.3334, + "step": 14193 + }, + { + "epoch": 0.41, + "grad_norm": 1.368297865238512, + "learning_rate": 6.642050079042082e-06, + "loss": 0.3207, + "step": 14194 + }, + { + "epoch": 0.41, + "grad_norm": 1.3508419585456495, + "learning_rate": 6.6416064163578e-06, + "loss": 0.3199, + "step": 14195 + }, + { + "epoch": 0.41, + "grad_norm": 1.1965480404703932, + "learning_rate": 6.641162739186324e-06, + "loss": 0.3178, + "step": 14196 + }, + { + "epoch": 0.41, + "grad_norm": 1.2654941151232346, + "learning_rate": 6.640719047531567e-06, + "loss": 0.3374, + "step": 14197 + }, + { + "epoch": 0.41, + "grad_norm": 1.1419318293642648, + "learning_rate": 6.640275341397447e-06, + "loss": 0.3087, + "step": 14198 + }, + { + "epoch": 0.41, + "grad_norm": 1.542457610466536, + "learning_rate": 6.639831620787879e-06, + "loss": 0.3108, + "step": 14199 + }, + { + "epoch": 0.41, + "grad_norm": 1.3518481555697888, + "learning_rate": 6.639387885706778e-06, + "loss": 0.3175, + "step": 14200 + }, + { + "epoch": 0.41, + "grad_norm": 1.3655729124246716, + "learning_rate": 6.638944136158062e-06, + "loss": 0.3225, + "step": 14201 + }, + { + "epoch": 0.41, + "grad_norm": 1.2360729370878503, + "learning_rate": 6.638500372145644e-06, + "loss": 0.3373, + "step": 14202 + }, + { + "epoch": 0.41, + "grad_norm": 1.2495822277512707, + "learning_rate": 6.638056593673443e-06, + "loss": 0.3228, + "step": 14203 + }, + { + "epoch": 0.41, + "grad_norm": 1.4148078409236213, + "learning_rate": 6.637612800745376e-06, + "loss": 0.3157, + "step": 14204 + }, + { + "epoch": 0.41, + "grad_norm": 1.5909273486491393, + "learning_rate": 6.637168993365355e-06, + "loss": 0.3557, + "step": 14205 + }, + { + "epoch": 0.41, + "grad_norm": 1.2725054897749049, + "learning_rate": 6.636725171537301e-06, + "loss": 0.3109, + "step": 14206 + }, + { + "epoch": 0.41, + "grad_norm": 1.2757313620591884, + "learning_rate": 6.636281335265129e-06, + "loss": 0.3166, + "step": 14207 + }, + { + "epoch": 0.41, + "grad_norm": 1.2992524155681815, + "learning_rate": 6.635837484552755e-06, + "loss": 0.3498, + "step": 14208 + }, + { + "epoch": 0.41, + "grad_norm": 1.2876072836690395, + "learning_rate": 6.6353936194041e-06, + "loss": 0.3265, + "step": 14209 + }, + { + "epoch": 0.41, + "grad_norm": 1.3366836883292432, + "learning_rate": 6.634949739823077e-06, + "loss": 0.3776, + "step": 14210 + }, + { + "epoch": 0.41, + "grad_norm": 1.4340660540074075, + "learning_rate": 6.634505845813603e-06, + "loss": 0.325, + "step": 14211 + }, + { + "epoch": 0.41, + "grad_norm": 2.7903345467215463, + "learning_rate": 6.634061937379597e-06, + "loss": 0.3078, + "step": 14212 + }, + { + "epoch": 0.41, + "grad_norm": 1.260019467643643, + "learning_rate": 6.6336180145249764e-06, + "loss": 0.3331, + "step": 14213 + }, + { + "epoch": 0.41, + "grad_norm": 1.212543404241252, + "learning_rate": 6.63317407725366e-06, + "loss": 0.3104, + "step": 14214 + }, + { + "epoch": 0.41, + "grad_norm": 1.3586503996828, + "learning_rate": 6.632730125569563e-06, + "loss": 0.3116, + "step": 14215 + }, + { + "epoch": 0.41, + "grad_norm": 1.1991744641748951, + "learning_rate": 6.6322861594766046e-06, + "loss": 0.3342, + "step": 14216 + }, + { + "epoch": 0.41, + "grad_norm": 1.3474033932552032, + "learning_rate": 6.631842178978703e-06, + "loss": 0.3442, + "step": 14217 + }, + { + "epoch": 0.41, + "grad_norm": 1.3343488540442932, + "learning_rate": 6.631398184079776e-06, + "loss": 0.3312, + "step": 14218 + }, + { + "epoch": 0.41, + "grad_norm": 1.231587080194675, + "learning_rate": 6.6309541747837415e-06, + "loss": 0.607, + "step": 14219 + }, + { + "epoch": 0.41, + "grad_norm": 1.3574069334215, + "learning_rate": 6.630510151094519e-06, + "loss": 0.3615, + "step": 14220 + }, + { + "epoch": 0.41, + "grad_norm": 1.1870116972964369, + "learning_rate": 6.630066113016026e-06, + "loss": 0.3125, + "step": 14221 + }, + { + "epoch": 0.41, + "grad_norm": 1.259199289312021, + "learning_rate": 6.6296220605521825e-06, + "loss": 0.3238, + "step": 14222 + }, + { + "epoch": 0.41, + "grad_norm": 1.253766574693821, + "learning_rate": 6.629177993706906e-06, + "loss": 0.2988, + "step": 14223 + }, + { + "epoch": 0.41, + "grad_norm": 1.4220931088050401, + "learning_rate": 6.628733912484114e-06, + "loss": 0.315, + "step": 14224 + }, + { + "epoch": 0.41, + "grad_norm": 1.1788179954778595, + "learning_rate": 6.628289816887729e-06, + "loss": 0.2942, + "step": 14225 + }, + { + "epoch": 0.41, + "grad_norm": 1.307314423919088, + "learning_rate": 6.627845706921667e-06, + "loss": 0.3436, + "step": 14226 + }, + { + "epoch": 0.41, + "grad_norm": 1.4831181790826864, + "learning_rate": 6.6274015825898495e-06, + "loss": 0.3059, + "step": 14227 + }, + { + "epoch": 0.41, + "grad_norm": 1.2632816971646108, + "learning_rate": 6.626957443896194e-06, + "loss": 0.3206, + "step": 14228 + }, + { + "epoch": 0.41, + "grad_norm": 1.3150632648771452, + "learning_rate": 6.62651329084462e-06, + "loss": 0.3522, + "step": 14229 + }, + { + "epoch": 0.41, + "grad_norm": 1.3441183047087377, + "learning_rate": 6.626069123439049e-06, + "loss": 0.3703, + "step": 14230 + }, + { + "epoch": 0.41, + "grad_norm": 1.2973498950946154, + "learning_rate": 6.625624941683402e-06, + "loss": 0.3117, + "step": 14231 + }, + { + "epoch": 0.41, + "grad_norm": 1.37836707078219, + "learning_rate": 6.625180745581595e-06, + "loss": 0.3445, + "step": 14232 + }, + { + "epoch": 0.41, + "grad_norm": 1.4488493148135253, + "learning_rate": 6.62473653513755e-06, + "loss": 0.3311, + "step": 14233 + }, + { + "epoch": 0.41, + "grad_norm": 1.3316601727737731, + "learning_rate": 6.6242923103551885e-06, + "loss": 0.3153, + "step": 14234 + }, + { + "epoch": 0.41, + "grad_norm": 1.5637352114302159, + "learning_rate": 6.6238480712384275e-06, + "loss": 0.3183, + "step": 14235 + }, + { + "epoch": 0.41, + "grad_norm": 1.34550404385083, + "learning_rate": 6.623403817791191e-06, + "loss": 0.3545, + "step": 14236 + }, + { + "epoch": 0.41, + "grad_norm": 1.3556548221337503, + "learning_rate": 6.622959550017397e-06, + "loss": 0.3155, + "step": 14237 + }, + { + "epoch": 0.41, + "grad_norm": 1.309802195181019, + "learning_rate": 6.6225152679209655e-06, + "loss": 0.3239, + "step": 14238 + }, + { + "epoch": 0.41, + "grad_norm": 1.3875831946302635, + "learning_rate": 6.6220709715058205e-06, + "loss": 0.3265, + "step": 14239 + }, + { + "epoch": 0.41, + "grad_norm": 1.370854791230208, + "learning_rate": 6.6216266607758805e-06, + "loss": 0.3327, + "step": 14240 + }, + { + "epoch": 0.41, + "grad_norm": 1.2227335487896018, + "learning_rate": 6.621182335735068e-06, + "loss": 0.3206, + "step": 14241 + }, + { + "epoch": 0.41, + "grad_norm": 1.2372850415919872, + "learning_rate": 6.6207379963873034e-06, + "loss": 0.3283, + "step": 14242 + }, + { + "epoch": 0.41, + "grad_norm": 2.227348579610664, + "learning_rate": 6.620293642736506e-06, + "loss": 0.3241, + "step": 14243 + }, + { + "epoch": 0.41, + "grad_norm": 1.2046788128951145, + "learning_rate": 6.619849274786601e-06, + "loss": 0.3369, + "step": 14244 + }, + { + "epoch": 0.41, + "grad_norm": 1.3252421412165827, + "learning_rate": 6.619404892541509e-06, + "loss": 0.33, + "step": 14245 + }, + { + "epoch": 0.41, + "grad_norm": 1.3823847924995551, + "learning_rate": 6.618960496005149e-06, + "loss": 0.3159, + "step": 14246 + }, + { + "epoch": 0.41, + "grad_norm": 1.2074357233067712, + "learning_rate": 6.6185160851814455e-06, + "loss": 0.3259, + "step": 14247 + }, + { + "epoch": 0.41, + "grad_norm": 1.4368082077707744, + "learning_rate": 6.618071660074319e-06, + "loss": 0.337, + "step": 14248 + }, + { + "epoch": 0.41, + "grad_norm": 1.401736671626226, + "learning_rate": 6.6176272206876925e-06, + "loss": 0.3656, + "step": 14249 + }, + { + "epoch": 0.41, + "grad_norm": 1.2548553060080232, + "learning_rate": 6.617182767025489e-06, + "loss": 0.3519, + "step": 14250 + }, + { + "epoch": 0.41, + "grad_norm": 1.4500280559203311, + "learning_rate": 6.6167382990916284e-06, + "loss": 0.3373, + "step": 14251 + }, + { + "epoch": 0.41, + "grad_norm": 1.544304476540777, + "learning_rate": 6.6162938168900335e-06, + "loss": 0.3493, + "step": 14252 + }, + { + "epoch": 0.41, + "grad_norm": 1.5064711630548562, + "learning_rate": 6.61584932042463e-06, + "loss": 0.3287, + "step": 14253 + }, + { + "epoch": 0.41, + "grad_norm": 1.3215887553079237, + "learning_rate": 6.615404809699336e-06, + "loss": 0.3391, + "step": 14254 + }, + { + "epoch": 0.41, + "grad_norm": 1.4184183087107227, + "learning_rate": 6.614960284718079e-06, + "loss": 0.3464, + "step": 14255 + }, + { + "epoch": 0.41, + "grad_norm": 1.4788899989437378, + "learning_rate": 6.614515745484777e-06, + "loss": 0.3181, + "step": 14256 + }, + { + "epoch": 0.41, + "grad_norm": 1.27113167039296, + "learning_rate": 6.614071192003357e-06, + "loss": 0.3176, + "step": 14257 + }, + { + "epoch": 0.41, + "grad_norm": 1.2600631026846982, + "learning_rate": 6.61362662427774e-06, + "loss": 0.3334, + "step": 14258 + }, + { + "epoch": 0.41, + "grad_norm": 1.370442644774718, + "learning_rate": 6.613182042311852e-06, + "loss": 0.3387, + "step": 14259 + }, + { + "epoch": 0.41, + "grad_norm": 1.267150810551749, + "learning_rate": 6.612737446109614e-06, + "loss": 0.3237, + "step": 14260 + }, + { + "epoch": 0.41, + "grad_norm": 1.6132780346441387, + "learning_rate": 6.6122928356749495e-06, + "loss": 0.3445, + "step": 14261 + }, + { + "epoch": 0.41, + "grad_norm": 1.668509959755478, + "learning_rate": 6.611848211011782e-06, + "loss": 0.3165, + "step": 14262 + }, + { + "epoch": 0.41, + "grad_norm": 1.3940510723225248, + "learning_rate": 6.6114035721240376e-06, + "loss": 0.3318, + "step": 14263 + }, + { + "epoch": 0.41, + "grad_norm": 1.3449208948659042, + "learning_rate": 6.610958919015638e-06, + "loss": 0.3311, + "step": 14264 + }, + { + "epoch": 0.41, + "grad_norm": 1.2168248206470487, + "learning_rate": 6.6105142516905095e-06, + "loss": 0.3087, + "step": 14265 + }, + { + "epoch": 0.41, + "grad_norm": 1.3046705775326095, + "learning_rate": 6.610069570152573e-06, + "loss": 0.3201, + "step": 14266 + }, + { + "epoch": 0.41, + "grad_norm": 1.3250184121923123, + "learning_rate": 6.6096248744057555e-06, + "loss": 0.3235, + "step": 14267 + }, + { + "epoch": 0.41, + "grad_norm": 1.1913194082491652, + "learning_rate": 6.609180164453981e-06, + "loss": 0.2894, + "step": 14268 + }, + { + "epoch": 0.41, + "grad_norm": 0.9495945444337454, + "learning_rate": 6.608735440301174e-06, + "loss": 0.5497, + "step": 14269 + }, + { + "epoch": 0.41, + "grad_norm": 1.4947987351386245, + "learning_rate": 6.608290701951258e-06, + "loss": 0.3163, + "step": 14270 + }, + { + "epoch": 0.41, + "grad_norm": 1.3081847487709952, + "learning_rate": 6.607845949408158e-06, + "loss": 0.3211, + "step": 14271 + }, + { + "epoch": 0.41, + "grad_norm": 1.3245132492952254, + "learning_rate": 6.6074011826758e-06, + "loss": 0.3297, + "step": 14272 + }, + { + "epoch": 0.41, + "grad_norm": 1.2325387369102114, + "learning_rate": 6.60695640175811e-06, + "loss": 0.3266, + "step": 14273 + }, + { + "epoch": 0.41, + "grad_norm": 1.2020909875956323, + "learning_rate": 6.606511606659012e-06, + "loss": 0.3137, + "step": 14274 + }, + { + "epoch": 0.41, + "grad_norm": 1.3406363931789864, + "learning_rate": 6.6060667973824285e-06, + "loss": 0.3183, + "step": 14275 + }, + { + "epoch": 0.41, + "grad_norm": 1.3255129713563525, + "learning_rate": 6.60562197393229e-06, + "loss": 0.3426, + "step": 14276 + }, + { + "epoch": 0.41, + "grad_norm": 0.8460480845152893, + "learning_rate": 6.60517713631252e-06, + "loss": 0.6205, + "step": 14277 + }, + { + "epoch": 0.41, + "grad_norm": 1.335918333982836, + "learning_rate": 6.604732284527044e-06, + "loss": 0.3308, + "step": 14278 + }, + { + "epoch": 0.41, + "grad_norm": 1.2762271000208179, + "learning_rate": 6.604287418579788e-06, + "loss": 0.3347, + "step": 14279 + }, + { + "epoch": 0.41, + "grad_norm": 1.724162729137123, + "learning_rate": 6.6038425384746765e-06, + "loss": 0.3169, + "step": 14280 + }, + { + "epoch": 0.41, + "grad_norm": 1.2988103484249909, + "learning_rate": 6.6033976442156365e-06, + "loss": 0.3348, + "step": 14281 + }, + { + "epoch": 0.41, + "grad_norm": 1.2815905328119261, + "learning_rate": 6.602952735806596e-06, + "loss": 0.3268, + "step": 14282 + }, + { + "epoch": 0.41, + "grad_norm": 0.967918894031697, + "learning_rate": 6.602507813251478e-06, + "loss": 0.5701, + "step": 14283 + }, + { + "epoch": 0.41, + "grad_norm": 1.3670899977774402, + "learning_rate": 6.602062876554211e-06, + "loss": 0.3415, + "step": 14284 + }, + { + "epoch": 0.41, + "grad_norm": 1.4007210722482317, + "learning_rate": 6.601617925718723e-06, + "loss": 0.3323, + "step": 14285 + }, + { + "epoch": 0.41, + "grad_norm": 2.4772491560230656, + "learning_rate": 6.601172960748939e-06, + "loss": 0.3113, + "step": 14286 + }, + { + "epoch": 0.41, + "grad_norm": 1.2810939971055917, + "learning_rate": 6.600727981648785e-06, + "loss": 0.3397, + "step": 14287 + }, + { + "epoch": 0.41, + "grad_norm": 1.2634671834718258, + "learning_rate": 6.600282988422189e-06, + "loss": 0.3418, + "step": 14288 + }, + { + "epoch": 0.41, + "grad_norm": 1.231440580607407, + "learning_rate": 6.599837981073076e-06, + "loss": 0.3223, + "step": 14289 + }, + { + "epoch": 0.41, + "grad_norm": 1.4282009480155713, + "learning_rate": 6.599392959605377e-06, + "loss": 0.329, + "step": 14290 + }, + { + "epoch": 0.41, + "grad_norm": 1.1723115560976414, + "learning_rate": 6.598947924023018e-06, + "loss": 0.2998, + "step": 14291 + }, + { + "epoch": 0.41, + "grad_norm": 1.2415406116844743, + "learning_rate": 6.598502874329925e-06, + "loss": 0.3314, + "step": 14292 + }, + { + "epoch": 0.41, + "grad_norm": 1.1872362354072767, + "learning_rate": 6.598057810530027e-06, + "loss": 0.3133, + "step": 14293 + }, + { + "epoch": 0.41, + "grad_norm": 1.346205948544322, + "learning_rate": 6.59761273262725e-06, + "loss": 0.3209, + "step": 14294 + }, + { + "epoch": 0.41, + "grad_norm": 1.2988012305288759, + "learning_rate": 6.597167640625523e-06, + "loss": 0.3416, + "step": 14295 + }, + { + "epoch": 0.41, + "grad_norm": 1.4353024547935753, + "learning_rate": 6.596722534528775e-06, + "loss": 0.3326, + "step": 14296 + }, + { + "epoch": 0.41, + "grad_norm": 1.5801954420178252, + "learning_rate": 6.596277414340933e-06, + "loss": 0.3337, + "step": 14297 + }, + { + "epoch": 0.41, + "grad_norm": 1.3620428737377108, + "learning_rate": 6.5958322800659235e-06, + "loss": 0.3359, + "step": 14298 + }, + { + "epoch": 0.41, + "grad_norm": 0.9923511444176886, + "learning_rate": 6.595387131707677e-06, + "loss": 0.6142, + "step": 14299 + }, + { + "epoch": 0.41, + "grad_norm": 1.3445293469161856, + "learning_rate": 6.5949419692701225e-06, + "loss": 0.3265, + "step": 14300 + }, + { + "epoch": 0.41, + "grad_norm": 1.2850067261176876, + "learning_rate": 6.594496792757186e-06, + "loss": 0.3176, + "step": 14301 + }, + { + "epoch": 0.41, + "grad_norm": 1.351129623110881, + "learning_rate": 6.594051602172798e-06, + "loss": 0.3358, + "step": 14302 + }, + { + "epoch": 0.41, + "grad_norm": 1.6788997966555426, + "learning_rate": 6.593606397520886e-06, + "loss": 0.3313, + "step": 14303 + }, + { + "epoch": 0.41, + "grad_norm": 1.2346271337436145, + "learning_rate": 6.59316117880538e-06, + "loss": 0.3619, + "step": 14304 + }, + { + "epoch": 0.41, + "grad_norm": 1.2616284598524217, + "learning_rate": 6.592715946030212e-06, + "loss": 0.343, + "step": 14305 + }, + { + "epoch": 0.41, + "grad_norm": 1.4954588603224996, + "learning_rate": 6.592270699199306e-06, + "loss": 0.3412, + "step": 14306 + }, + { + "epoch": 0.41, + "grad_norm": 1.3196089448415207, + "learning_rate": 6.591825438316593e-06, + "loss": 0.3299, + "step": 14307 + }, + { + "epoch": 0.42, + "grad_norm": 1.448572715297682, + "learning_rate": 6.591380163386003e-06, + "loss": 0.3191, + "step": 14308 + }, + { + "epoch": 0.42, + "grad_norm": 1.206671159899234, + "learning_rate": 6.590934874411466e-06, + "loss": 0.3259, + "step": 14309 + }, + { + "epoch": 0.42, + "grad_norm": 1.2842708820230793, + "learning_rate": 6.5904895713969105e-06, + "loss": 0.3473, + "step": 14310 + }, + { + "epoch": 0.42, + "grad_norm": 1.2320891866662895, + "learning_rate": 6.590044254346267e-06, + "loss": 0.3053, + "step": 14311 + }, + { + "epoch": 0.42, + "grad_norm": 1.3059731167565165, + "learning_rate": 6.589598923263465e-06, + "loss": 0.3277, + "step": 14312 + }, + { + "epoch": 0.42, + "grad_norm": 1.8563204259286643, + "learning_rate": 6.589153578152437e-06, + "loss": 0.3325, + "step": 14313 + }, + { + "epoch": 0.42, + "grad_norm": 1.631226717757628, + "learning_rate": 6.588708219017108e-06, + "loss": 0.2985, + "step": 14314 + }, + { + "epoch": 0.42, + "grad_norm": 1.5219974431222834, + "learning_rate": 6.588262845861413e-06, + "loss": 0.3098, + "step": 14315 + }, + { + "epoch": 0.42, + "grad_norm": 1.2694478272824548, + "learning_rate": 6.587817458689281e-06, + "loss": 0.3312, + "step": 14316 + }, + { + "epoch": 0.42, + "grad_norm": 2.523868370253328, + "learning_rate": 6.587372057504641e-06, + "loss": 0.3247, + "step": 14317 + }, + { + "epoch": 0.42, + "grad_norm": 1.4287296067662925, + "learning_rate": 6.586926642311426e-06, + "loss": 0.3297, + "step": 14318 + }, + { + "epoch": 0.42, + "grad_norm": 1.1968763818174664, + "learning_rate": 6.586481213113565e-06, + "loss": 0.3301, + "step": 14319 + }, + { + "epoch": 0.42, + "grad_norm": 1.642721808206523, + "learning_rate": 6.58603576991499e-06, + "loss": 0.3387, + "step": 14320 + }, + { + "epoch": 0.42, + "grad_norm": 1.3056430343045125, + "learning_rate": 6.585590312719633e-06, + "loss": 0.3091, + "step": 14321 + }, + { + "epoch": 0.42, + "grad_norm": 1.5829631910237767, + "learning_rate": 6.585144841531422e-06, + "loss": 0.3431, + "step": 14322 + }, + { + "epoch": 0.42, + "grad_norm": 1.267950298563096, + "learning_rate": 6.5846993563542915e-06, + "loss": 0.3342, + "step": 14323 + }, + { + "epoch": 0.42, + "grad_norm": 1.2606575512176192, + "learning_rate": 6.584253857192171e-06, + "loss": 0.3245, + "step": 14324 + }, + { + "epoch": 0.42, + "grad_norm": 1.4925016644441358, + "learning_rate": 6.583808344048993e-06, + "loss": 0.3434, + "step": 14325 + }, + { + "epoch": 0.42, + "grad_norm": 1.2958162129825552, + "learning_rate": 6.583362816928689e-06, + "loss": 0.3299, + "step": 14326 + }, + { + "epoch": 0.42, + "grad_norm": 1.4999073476830467, + "learning_rate": 6.582917275835189e-06, + "loss": 0.3086, + "step": 14327 + }, + { + "epoch": 0.42, + "grad_norm": 1.694225852264108, + "learning_rate": 6.582471720772429e-06, + "loss": 0.3339, + "step": 14328 + }, + { + "epoch": 0.42, + "grad_norm": 1.7086478992530234, + "learning_rate": 6.5820261517443365e-06, + "loss": 0.3412, + "step": 14329 + }, + { + "epoch": 0.42, + "grad_norm": 1.3714770572619932, + "learning_rate": 6.581580568754847e-06, + "loss": 0.3467, + "step": 14330 + }, + { + "epoch": 0.42, + "grad_norm": 1.1793574540761045, + "learning_rate": 6.58113497180789e-06, + "loss": 0.3131, + "step": 14331 + }, + { + "epoch": 0.42, + "grad_norm": 1.3204574257634232, + "learning_rate": 6.5806893609074e-06, + "loss": 0.2927, + "step": 14332 + }, + { + "epoch": 0.42, + "grad_norm": 1.2547425176391984, + "learning_rate": 6.580243736057309e-06, + "loss": 0.3147, + "step": 14333 + }, + { + "epoch": 0.42, + "grad_norm": 1.1835518842548265, + "learning_rate": 6.57979809726155e-06, + "loss": 0.3304, + "step": 14334 + }, + { + "epoch": 0.42, + "grad_norm": 1.810370450419175, + "learning_rate": 6.579352444524054e-06, + "loss": 0.3194, + "step": 14335 + }, + { + "epoch": 0.42, + "grad_norm": 1.5734714911641863, + "learning_rate": 6.578906777848756e-06, + "loss": 0.3511, + "step": 14336 + }, + { + "epoch": 0.42, + "grad_norm": 1.2590250487162808, + "learning_rate": 6.578461097239589e-06, + "loss": 0.3293, + "step": 14337 + }, + { + "epoch": 0.42, + "grad_norm": 1.2455509385075014, + "learning_rate": 6.578015402700484e-06, + "loss": 0.3396, + "step": 14338 + }, + { + "epoch": 0.42, + "grad_norm": 1.5168548782841234, + "learning_rate": 6.577569694235375e-06, + "loss": 0.3233, + "step": 14339 + }, + { + "epoch": 0.42, + "grad_norm": 1.273427681255266, + "learning_rate": 6.577123971848196e-06, + "loss": 0.3109, + "step": 14340 + }, + { + "epoch": 0.42, + "grad_norm": 2.226816968399963, + "learning_rate": 6.576678235542881e-06, + "loss": 0.325, + "step": 14341 + }, + { + "epoch": 0.42, + "grad_norm": 1.176086987917737, + "learning_rate": 6.576232485323365e-06, + "loss": 0.3097, + "step": 14342 + }, + { + "epoch": 0.42, + "grad_norm": 1.2849026743907284, + "learning_rate": 6.575786721193578e-06, + "loss": 0.3288, + "step": 14343 + }, + { + "epoch": 0.42, + "grad_norm": 1.4135234578651708, + "learning_rate": 6.575340943157455e-06, + "loss": 0.3362, + "step": 14344 + }, + { + "epoch": 0.42, + "grad_norm": 1.3805849553082816, + "learning_rate": 6.574895151218932e-06, + "loss": 0.3178, + "step": 14345 + }, + { + "epoch": 0.42, + "grad_norm": 1.2841597582557305, + "learning_rate": 6.574449345381941e-06, + "loss": 0.3157, + "step": 14346 + }, + { + "epoch": 0.42, + "grad_norm": 1.3446205011128955, + "learning_rate": 6.574003525650417e-06, + "loss": 0.3444, + "step": 14347 + }, + { + "epoch": 0.42, + "grad_norm": 1.304154372729422, + "learning_rate": 6.573557692028296e-06, + "loss": 0.3079, + "step": 14348 + }, + { + "epoch": 0.42, + "grad_norm": 1.4381473285232567, + "learning_rate": 6.573111844519511e-06, + "loss": 0.3295, + "step": 14349 + }, + { + "epoch": 0.42, + "grad_norm": 3.3633847709331453, + "learning_rate": 6.572665983127994e-06, + "loss": 0.3346, + "step": 14350 + }, + { + "epoch": 0.42, + "grad_norm": 1.723733077320946, + "learning_rate": 6.572220107857684e-06, + "loss": 0.3204, + "step": 14351 + }, + { + "epoch": 0.42, + "grad_norm": 1.321947709329309, + "learning_rate": 6.5717742187125146e-06, + "loss": 0.3079, + "step": 14352 + }, + { + "epoch": 0.42, + "grad_norm": 1.9183281086924315, + "learning_rate": 6.571328315696418e-06, + "loss": 0.3523, + "step": 14353 + }, + { + "epoch": 0.42, + "grad_norm": 1.2915229949353262, + "learning_rate": 6.570882398813333e-06, + "loss": 0.3481, + "step": 14354 + }, + { + "epoch": 0.42, + "grad_norm": 1.3199130391479716, + "learning_rate": 6.570436468067194e-06, + "loss": 0.3007, + "step": 14355 + }, + { + "epoch": 0.42, + "grad_norm": 1.2452516565989995, + "learning_rate": 6.569990523461936e-06, + "loss": 0.3253, + "step": 14356 + }, + { + "epoch": 0.42, + "grad_norm": 1.5261027666209246, + "learning_rate": 6.569544565001492e-06, + "loss": 0.3664, + "step": 14357 + }, + { + "epoch": 0.42, + "grad_norm": 1.5300941192639157, + "learning_rate": 6.569098592689801e-06, + "loss": 0.3379, + "step": 14358 + }, + { + "epoch": 0.42, + "grad_norm": 1.3137876764326832, + "learning_rate": 6.568652606530796e-06, + "loss": 0.3481, + "step": 14359 + }, + { + "epoch": 0.42, + "grad_norm": 1.3573181401455603, + "learning_rate": 6.568206606528418e-06, + "loss": 0.3426, + "step": 14360 + }, + { + "epoch": 0.42, + "grad_norm": 1.8999992019711336, + "learning_rate": 6.5677605926865975e-06, + "loss": 0.3298, + "step": 14361 + }, + { + "epoch": 0.42, + "grad_norm": 1.3240009108351558, + "learning_rate": 6.567314565009271e-06, + "loss": 0.3587, + "step": 14362 + }, + { + "epoch": 0.42, + "grad_norm": 1.3401488854389756, + "learning_rate": 6.566868523500378e-06, + "loss": 0.3401, + "step": 14363 + }, + { + "epoch": 0.42, + "grad_norm": 1.2977140526433062, + "learning_rate": 6.566422468163852e-06, + "loss": 0.3089, + "step": 14364 + }, + { + "epoch": 0.42, + "grad_norm": 1.291797340807097, + "learning_rate": 6.56597639900363e-06, + "loss": 0.3185, + "step": 14365 + }, + { + "epoch": 0.42, + "grad_norm": 1.558566468037066, + "learning_rate": 6.56553031602365e-06, + "loss": 0.2984, + "step": 14366 + }, + { + "epoch": 0.42, + "grad_norm": 1.1962994348248086, + "learning_rate": 6.565084219227847e-06, + "loss": 0.3174, + "step": 14367 + }, + { + "epoch": 0.42, + "grad_norm": 1.2970102977473887, + "learning_rate": 6.564638108620158e-06, + "loss": 0.3139, + "step": 14368 + }, + { + "epoch": 0.42, + "grad_norm": 1.2321797632536662, + "learning_rate": 6.564191984204522e-06, + "loss": 0.3232, + "step": 14369 + }, + { + "epoch": 0.42, + "grad_norm": 1.4233786279176237, + "learning_rate": 6.563745845984872e-06, + "loss": 0.3285, + "step": 14370 + }, + { + "epoch": 0.42, + "grad_norm": 1.2511835501792428, + "learning_rate": 6.563299693965148e-06, + "loss": 0.3132, + "step": 14371 + }, + { + "epoch": 0.42, + "grad_norm": 1.3760920329136717, + "learning_rate": 6.562853528149288e-06, + "loss": 0.3093, + "step": 14372 + }, + { + "epoch": 0.42, + "grad_norm": 1.2635383758690997, + "learning_rate": 6.562407348541229e-06, + "loss": 0.3368, + "step": 14373 + }, + { + "epoch": 0.42, + "grad_norm": 1.2604182047469585, + "learning_rate": 6.561961155144906e-06, + "loss": 0.317, + "step": 14374 + }, + { + "epoch": 0.42, + "grad_norm": 1.2190510007386113, + "learning_rate": 6.561514947964258e-06, + "loss": 0.3131, + "step": 14375 + }, + { + "epoch": 0.42, + "grad_norm": 1.2772289988023837, + "learning_rate": 6.561068727003225e-06, + "loss": 0.3197, + "step": 14376 + }, + { + "epoch": 0.42, + "grad_norm": 1.4277694156818668, + "learning_rate": 6.560622492265742e-06, + "loss": 0.3168, + "step": 14377 + }, + { + "epoch": 0.42, + "grad_norm": 1.6091668295616448, + "learning_rate": 6.5601762437557505e-06, + "loss": 0.3398, + "step": 14378 + }, + { + "epoch": 0.42, + "grad_norm": 1.1920933579192654, + "learning_rate": 6.559729981477184e-06, + "loss": 0.3318, + "step": 14379 + }, + { + "epoch": 0.42, + "grad_norm": 1.3414255240597157, + "learning_rate": 6.559283705433983e-06, + "loss": 0.3434, + "step": 14380 + }, + { + "epoch": 0.42, + "grad_norm": 1.22611523976435, + "learning_rate": 6.558837415630087e-06, + "loss": 0.3292, + "step": 14381 + }, + { + "epoch": 0.42, + "grad_norm": 1.255938989896924, + "learning_rate": 6.5583911120694335e-06, + "loss": 0.3073, + "step": 14382 + }, + { + "epoch": 0.42, + "grad_norm": 1.2340947975697492, + "learning_rate": 6.557944794755962e-06, + "loss": 0.3112, + "step": 14383 + }, + { + "epoch": 0.42, + "grad_norm": 1.3114372870302424, + "learning_rate": 6.557498463693608e-06, + "loss": 0.314, + "step": 14384 + }, + { + "epoch": 0.42, + "grad_norm": 1.4453741286845263, + "learning_rate": 6.557052118886314e-06, + "loss": 0.3285, + "step": 14385 + }, + { + "epoch": 0.42, + "grad_norm": 1.6311014630574086, + "learning_rate": 6.5566057603380176e-06, + "loss": 0.3133, + "step": 14386 + }, + { + "epoch": 0.42, + "grad_norm": 1.5220003970330263, + "learning_rate": 6.556159388052659e-06, + "loss": 0.311, + "step": 14387 + }, + { + "epoch": 0.42, + "grad_norm": 1.3609448313519341, + "learning_rate": 6.555713002034175e-06, + "loss": 0.3154, + "step": 14388 + }, + { + "epoch": 0.42, + "grad_norm": 1.4909424601764234, + "learning_rate": 6.5552666022865075e-06, + "loss": 0.3346, + "step": 14389 + }, + { + "epoch": 0.42, + "grad_norm": 1.3316793666204043, + "learning_rate": 6.554820188813594e-06, + "loss": 0.3354, + "step": 14390 + }, + { + "epoch": 0.42, + "grad_norm": 1.3847369281273079, + "learning_rate": 6.554373761619377e-06, + "loss": 0.315, + "step": 14391 + }, + { + "epoch": 0.42, + "grad_norm": 1.2395590775841723, + "learning_rate": 6.5539273207077935e-06, + "loss": 0.3573, + "step": 14392 + }, + { + "epoch": 0.42, + "grad_norm": 1.2712713048978974, + "learning_rate": 6.553480866082785e-06, + "loss": 0.3047, + "step": 14393 + }, + { + "epoch": 0.42, + "grad_norm": 1.8973097308222286, + "learning_rate": 6.553034397748288e-06, + "loss": 0.3617, + "step": 14394 + }, + { + "epoch": 0.42, + "grad_norm": 1.2761575356065307, + "learning_rate": 6.552587915708246e-06, + "loss": 0.3131, + "step": 14395 + }, + { + "epoch": 0.42, + "grad_norm": 1.2777057880536806, + "learning_rate": 6.5521414199666e-06, + "loss": 0.3356, + "step": 14396 + }, + { + "epoch": 0.42, + "grad_norm": 1.3654688430237016, + "learning_rate": 6.551694910527287e-06, + "loss": 0.3179, + "step": 14397 + }, + { + "epoch": 0.42, + "grad_norm": 4.621647714483255, + "learning_rate": 6.551248387394251e-06, + "loss": 0.3224, + "step": 14398 + }, + { + "epoch": 0.42, + "grad_norm": 1.2807974693406454, + "learning_rate": 6.5508018505714285e-06, + "loss": 0.348, + "step": 14399 + }, + { + "epoch": 0.42, + "grad_norm": 1.5394136893837964, + "learning_rate": 6.550355300062764e-06, + "loss": 0.3224, + "step": 14400 + }, + { + "epoch": 0.42, + "grad_norm": 1.2916855411607326, + "learning_rate": 6.549908735872197e-06, + "loss": 0.316, + "step": 14401 + }, + { + "epoch": 0.42, + "grad_norm": 1.251473018018008, + "learning_rate": 6.5494621580036675e-06, + "loss": 0.3082, + "step": 14402 + }, + { + "epoch": 0.42, + "grad_norm": 1.3280963008629092, + "learning_rate": 6.549015566461117e-06, + "loss": 0.3287, + "step": 14403 + }, + { + "epoch": 0.42, + "grad_norm": 1.1526018394600788, + "learning_rate": 6.548568961248488e-06, + "loss": 0.3093, + "step": 14404 + }, + { + "epoch": 0.42, + "grad_norm": 1.421162309911776, + "learning_rate": 6.5481223423697205e-06, + "loss": 0.36, + "step": 14405 + }, + { + "epoch": 0.42, + "grad_norm": 1.2853180196812708, + "learning_rate": 6.547675709828757e-06, + "loss": 0.3177, + "step": 14406 + }, + { + "epoch": 0.42, + "grad_norm": 1.3948664818232273, + "learning_rate": 6.547229063629537e-06, + "loss": 0.3276, + "step": 14407 + }, + { + "epoch": 0.42, + "grad_norm": 1.6599888078555052, + "learning_rate": 6.546782403776003e-06, + "loss": 0.3332, + "step": 14408 + }, + { + "epoch": 0.42, + "grad_norm": 1.4581668486800858, + "learning_rate": 6.546335730272098e-06, + "loss": 0.3216, + "step": 14409 + }, + { + "epoch": 0.42, + "grad_norm": 1.407247581701091, + "learning_rate": 6.545889043121762e-06, + "loss": 0.317, + "step": 14410 + }, + { + "epoch": 0.42, + "grad_norm": 1.2450450199354046, + "learning_rate": 6.545442342328939e-06, + "loss": 0.3379, + "step": 14411 + }, + { + "epoch": 0.42, + "grad_norm": 1.2750066127255395, + "learning_rate": 6.54499562789757e-06, + "loss": 0.3228, + "step": 14412 + }, + { + "epoch": 0.42, + "grad_norm": 1.3326314107247375, + "learning_rate": 6.544548899831597e-06, + "loss": 0.3107, + "step": 14413 + }, + { + "epoch": 0.42, + "grad_norm": 1.311643002526015, + "learning_rate": 6.544102158134964e-06, + "loss": 0.3286, + "step": 14414 + }, + { + "epoch": 0.42, + "grad_norm": 1.2582209375243472, + "learning_rate": 6.54365540281161e-06, + "loss": 0.3096, + "step": 14415 + }, + { + "epoch": 0.42, + "grad_norm": 1.4832023838221444, + "learning_rate": 6.543208633865481e-06, + "loss": 0.3327, + "step": 14416 + }, + { + "epoch": 0.42, + "grad_norm": 1.374526250933913, + "learning_rate": 6.542761851300519e-06, + "loss": 0.3033, + "step": 14417 + }, + { + "epoch": 0.42, + "grad_norm": 1.2307443681627204, + "learning_rate": 6.542315055120667e-06, + "loss": 0.3276, + "step": 14418 + }, + { + "epoch": 0.42, + "grad_norm": 1.2545673323926123, + "learning_rate": 6.5418682453298676e-06, + "loss": 0.3339, + "step": 14419 + }, + { + "epoch": 0.42, + "grad_norm": 1.263194665283526, + "learning_rate": 6.5414214219320635e-06, + "loss": 0.3194, + "step": 14420 + }, + { + "epoch": 0.42, + "grad_norm": 1.3049425597511806, + "learning_rate": 6.540974584931199e-06, + "loss": 0.3211, + "step": 14421 + }, + { + "epoch": 0.42, + "grad_norm": 1.2665995001454768, + "learning_rate": 6.540527734331215e-06, + "loss": 0.2966, + "step": 14422 + }, + { + "epoch": 0.42, + "grad_norm": 1.287969653360759, + "learning_rate": 6.540080870136056e-06, + "loss": 0.3061, + "step": 14423 + }, + { + "epoch": 0.42, + "grad_norm": 1.4126804255147698, + "learning_rate": 6.53963399234967e-06, + "loss": 0.3451, + "step": 14424 + }, + { + "epoch": 0.42, + "grad_norm": 1.3135815139976486, + "learning_rate": 6.539187100975995e-06, + "loss": 0.3251, + "step": 14425 + }, + { + "epoch": 0.42, + "grad_norm": 1.3588788806574765, + "learning_rate": 6.538740196018975e-06, + "loss": 0.3345, + "step": 14426 + }, + { + "epoch": 0.42, + "grad_norm": 1.279227170334787, + "learning_rate": 6.538293277482557e-06, + "loss": 0.3146, + "step": 14427 + }, + { + "epoch": 0.42, + "grad_norm": 1.2892010126081948, + "learning_rate": 6.537846345370684e-06, + "loss": 0.3166, + "step": 14428 + }, + { + "epoch": 0.42, + "grad_norm": 1.519565234523623, + "learning_rate": 6.5373993996873e-06, + "loss": 0.3198, + "step": 14429 + }, + { + "epoch": 0.42, + "grad_norm": 1.5788622173201148, + "learning_rate": 6.536952440436348e-06, + "loss": 0.3401, + "step": 14430 + }, + { + "epoch": 0.42, + "grad_norm": 1.399886793792814, + "learning_rate": 6.536505467621774e-06, + "loss": 0.3196, + "step": 14431 + }, + { + "epoch": 0.42, + "grad_norm": 2.280372115629312, + "learning_rate": 6.536058481247525e-06, + "loss": 0.3065, + "step": 14432 + }, + { + "epoch": 0.42, + "grad_norm": 1.3691633450124754, + "learning_rate": 6.535611481317541e-06, + "loss": 0.3295, + "step": 14433 + }, + { + "epoch": 0.42, + "grad_norm": 1.3666688608119422, + "learning_rate": 6.535164467835769e-06, + "loss": 0.3182, + "step": 14434 + }, + { + "epoch": 0.42, + "grad_norm": 1.327050413405896, + "learning_rate": 6.534717440806152e-06, + "loss": 0.3117, + "step": 14435 + }, + { + "epoch": 0.42, + "grad_norm": 1.8739317487039997, + "learning_rate": 6.534270400232637e-06, + "loss": 0.3282, + "step": 14436 + }, + { + "epoch": 0.42, + "grad_norm": 1.4150600663457849, + "learning_rate": 6.533823346119169e-06, + "loss": 0.333, + "step": 14437 + }, + { + "epoch": 0.42, + "grad_norm": 1.5500713946537077, + "learning_rate": 6.533376278469694e-06, + "loss": 0.3196, + "step": 14438 + }, + { + "epoch": 0.42, + "grad_norm": 1.4801008367718915, + "learning_rate": 6.532929197288155e-06, + "loss": 0.345, + "step": 14439 + }, + { + "epoch": 0.42, + "grad_norm": 1.3330231976862077, + "learning_rate": 6.532482102578499e-06, + "loss": 0.2893, + "step": 14440 + }, + { + "epoch": 0.42, + "grad_norm": 1.3654304438389584, + "learning_rate": 6.532034994344671e-06, + "loss": 0.327, + "step": 14441 + }, + { + "epoch": 0.42, + "grad_norm": 1.4057569543357986, + "learning_rate": 6.531587872590619e-06, + "loss": 0.3167, + "step": 14442 + }, + { + "epoch": 0.42, + "grad_norm": 1.2363354658098835, + "learning_rate": 6.531140737320286e-06, + "loss": 0.326, + "step": 14443 + }, + { + "epoch": 0.42, + "grad_norm": 1.725898472983111, + "learning_rate": 6.530693588537619e-06, + "loss": 0.3397, + "step": 14444 + }, + { + "epoch": 0.42, + "grad_norm": 1.3890310748611931, + "learning_rate": 6.530246426246564e-06, + "loss": 0.3552, + "step": 14445 + }, + { + "epoch": 0.42, + "grad_norm": 1.7609545341195796, + "learning_rate": 6.529799250451067e-06, + "loss": 0.3137, + "step": 14446 + }, + { + "epoch": 0.42, + "grad_norm": 1.5974373882009716, + "learning_rate": 6.529352061155077e-06, + "loss": 0.3296, + "step": 14447 + }, + { + "epoch": 0.42, + "grad_norm": 1.4764429652396887, + "learning_rate": 6.528904858362535e-06, + "loss": 0.3155, + "step": 14448 + }, + { + "epoch": 0.42, + "grad_norm": 1.2404199534653912, + "learning_rate": 6.528457642077391e-06, + "loss": 0.3017, + "step": 14449 + }, + { + "epoch": 0.42, + "grad_norm": 1.984198802226226, + "learning_rate": 6.528010412303593e-06, + "loss": 0.3261, + "step": 14450 + }, + { + "epoch": 0.42, + "grad_norm": 1.2788948480615654, + "learning_rate": 6.5275631690450855e-06, + "loss": 0.3283, + "step": 14451 + }, + { + "epoch": 0.42, + "grad_norm": 1.3939262283026035, + "learning_rate": 6.527115912305816e-06, + "loss": 0.3077, + "step": 14452 + }, + { + "epoch": 0.42, + "grad_norm": 1.2522543721813406, + "learning_rate": 6.5266686420897305e-06, + "loss": 0.3041, + "step": 14453 + }, + { + "epoch": 0.42, + "grad_norm": 1.3243116403835635, + "learning_rate": 6.526221358400778e-06, + "loss": 0.3317, + "step": 14454 + }, + { + "epoch": 0.42, + "grad_norm": 1.2961692478046358, + "learning_rate": 6.525774061242905e-06, + "loss": 0.3216, + "step": 14455 + }, + { + "epoch": 0.42, + "grad_norm": 1.2437159206193171, + "learning_rate": 6.52532675062006e-06, + "loss": 0.3215, + "step": 14456 + }, + { + "epoch": 0.42, + "grad_norm": 1.3927704882746512, + "learning_rate": 6.524879426536189e-06, + "loss": 0.3238, + "step": 14457 + }, + { + "epoch": 0.42, + "grad_norm": 1.5357131428120996, + "learning_rate": 6.524432088995241e-06, + "loss": 0.3511, + "step": 14458 + }, + { + "epoch": 0.42, + "grad_norm": 1.309340328330751, + "learning_rate": 6.523984738001161e-06, + "loss": 0.3327, + "step": 14459 + }, + { + "epoch": 0.42, + "grad_norm": 1.5141250214439135, + "learning_rate": 6.5235373735579e-06, + "loss": 0.3269, + "step": 14460 + }, + { + "epoch": 0.42, + "grad_norm": 1.2766083858317543, + "learning_rate": 6.523089995669405e-06, + "loss": 0.3387, + "step": 14461 + }, + { + "epoch": 0.42, + "grad_norm": 3.124303221990203, + "learning_rate": 6.522642604339624e-06, + "loss": 0.3081, + "step": 14462 + }, + { + "epoch": 0.42, + "grad_norm": 1.4294939259985795, + "learning_rate": 6.5221951995725045e-06, + "loss": 0.3217, + "step": 14463 + }, + { + "epoch": 0.42, + "grad_norm": 1.2819708751955075, + "learning_rate": 6.521747781371994e-06, + "loss": 0.3322, + "step": 14464 + }, + { + "epoch": 0.42, + "grad_norm": 1.394280514168848, + "learning_rate": 6.521300349742046e-06, + "loss": 0.3243, + "step": 14465 + }, + { + "epoch": 0.42, + "grad_norm": 1.2277103449193778, + "learning_rate": 6.520852904686602e-06, + "loss": 0.313, + "step": 14466 + }, + { + "epoch": 0.42, + "grad_norm": 1.5654556632985777, + "learning_rate": 6.520405446209615e-06, + "loss": 0.3056, + "step": 14467 + }, + { + "epoch": 0.42, + "grad_norm": 1.2706862670420638, + "learning_rate": 6.519957974315034e-06, + "loss": 0.3202, + "step": 14468 + }, + { + "epoch": 0.42, + "grad_norm": 1.2357134044717413, + "learning_rate": 6.519510489006809e-06, + "loss": 0.3297, + "step": 14469 + }, + { + "epoch": 0.42, + "grad_norm": 1.3912818534352414, + "learning_rate": 6.519062990288884e-06, + "loss": 0.3428, + "step": 14470 + }, + { + "epoch": 0.42, + "grad_norm": 1.386642818747486, + "learning_rate": 6.518615478165211e-06, + "loss": 0.3093, + "step": 14471 + }, + { + "epoch": 0.42, + "grad_norm": 1.3644454043261016, + "learning_rate": 6.51816795263974e-06, + "loss": 0.3165, + "step": 14472 + }, + { + "epoch": 0.42, + "grad_norm": 1.940244432225116, + "learning_rate": 6.517720413716422e-06, + "loss": 0.3098, + "step": 14473 + }, + { + "epoch": 0.42, + "grad_norm": 1.301468961584835, + "learning_rate": 6.517272861399203e-06, + "loss": 0.3158, + "step": 14474 + }, + { + "epoch": 0.42, + "grad_norm": 1.2662201465026695, + "learning_rate": 6.516825295692034e-06, + "loss": 0.3109, + "step": 14475 + }, + { + "epoch": 0.42, + "grad_norm": 1.6130210752967036, + "learning_rate": 6.516377716598866e-06, + "loss": 0.3133, + "step": 14476 + }, + { + "epoch": 0.42, + "grad_norm": 1.1896361176863504, + "learning_rate": 6.5159301241236475e-06, + "loss": 0.3023, + "step": 14477 + }, + { + "epoch": 0.42, + "grad_norm": 1.4209493426473674, + "learning_rate": 6.515482518270327e-06, + "loss": 0.322, + "step": 14478 + }, + { + "epoch": 0.42, + "grad_norm": 1.498196228608135, + "learning_rate": 6.515034899042859e-06, + "loss": 0.3707, + "step": 14479 + }, + { + "epoch": 0.42, + "grad_norm": 1.4512287490890114, + "learning_rate": 6.51458726644519e-06, + "loss": 0.3219, + "step": 14480 + }, + { + "epoch": 0.42, + "grad_norm": 1.5929804578370104, + "learning_rate": 6.5141396204812724e-06, + "loss": 0.3198, + "step": 14481 + }, + { + "epoch": 0.42, + "grad_norm": 1.3919859595601327, + "learning_rate": 6.513691961155054e-06, + "loss": 0.3252, + "step": 14482 + }, + { + "epoch": 0.42, + "grad_norm": 1.5096238844137466, + "learning_rate": 6.513244288470489e-06, + "loss": 0.3318, + "step": 14483 + }, + { + "epoch": 0.42, + "grad_norm": 1.358060024374482, + "learning_rate": 6.512796602431526e-06, + "loss": 0.3411, + "step": 14484 + }, + { + "epoch": 0.42, + "grad_norm": 7.52258694880362, + "learning_rate": 6.512348903042116e-06, + "loss": 0.3133, + "step": 14485 + }, + { + "epoch": 0.42, + "grad_norm": 0.9317386981413349, + "learning_rate": 6.511901190306209e-06, + "loss": 0.5953, + "step": 14486 + }, + { + "epoch": 0.42, + "grad_norm": 1.2709733873487936, + "learning_rate": 6.5114534642277595e-06, + "loss": 0.3278, + "step": 14487 + }, + { + "epoch": 0.42, + "grad_norm": 1.6415562412127518, + "learning_rate": 6.511005724810715e-06, + "loss": 0.3131, + "step": 14488 + }, + { + "epoch": 0.42, + "grad_norm": 1.714316538252613, + "learning_rate": 6.5105579720590284e-06, + "loss": 0.3359, + "step": 14489 + }, + { + "epoch": 0.42, + "grad_norm": 1.6816437574800116, + "learning_rate": 6.510110205976652e-06, + "loss": 0.3255, + "step": 14490 + }, + { + "epoch": 0.42, + "grad_norm": 1.5793514969628906, + "learning_rate": 6.5096624265675345e-06, + "loss": 0.3317, + "step": 14491 + }, + { + "epoch": 0.42, + "grad_norm": 1.4675704681286106, + "learning_rate": 6.50921463383563e-06, + "loss": 0.3191, + "step": 14492 + }, + { + "epoch": 0.42, + "grad_norm": 1.5321081005031427, + "learning_rate": 6.508766827784891e-06, + "loss": 0.3257, + "step": 14493 + }, + { + "epoch": 0.42, + "grad_norm": 1.537912906129798, + "learning_rate": 6.508319008419266e-06, + "loss": 0.3586, + "step": 14494 + }, + { + "epoch": 0.42, + "grad_norm": 1.4140416771071809, + "learning_rate": 6.507871175742709e-06, + "loss": 0.3302, + "step": 14495 + }, + { + "epoch": 0.42, + "grad_norm": 1.7479371796912242, + "learning_rate": 6.507423329759173e-06, + "loss": 0.3456, + "step": 14496 + }, + { + "epoch": 0.42, + "grad_norm": 1.2760531681595944, + "learning_rate": 6.5069754704726095e-06, + "loss": 0.3216, + "step": 14497 + }, + { + "epoch": 0.42, + "grad_norm": 1.4149906243431476, + "learning_rate": 6.506527597886971e-06, + "loss": 0.3242, + "step": 14498 + }, + { + "epoch": 0.42, + "grad_norm": 1.305719124798416, + "learning_rate": 6.506079712006209e-06, + "loss": 0.331, + "step": 14499 + }, + { + "epoch": 0.42, + "grad_norm": 1.287128888272965, + "learning_rate": 6.5056318128342764e-06, + "loss": 0.315, + "step": 14500 + }, + { + "epoch": 0.42, + "grad_norm": 1.255309430322176, + "learning_rate": 6.5051839003751285e-06, + "loss": 0.3256, + "step": 14501 + }, + { + "epoch": 0.42, + "grad_norm": 1.5018131041195038, + "learning_rate": 6.504735974632714e-06, + "loss": 0.339, + "step": 14502 + }, + { + "epoch": 0.42, + "grad_norm": 1.214179189377621, + "learning_rate": 6.504288035610988e-06, + "loss": 0.3383, + "step": 14503 + }, + { + "epoch": 0.42, + "grad_norm": 1.224091968945049, + "learning_rate": 6.503840083313905e-06, + "loss": 0.3137, + "step": 14504 + }, + { + "epoch": 0.42, + "grad_norm": 1.3786207374323458, + "learning_rate": 6.503392117745415e-06, + "loss": 0.3128, + "step": 14505 + }, + { + "epoch": 0.42, + "grad_norm": 1.4084691967222338, + "learning_rate": 6.5029441389094735e-06, + "loss": 0.3004, + "step": 14506 + }, + { + "epoch": 0.42, + "grad_norm": 1.507160693372229, + "learning_rate": 6.502496146810033e-06, + "loss": 0.3516, + "step": 14507 + }, + { + "epoch": 0.42, + "grad_norm": 1.347611815897115, + "learning_rate": 6.502048141451047e-06, + "loss": 0.3229, + "step": 14508 + }, + { + "epoch": 0.42, + "grad_norm": 1.3933393830851437, + "learning_rate": 6.50160012283647e-06, + "loss": 0.3424, + "step": 14509 + }, + { + "epoch": 0.42, + "grad_norm": 1.2462044784541302, + "learning_rate": 6.501152090970255e-06, + "loss": 0.3105, + "step": 14510 + }, + { + "epoch": 0.42, + "grad_norm": 1.3646528389974415, + "learning_rate": 6.500704045856357e-06, + "loss": 0.332, + "step": 14511 + }, + { + "epoch": 0.42, + "grad_norm": 1.704583136966545, + "learning_rate": 6.500255987498728e-06, + "loss": 0.3206, + "step": 14512 + }, + { + "epoch": 0.42, + "grad_norm": 1.424661750519203, + "learning_rate": 6.4998079159013236e-06, + "loss": 0.3255, + "step": 14513 + }, + { + "epoch": 0.42, + "grad_norm": 0.9709474745921193, + "learning_rate": 6.499359831068097e-06, + "loss": 0.5996, + "step": 14514 + }, + { + "epoch": 0.42, + "grad_norm": 1.5234371618469247, + "learning_rate": 6.498911733003005e-06, + "loss": 0.3457, + "step": 14515 + }, + { + "epoch": 0.42, + "grad_norm": 1.7441676115599785, + "learning_rate": 6.49846362171e-06, + "loss": 0.3325, + "step": 14516 + }, + { + "epoch": 0.42, + "grad_norm": 1.3999407097081003, + "learning_rate": 6.4980154971930355e-06, + "loss": 0.3371, + "step": 14517 + }, + { + "epoch": 0.42, + "grad_norm": 1.4296296953747574, + "learning_rate": 6.49756735945607e-06, + "loss": 0.311, + "step": 14518 + }, + { + "epoch": 0.42, + "grad_norm": 1.334356714294982, + "learning_rate": 6.497119208503053e-06, + "loss": 0.3348, + "step": 14519 + }, + { + "epoch": 0.42, + "grad_norm": 1.2724527792110463, + "learning_rate": 6.4966710443379445e-06, + "loss": 0.3144, + "step": 14520 + }, + { + "epoch": 0.42, + "grad_norm": 1.3793373774320852, + "learning_rate": 6.496222866964696e-06, + "loss": 0.3253, + "step": 14521 + }, + { + "epoch": 0.42, + "grad_norm": 1.380806438552066, + "learning_rate": 6.495774676387266e-06, + "loss": 0.3628, + "step": 14522 + }, + { + "epoch": 0.42, + "grad_norm": 1.9666283845379011, + "learning_rate": 6.495326472609605e-06, + "loss": 0.3095, + "step": 14523 + }, + { + "epoch": 0.42, + "grad_norm": 1.2818259750639582, + "learning_rate": 6.494878255635675e-06, + "loss": 0.3215, + "step": 14524 + }, + { + "epoch": 0.42, + "grad_norm": 1.386125849498779, + "learning_rate": 6.494430025469425e-06, + "loss": 0.315, + "step": 14525 + }, + { + "epoch": 0.42, + "grad_norm": 1.9671785221595708, + "learning_rate": 6.493981782114813e-06, + "loss": 0.3538, + "step": 14526 + }, + { + "epoch": 0.42, + "grad_norm": 1.4681132961776122, + "learning_rate": 6.493533525575797e-06, + "loss": 0.3211, + "step": 14527 + }, + { + "epoch": 0.42, + "grad_norm": 1.3214736734107297, + "learning_rate": 6.493085255856329e-06, + "loss": 0.3629, + "step": 14528 + }, + { + "epoch": 0.42, + "grad_norm": 1.8343084709607553, + "learning_rate": 6.49263697296037e-06, + "loss": 0.3142, + "step": 14529 + }, + { + "epoch": 0.42, + "grad_norm": 1.2944400234211026, + "learning_rate": 6.492188676891872e-06, + "loss": 0.3348, + "step": 14530 + }, + { + "epoch": 0.42, + "grad_norm": 1.2956324249088704, + "learning_rate": 6.491740367654791e-06, + "loss": 0.3071, + "step": 14531 + }, + { + "epoch": 0.42, + "grad_norm": 1.373947284231006, + "learning_rate": 6.491292045253087e-06, + "loss": 0.3533, + "step": 14532 + }, + { + "epoch": 0.42, + "grad_norm": 1.232112555374176, + "learning_rate": 6.490843709690713e-06, + "loss": 0.3244, + "step": 14533 + }, + { + "epoch": 0.42, + "grad_norm": 1.3952767579843064, + "learning_rate": 6.490395360971627e-06, + "loss": 0.3447, + "step": 14534 + }, + { + "epoch": 0.42, + "grad_norm": 1.4172115032911459, + "learning_rate": 6.489946999099786e-06, + "loss": 0.3266, + "step": 14535 + }, + { + "epoch": 0.42, + "grad_norm": 1.9616263237061249, + "learning_rate": 6.489498624079144e-06, + "loss": 0.3343, + "step": 14536 + }, + { + "epoch": 0.42, + "grad_norm": 1.4324207655879155, + "learning_rate": 6.489050235913661e-06, + "loss": 0.2963, + "step": 14537 + }, + { + "epoch": 0.42, + "grad_norm": 0.9582360177940503, + "learning_rate": 6.488601834607294e-06, + "loss": 0.5967, + "step": 14538 + }, + { + "epoch": 0.42, + "grad_norm": 1.2473018556180737, + "learning_rate": 6.488153420163998e-06, + "loss": 0.3404, + "step": 14539 + }, + { + "epoch": 0.42, + "grad_norm": 1.221875559077736, + "learning_rate": 6.487704992587733e-06, + "loss": 0.3285, + "step": 14540 + }, + { + "epoch": 0.42, + "grad_norm": 1.6314056240664707, + "learning_rate": 6.4872565518824535e-06, + "loss": 0.3461, + "step": 14541 + }, + { + "epoch": 0.42, + "grad_norm": 0.9016925054277638, + "learning_rate": 6.486808098052121e-06, + "loss": 0.5759, + "step": 14542 + }, + { + "epoch": 0.42, + "grad_norm": 1.3592736303663961, + "learning_rate": 6.4863596311006886e-06, + "loss": 0.3228, + "step": 14543 + }, + { + "epoch": 0.42, + "grad_norm": 1.3549436651300582, + "learning_rate": 6.485911151032115e-06, + "loss": 0.3085, + "step": 14544 + }, + { + "epoch": 0.42, + "grad_norm": 1.7213376082154481, + "learning_rate": 6.48546265785036e-06, + "loss": 0.3417, + "step": 14545 + }, + { + "epoch": 0.42, + "grad_norm": 1.4518759081891757, + "learning_rate": 6.485014151559381e-06, + "loss": 0.3175, + "step": 14546 + }, + { + "epoch": 0.42, + "grad_norm": 1.514626272236274, + "learning_rate": 6.484565632163136e-06, + "loss": 0.3249, + "step": 14547 + }, + { + "epoch": 0.42, + "grad_norm": 1.3527089345830599, + "learning_rate": 6.484117099665583e-06, + "loss": 0.3196, + "step": 14548 + }, + { + "epoch": 0.42, + "grad_norm": 1.6400605770605026, + "learning_rate": 6.483668554070679e-06, + "loss": 0.314, + "step": 14549 + }, + { + "epoch": 0.42, + "grad_norm": 1.4293044109129174, + "learning_rate": 6.4832199953823835e-06, + "loss": 0.3156, + "step": 14550 + }, + { + "epoch": 0.42, + "grad_norm": 1.3511818622094747, + "learning_rate": 6.482771423604655e-06, + "loss": 0.3491, + "step": 14551 + }, + { + "epoch": 0.42, + "grad_norm": 1.240661749899625, + "learning_rate": 6.4823228387414535e-06, + "loss": 0.3043, + "step": 14552 + }, + { + "epoch": 0.42, + "grad_norm": 1.3501015695211653, + "learning_rate": 6.481874240796735e-06, + "loss": 0.3396, + "step": 14553 + }, + { + "epoch": 0.42, + "grad_norm": 5.033218457234411, + "learning_rate": 6.48142562977446e-06, + "loss": 0.3359, + "step": 14554 + }, + { + "epoch": 0.42, + "grad_norm": 1.347696094829989, + "learning_rate": 6.480977005678588e-06, + "loss": 0.3215, + "step": 14555 + }, + { + "epoch": 0.42, + "grad_norm": 1.3823208840305754, + "learning_rate": 6.48052836851308e-06, + "loss": 0.3214, + "step": 14556 + }, + { + "epoch": 0.42, + "grad_norm": 1.4086891210972738, + "learning_rate": 6.480079718281889e-06, + "loss": 0.3051, + "step": 14557 + }, + { + "epoch": 0.42, + "grad_norm": 1.3344325742145098, + "learning_rate": 6.479631054988979e-06, + "loss": 0.3154, + "step": 14558 + }, + { + "epoch": 0.42, + "grad_norm": 1.4270490492900603, + "learning_rate": 6.479182378638308e-06, + "loss": 0.3366, + "step": 14559 + }, + { + "epoch": 0.42, + "grad_norm": 1.370608055068994, + "learning_rate": 6.478733689233838e-06, + "loss": 0.3566, + "step": 14560 + }, + { + "epoch": 0.42, + "grad_norm": 1.4680397007766344, + "learning_rate": 6.478284986779526e-06, + "loss": 0.3124, + "step": 14561 + }, + { + "epoch": 0.42, + "grad_norm": 1.3053836994177082, + "learning_rate": 6.477836271279332e-06, + "loss": 0.3102, + "step": 14562 + }, + { + "epoch": 0.42, + "grad_norm": 1.4071967518944821, + "learning_rate": 6.477387542737217e-06, + "loss": 0.3222, + "step": 14563 + }, + { + "epoch": 0.42, + "grad_norm": 1.3782091890080994, + "learning_rate": 6.47693880115714e-06, + "loss": 0.33, + "step": 14564 + }, + { + "epoch": 0.42, + "grad_norm": 1.3713046745367143, + "learning_rate": 6.476490046543063e-06, + "loss": 0.3187, + "step": 14565 + }, + { + "epoch": 0.42, + "grad_norm": 1.5280954796247042, + "learning_rate": 6.476041278898944e-06, + "loss": 0.338, + "step": 14566 + }, + { + "epoch": 0.42, + "grad_norm": 1.467694613469466, + "learning_rate": 6.475592498228743e-06, + "loss": 0.3284, + "step": 14567 + }, + { + "epoch": 0.42, + "grad_norm": 7.38144014821426, + "learning_rate": 6.475143704536423e-06, + "loss": 0.3318, + "step": 14568 + }, + { + "epoch": 0.42, + "grad_norm": 1.3449489541750266, + "learning_rate": 6.474694897825944e-06, + "loss": 0.3079, + "step": 14569 + }, + { + "epoch": 0.42, + "grad_norm": 1.5413370923900311, + "learning_rate": 6.4742460781012665e-06, + "loss": 0.3307, + "step": 14570 + }, + { + "epoch": 0.42, + "grad_norm": 1.2977993784738802, + "learning_rate": 6.473797245366349e-06, + "loss": 0.3264, + "step": 14571 + }, + { + "epoch": 0.42, + "grad_norm": 1.3364594151048999, + "learning_rate": 6.473348399625156e-06, + "loss": 0.314, + "step": 14572 + }, + { + "epoch": 0.42, + "grad_norm": 1.4795482835259886, + "learning_rate": 6.472899540881646e-06, + "loss": 0.3139, + "step": 14573 + }, + { + "epoch": 0.42, + "grad_norm": 1.285885867346092, + "learning_rate": 6.472450669139784e-06, + "loss": 0.3257, + "step": 14574 + }, + { + "epoch": 0.42, + "grad_norm": 1.2729186988493213, + "learning_rate": 6.472001784403526e-06, + "loss": 0.3262, + "step": 14575 + }, + { + "epoch": 0.42, + "grad_norm": 1.781352109199048, + "learning_rate": 6.471552886676837e-06, + "loss": 0.3562, + "step": 14576 + }, + { + "epoch": 0.42, + "grad_norm": 1.4147635095811735, + "learning_rate": 6.4711039759636765e-06, + "loss": 0.3502, + "step": 14577 + }, + { + "epoch": 0.42, + "grad_norm": 1.566719050733107, + "learning_rate": 6.470655052268007e-06, + "loss": 0.3246, + "step": 14578 + }, + { + "epoch": 0.42, + "grad_norm": 1.4249283243985638, + "learning_rate": 6.470206115593791e-06, + "loss": 0.3281, + "step": 14579 + }, + { + "epoch": 0.42, + "grad_norm": 1.4740374464479589, + "learning_rate": 6.46975716594499e-06, + "loss": 0.3288, + "step": 14580 + }, + { + "epoch": 0.42, + "grad_norm": 1.5269990861053533, + "learning_rate": 6.469308203325565e-06, + "loss": 0.3351, + "step": 14581 + }, + { + "epoch": 0.42, + "grad_norm": 1.3480387208357076, + "learning_rate": 6.468859227739479e-06, + "loss": 0.3127, + "step": 14582 + }, + { + "epoch": 0.42, + "grad_norm": 2.0381892332468645, + "learning_rate": 6.468410239190694e-06, + "loss": 0.33, + "step": 14583 + }, + { + "epoch": 0.42, + "grad_norm": 1.634694342807784, + "learning_rate": 6.467961237683172e-06, + "loss": 0.3115, + "step": 14584 + }, + { + "epoch": 0.42, + "grad_norm": 1.28077311777696, + "learning_rate": 6.467512223220877e-06, + "loss": 0.3001, + "step": 14585 + }, + { + "epoch": 0.42, + "grad_norm": 1.6251604888798763, + "learning_rate": 6.467063195807769e-06, + "loss": 0.3124, + "step": 14586 + }, + { + "epoch": 0.42, + "grad_norm": 1.3377418589877128, + "learning_rate": 6.466614155447813e-06, + "loss": 0.3279, + "step": 14587 + }, + { + "epoch": 0.42, + "grad_norm": 1.342251902912492, + "learning_rate": 6.46616510214497e-06, + "loss": 0.2918, + "step": 14588 + }, + { + "epoch": 0.42, + "grad_norm": 1.4034815952311823, + "learning_rate": 6.465716035903204e-06, + "loss": 0.3133, + "step": 14589 + }, + { + "epoch": 0.42, + "grad_norm": 1.374493286288794, + "learning_rate": 6.465266956726478e-06, + "loss": 0.3141, + "step": 14590 + }, + { + "epoch": 0.42, + "grad_norm": 1.3345832389594723, + "learning_rate": 6.464817864618756e-06, + "loss": 0.3237, + "step": 14591 + }, + { + "epoch": 0.42, + "grad_norm": 1.5174379702784924, + "learning_rate": 6.464368759583998e-06, + "loss": 0.3361, + "step": 14592 + }, + { + "epoch": 0.42, + "grad_norm": 1.4008320525237246, + "learning_rate": 6.4639196416261704e-06, + "loss": 0.3158, + "step": 14593 + }, + { + "epoch": 0.42, + "grad_norm": 1.3333128908773095, + "learning_rate": 6.4634705107492345e-06, + "loss": 0.3292, + "step": 14594 + }, + { + "epoch": 0.42, + "grad_norm": 1.3497624200507885, + "learning_rate": 6.463021366957156e-06, + "loss": 0.3181, + "step": 14595 + }, + { + "epoch": 0.42, + "grad_norm": 2.1078745686324947, + "learning_rate": 6.462572210253898e-06, + "loss": 0.3104, + "step": 14596 + }, + { + "epoch": 0.42, + "grad_norm": 1.4111091856966596, + "learning_rate": 6.462123040643424e-06, + "loss": 0.3169, + "step": 14597 + }, + { + "epoch": 0.42, + "grad_norm": 1.642562960519896, + "learning_rate": 6.461673858129698e-06, + "loss": 0.3156, + "step": 14598 + }, + { + "epoch": 0.42, + "grad_norm": 1.5146883465114984, + "learning_rate": 6.461224662716683e-06, + "loss": 0.3209, + "step": 14599 + }, + { + "epoch": 0.42, + "grad_norm": 1.7252219145222345, + "learning_rate": 6.460775454408345e-06, + "loss": 0.3678, + "step": 14600 + }, + { + "epoch": 0.42, + "grad_norm": 1.7243090973768045, + "learning_rate": 6.460326233208649e-06, + "loss": 0.3239, + "step": 14601 + }, + { + "epoch": 0.42, + "grad_norm": 1.1987037498137019, + "learning_rate": 6.459876999121557e-06, + "loss": 0.2936, + "step": 14602 + }, + { + "epoch": 0.42, + "grad_norm": 1.4680887501809115, + "learning_rate": 6.459427752151033e-06, + "loss": 0.3093, + "step": 14603 + }, + { + "epoch": 0.42, + "grad_norm": 1.8862080697655768, + "learning_rate": 6.458978492301042e-06, + "loss": 0.3134, + "step": 14604 + }, + { + "epoch": 0.42, + "grad_norm": 2.127152036627928, + "learning_rate": 6.458529219575551e-06, + "loss": 0.3257, + "step": 14605 + }, + { + "epoch": 0.42, + "grad_norm": 1.3441053984384115, + "learning_rate": 6.458079933978523e-06, + "loss": 0.3191, + "step": 14606 + }, + { + "epoch": 0.42, + "grad_norm": 1.3643894153830503, + "learning_rate": 6.4576306355139255e-06, + "loss": 0.3144, + "step": 14607 + }, + { + "epoch": 0.42, + "grad_norm": 1.2742741072571853, + "learning_rate": 6.45718132418572e-06, + "loss": 0.3092, + "step": 14608 + }, + { + "epoch": 0.42, + "grad_norm": 1.6972202725022743, + "learning_rate": 6.456731999997872e-06, + "loss": 0.3424, + "step": 14609 + }, + { + "epoch": 0.42, + "grad_norm": 1.4432314268634143, + "learning_rate": 6.456282662954348e-06, + "loss": 0.3256, + "step": 14610 + }, + { + "epoch": 0.42, + "grad_norm": 1.3340683035125371, + "learning_rate": 6.455833313059115e-06, + "loss": 0.3199, + "step": 14611 + }, + { + "epoch": 0.42, + "grad_norm": 1.6098017519034171, + "learning_rate": 6.455383950316136e-06, + "loss": 0.3257, + "step": 14612 + }, + { + "epoch": 0.42, + "grad_norm": 1.3089922999467316, + "learning_rate": 6.454934574729377e-06, + "loss": 0.3, + "step": 14613 + }, + { + "epoch": 0.42, + "grad_norm": 1.3587500907338197, + "learning_rate": 6.454485186302805e-06, + "loss": 0.3137, + "step": 14614 + }, + { + "epoch": 0.42, + "grad_norm": 1.406594649525785, + "learning_rate": 6.454035785040386e-06, + "loss": 0.3077, + "step": 14615 + }, + { + "epoch": 0.42, + "grad_norm": 1.366589522402715, + "learning_rate": 6.453586370946085e-06, + "loss": 0.3505, + "step": 14616 + }, + { + "epoch": 0.42, + "grad_norm": 1.3902669845318607, + "learning_rate": 6.4531369440238654e-06, + "loss": 0.3137, + "step": 14617 + }, + { + "epoch": 0.42, + "grad_norm": 1.4491671265173156, + "learning_rate": 6.4526875042776985e-06, + "loss": 0.3371, + "step": 14618 + }, + { + "epoch": 0.42, + "grad_norm": 1.542745264514971, + "learning_rate": 6.4522380517115465e-06, + "loss": 0.345, + "step": 14619 + }, + { + "epoch": 0.42, + "grad_norm": 1.2893867857160892, + "learning_rate": 6.45178858632938e-06, + "loss": 0.3187, + "step": 14620 + }, + { + "epoch": 0.42, + "grad_norm": 1.3241515504096375, + "learning_rate": 6.451339108135161e-06, + "loss": 0.3316, + "step": 14621 + }, + { + "epoch": 0.42, + "grad_norm": 1.3875070643941794, + "learning_rate": 6.450889617132859e-06, + "loss": 0.3396, + "step": 14622 + }, + { + "epoch": 0.42, + "grad_norm": 1.1714397237406151, + "learning_rate": 6.450440113326441e-06, + "loss": 0.2882, + "step": 14623 + }, + { + "epoch": 0.42, + "grad_norm": 1.1792515253904792, + "learning_rate": 6.449990596719871e-06, + "loss": 0.3071, + "step": 14624 + }, + { + "epoch": 0.42, + "grad_norm": 1.2810081442784087, + "learning_rate": 6.44954106731712e-06, + "loss": 0.3156, + "step": 14625 + }, + { + "epoch": 0.42, + "grad_norm": 1.507877448758343, + "learning_rate": 6.449091525122151e-06, + "loss": 0.3145, + "step": 14626 + }, + { + "epoch": 0.42, + "grad_norm": 1.3221765517413635, + "learning_rate": 6.4486419701389345e-06, + "loss": 0.3085, + "step": 14627 + }, + { + "epoch": 0.42, + "grad_norm": 1.6254377171113474, + "learning_rate": 6.448192402371436e-06, + "loss": 0.3533, + "step": 14628 + }, + { + "epoch": 0.42, + "grad_norm": 1.3495356822441291, + "learning_rate": 6.447742821823624e-06, + "loss": 0.321, + "step": 14629 + }, + { + "epoch": 0.42, + "grad_norm": 1.5237661104837836, + "learning_rate": 6.447293228499466e-06, + "loss": 0.3458, + "step": 14630 + }, + { + "epoch": 0.42, + "grad_norm": 1.5426008791136494, + "learning_rate": 6.446843622402928e-06, + "loss": 0.3302, + "step": 14631 + }, + { + "epoch": 0.42, + "grad_norm": 1.2855497140539742, + "learning_rate": 6.446394003537979e-06, + "loss": 0.2982, + "step": 14632 + }, + { + "epoch": 0.42, + "grad_norm": 1.283495684413474, + "learning_rate": 6.445944371908588e-06, + "loss": 0.3246, + "step": 14633 + }, + { + "epoch": 0.42, + "grad_norm": 1.464368161893092, + "learning_rate": 6.445494727518722e-06, + "loss": 0.3104, + "step": 14634 + }, + { + "epoch": 0.42, + "grad_norm": 1.3663388277543294, + "learning_rate": 6.4450450703723476e-06, + "loss": 0.3094, + "step": 14635 + }, + { + "epoch": 0.42, + "grad_norm": 1.3162523472409153, + "learning_rate": 6.444595400473434e-06, + "loss": 0.366, + "step": 14636 + }, + { + "epoch": 0.42, + "grad_norm": 1.464625948654975, + "learning_rate": 6.4441457178259516e-06, + "loss": 0.3413, + "step": 14637 + }, + { + "epoch": 0.42, + "grad_norm": 1.3329884165993442, + "learning_rate": 6.443696022433867e-06, + "loss": 0.3275, + "step": 14638 + }, + { + "epoch": 0.42, + "grad_norm": 1.804586190472021, + "learning_rate": 6.443246314301148e-06, + "loss": 0.3257, + "step": 14639 + }, + { + "epoch": 0.42, + "grad_norm": 1.6133769203959332, + "learning_rate": 6.442796593431765e-06, + "loss": 0.3415, + "step": 14640 + }, + { + "epoch": 0.42, + "grad_norm": 1.3077483126100427, + "learning_rate": 6.442346859829686e-06, + "loss": 0.3148, + "step": 14641 + }, + { + "epoch": 0.42, + "grad_norm": 1.918440971062696, + "learning_rate": 6.441897113498878e-06, + "loss": 0.3262, + "step": 14642 + }, + { + "epoch": 0.42, + "grad_norm": 1.2590403315564744, + "learning_rate": 6.441447354443315e-06, + "loss": 0.317, + "step": 14643 + }, + { + "epoch": 0.42, + "grad_norm": 1.252026215527359, + "learning_rate": 6.440997582666964e-06, + "loss": 0.3211, + "step": 14644 + }, + { + "epoch": 0.42, + "grad_norm": 1.2719864990298373, + "learning_rate": 6.44054779817379e-06, + "loss": 0.305, + "step": 14645 + }, + { + "epoch": 0.42, + "grad_norm": 1.29108325003456, + "learning_rate": 6.440098000967767e-06, + "loss": 0.3208, + "step": 14646 + }, + { + "epoch": 0.42, + "grad_norm": 1.5032107645496713, + "learning_rate": 6.439648191052863e-06, + "loss": 0.3222, + "step": 14647 + }, + { + "epoch": 0.42, + "grad_norm": 0.9749760322336799, + "learning_rate": 6.439198368433048e-06, + "loss": 0.6267, + "step": 14648 + }, + { + "epoch": 0.42, + "grad_norm": 1.5785042643596647, + "learning_rate": 6.438748533112291e-06, + "loss": 0.3259, + "step": 14649 + }, + { + "epoch": 0.42, + "grad_norm": 1.605126995560206, + "learning_rate": 6.438298685094563e-06, + "loss": 0.3202, + "step": 14650 + }, + { + "epoch": 0.42, + "grad_norm": 1.3518779863127919, + "learning_rate": 6.437848824383832e-06, + "loss": 0.3422, + "step": 14651 + }, + { + "epoch": 0.42, + "grad_norm": 1.3001585411328203, + "learning_rate": 6.43739895098407e-06, + "loss": 0.3198, + "step": 14652 + }, + { + "epoch": 0.43, + "grad_norm": 1.2953032134529032, + "learning_rate": 6.436949064899247e-06, + "loss": 0.3171, + "step": 14653 + }, + { + "epoch": 0.43, + "grad_norm": 1.5328718908825267, + "learning_rate": 6.436499166133331e-06, + "loss": 0.3215, + "step": 14654 + }, + { + "epoch": 0.43, + "grad_norm": 1.3370835644579955, + "learning_rate": 6.436049254690294e-06, + "loss": 0.3347, + "step": 14655 + }, + { + "epoch": 0.43, + "grad_norm": 1.8736926020691833, + "learning_rate": 6.4355993305741085e-06, + "loss": 0.3144, + "step": 14656 + }, + { + "epoch": 0.43, + "grad_norm": 1.4757834912471457, + "learning_rate": 6.435149393788741e-06, + "loss": 0.315, + "step": 14657 + }, + { + "epoch": 0.43, + "grad_norm": 1.3101652014874672, + "learning_rate": 6.434699444338166e-06, + "loss": 0.3327, + "step": 14658 + }, + { + "epoch": 0.43, + "grad_norm": 1.3957140668232404, + "learning_rate": 6.43424948222635e-06, + "loss": 0.3186, + "step": 14659 + }, + { + "epoch": 0.43, + "grad_norm": 1.3942146573197378, + "learning_rate": 6.4337995074572675e-06, + "loss": 0.2972, + "step": 14660 + }, + { + "epoch": 0.43, + "grad_norm": 1.3445263462187091, + "learning_rate": 6.433349520034888e-06, + "loss": 0.3171, + "step": 14661 + }, + { + "epoch": 0.43, + "grad_norm": 1.4502111367679023, + "learning_rate": 6.432899519963185e-06, + "loss": 0.3278, + "step": 14662 + }, + { + "epoch": 0.43, + "grad_norm": 1.3407805348269275, + "learning_rate": 6.4324495072461255e-06, + "loss": 0.3049, + "step": 14663 + }, + { + "epoch": 0.43, + "grad_norm": 2.523823562057877, + "learning_rate": 6.431999481887685e-06, + "loss": 0.3266, + "step": 14664 + }, + { + "epoch": 0.43, + "grad_norm": 1.2845503486195518, + "learning_rate": 6.431549443891832e-06, + "loss": 0.308, + "step": 14665 + }, + { + "epoch": 0.43, + "grad_norm": 1.375616045198146, + "learning_rate": 6.4310993932625395e-06, + "loss": 0.3052, + "step": 14666 + }, + { + "epoch": 0.43, + "grad_norm": 1.5932126700400593, + "learning_rate": 6.430649330003779e-06, + "loss": 0.3179, + "step": 14667 + }, + { + "epoch": 0.43, + "grad_norm": 1.5112694038839674, + "learning_rate": 6.430199254119522e-06, + "loss": 0.3204, + "step": 14668 + }, + { + "epoch": 0.43, + "grad_norm": 1.4857078062260294, + "learning_rate": 6.429749165613741e-06, + "loss": 0.3215, + "step": 14669 + }, + { + "epoch": 0.43, + "grad_norm": 1.3391227467549616, + "learning_rate": 6.429299064490409e-06, + "loss": 0.3134, + "step": 14670 + }, + { + "epoch": 0.43, + "grad_norm": 1.3330258887748845, + "learning_rate": 6.428848950753496e-06, + "loss": 0.3186, + "step": 14671 + }, + { + "epoch": 0.43, + "grad_norm": 1.5697258551836843, + "learning_rate": 6.428398824406975e-06, + "loss": 0.3391, + "step": 14672 + }, + { + "epoch": 0.43, + "grad_norm": 1.2908384005189557, + "learning_rate": 6.427948685454819e-06, + "loss": 0.3283, + "step": 14673 + }, + { + "epoch": 0.43, + "grad_norm": 1.3885864487123607, + "learning_rate": 6.427498533900999e-06, + "loss": 0.3276, + "step": 14674 + }, + { + "epoch": 0.43, + "grad_norm": 1.2924892394033372, + "learning_rate": 6.427048369749489e-06, + "loss": 0.3231, + "step": 14675 + }, + { + "epoch": 0.43, + "grad_norm": 0.9462892959018453, + "learning_rate": 6.426598193004262e-06, + "loss": 0.5953, + "step": 14676 + }, + { + "epoch": 0.43, + "grad_norm": 1.4226016533364163, + "learning_rate": 6.426148003669289e-06, + "loss": 0.3328, + "step": 14677 + }, + { + "epoch": 0.43, + "grad_norm": 1.3084701615237744, + "learning_rate": 6.425697801748545e-06, + "loss": 0.3101, + "step": 14678 + }, + { + "epoch": 0.43, + "grad_norm": 1.2569556674857008, + "learning_rate": 6.425247587246001e-06, + "loss": 0.3332, + "step": 14679 + }, + { + "epoch": 0.43, + "grad_norm": 1.9419692307321308, + "learning_rate": 6.424797360165632e-06, + "loss": 0.3139, + "step": 14680 + }, + { + "epoch": 0.43, + "grad_norm": 1.4099939223713773, + "learning_rate": 6.424347120511411e-06, + "loss": 0.3407, + "step": 14681 + }, + { + "epoch": 0.43, + "grad_norm": 1.2901437971410887, + "learning_rate": 6.423896868287309e-06, + "loss": 0.3337, + "step": 14682 + }, + { + "epoch": 0.43, + "grad_norm": 1.7416663418189462, + "learning_rate": 6.423446603497303e-06, + "loss": 0.3295, + "step": 14683 + }, + { + "epoch": 0.43, + "grad_norm": 1.390232751301629, + "learning_rate": 6.422996326145365e-06, + "loss": 0.3408, + "step": 14684 + }, + { + "epoch": 0.43, + "grad_norm": 1.2185487705193698, + "learning_rate": 6.422546036235467e-06, + "loss": 0.3068, + "step": 14685 + }, + { + "epoch": 0.43, + "grad_norm": 1.2515647063533069, + "learning_rate": 6.422095733771587e-06, + "loss": 0.3259, + "step": 14686 + }, + { + "epoch": 0.43, + "grad_norm": 1.3662724925952956, + "learning_rate": 6.4216454187576945e-06, + "loss": 0.3578, + "step": 14687 + }, + { + "epoch": 0.43, + "grad_norm": 1.3336250218288925, + "learning_rate": 6.421195091197766e-06, + "loss": 0.3241, + "step": 14688 + }, + { + "epoch": 0.43, + "grad_norm": 1.3592370824179199, + "learning_rate": 6.420744751095775e-06, + "loss": 0.3335, + "step": 14689 + }, + { + "epoch": 0.43, + "grad_norm": 1.3863260260697838, + "learning_rate": 6.4202943984556965e-06, + "loss": 0.3111, + "step": 14690 + }, + { + "epoch": 0.43, + "grad_norm": 1.3846185857616309, + "learning_rate": 6.419844033281503e-06, + "loss": 0.3137, + "step": 14691 + }, + { + "epoch": 0.43, + "grad_norm": 1.4992476656975515, + "learning_rate": 6.419393655577171e-06, + "loss": 0.3231, + "step": 14692 + }, + { + "epoch": 0.43, + "grad_norm": 0.9600783067148237, + "learning_rate": 6.418943265346673e-06, + "loss": 0.6002, + "step": 14693 + }, + { + "epoch": 0.43, + "grad_norm": 2.0334104401692845, + "learning_rate": 6.4184928625939854e-06, + "loss": 0.3277, + "step": 14694 + }, + { + "epoch": 0.43, + "grad_norm": 1.4116606052030864, + "learning_rate": 6.418042447323083e-06, + "loss": 0.3398, + "step": 14695 + }, + { + "epoch": 0.43, + "grad_norm": 1.7815687364265131, + "learning_rate": 6.417592019537941e-06, + "loss": 0.3391, + "step": 14696 + }, + { + "epoch": 0.43, + "grad_norm": 1.335570630458346, + "learning_rate": 6.417141579242532e-06, + "loss": 0.2913, + "step": 14697 + }, + { + "epoch": 0.43, + "grad_norm": 1.2922459965152557, + "learning_rate": 6.416691126440835e-06, + "loss": 0.3019, + "step": 14698 + }, + { + "epoch": 0.43, + "grad_norm": 1.3375483462594673, + "learning_rate": 6.416240661136821e-06, + "loss": 0.3254, + "step": 14699 + }, + { + "epoch": 0.43, + "grad_norm": 1.3057685881280188, + "learning_rate": 6.41579018333447e-06, + "loss": 0.3188, + "step": 14700 + }, + { + "epoch": 0.43, + "grad_norm": 1.2990258285362557, + "learning_rate": 6.415339693037752e-06, + "loss": 0.3184, + "step": 14701 + }, + { + "epoch": 0.43, + "grad_norm": 1.4709336298459552, + "learning_rate": 6.414889190250646e-06, + "loss": 0.311, + "step": 14702 + }, + { + "epoch": 0.43, + "grad_norm": 1.4156316290364677, + "learning_rate": 6.414438674977129e-06, + "loss": 0.3162, + "step": 14703 + }, + { + "epoch": 0.43, + "grad_norm": 0.8991417222335409, + "learning_rate": 6.413988147221174e-06, + "loss": 0.5905, + "step": 14704 + }, + { + "epoch": 0.43, + "grad_norm": 1.3735444052468657, + "learning_rate": 6.413537606986757e-06, + "loss": 0.3138, + "step": 14705 + }, + { + "epoch": 0.43, + "grad_norm": 1.2223190464633336, + "learning_rate": 6.413087054277856e-06, + "loss": 0.3203, + "step": 14706 + }, + { + "epoch": 0.43, + "grad_norm": 1.6619969546636644, + "learning_rate": 6.412636489098446e-06, + "loss": 0.3207, + "step": 14707 + }, + { + "epoch": 0.43, + "grad_norm": 1.5012450566168172, + "learning_rate": 6.412185911452503e-06, + "loss": 0.3197, + "step": 14708 + }, + { + "epoch": 0.43, + "grad_norm": 3.0405816251561473, + "learning_rate": 6.4117353213440034e-06, + "loss": 0.3396, + "step": 14709 + }, + { + "epoch": 0.43, + "grad_norm": 1.488934807530108, + "learning_rate": 6.411284718776925e-06, + "loss": 0.3379, + "step": 14710 + }, + { + "epoch": 0.43, + "grad_norm": 1.4810076348774737, + "learning_rate": 6.410834103755243e-06, + "loss": 0.3164, + "step": 14711 + }, + { + "epoch": 0.43, + "grad_norm": 1.2611445046079324, + "learning_rate": 6.410383476282933e-06, + "loss": 0.286, + "step": 14712 + }, + { + "epoch": 0.43, + "grad_norm": 1.2379308171856316, + "learning_rate": 6.409932836363975e-06, + "loss": 0.3176, + "step": 14713 + }, + { + "epoch": 0.43, + "grad_norm": 1.329225726729139, + "learning_rate": 6.409482184002344e-06, + "loss": 0.3144, + "step": 14714 + }, + { + "epoch": 0.43, + "grad_norm": 1.2391472966191486, + "learning_rate": 6.409031519202016e-06, + "loss": 0.3157, + "step": 14715 + }, + { + "epoch": 0.43, + "grad_norm": 1.3006626400302792, + "learning_rate": 6.40858084196697e-06, + "loss": 0.3177, + "step": 14716 + }, + { + "epoch": 0.43, + "grad_norm": 1.2263937687644464, + "learning_rate": 6.408130152301182e-06, + "loss": 0.3087, + "step": 14717 + }, + { + "epoch": 0.43, + "grad_norm": 1.4551742052758156, + "learning_rate": 6.40767945020863e-06, + "loss": 0.3017, + "step": 14718 + }, + { + "epoch": 0.43, + "grad_norm": 1.7200827605269435, + "learning_rate": 6.40722873569329e-06, + "loss": 0.321, + "step": 14719 + }, + { + "epoch": 0.43, + "grad_norm": 1.3943049454136363, + "learning_rate": 6.4067780087591415e-06, + "loss": 0.3392, + "step": 14720 + }, + { + "epoch": 0.43, + "grad_norm": 1.3630230567982597, + "learning_rate": 6.406327269410163e-06, + "loss": 0.2879, + "step": 14721 + }, + { + "epoch": 0.43, + "grad_norm": 1.3229973562070998, + "learning_rate": 6.405876517650329e-06, + "loss": 0.3456, + "step": 14722 + }, + { + "epoch": 0.43, + "grad_norm": 1.702558146846287, + "learning_rate": 6.40542575348362e-06, + "loss": 0.33, + "step": 14723 + }, + { + "epoch": 0.43, + "grad_norm": 1.2793222301822031, + "learning_rate": 6.404974976914012e-06, + "loss": 0.3356, + "step": 14724 + }, + { + "epoch": 0.43, + "grad_norm": 1.2841264795160312, + "learning_rate": 6.404524187945485e-06, + "loss": 0.3189, + "step": 14725 + }, + { + "epoch": 0.43, + "grad_norm": 1.55045073726989, + "learning_rate": 6.404073386582017e-06, + "loss": 0.3338, + "step": 14726 + }, + { + "epoch": 0.43, + "grad_norm": 1.634784625812517, + "learning_rate": 6.403622572827584e-06, + "loss": 0.3377, + "step": 14727 + }, + { + "epoch": 0.43, + "grad_norm": 1.7433054130055798, + "learning_rate": 6.40317174668617e-06, + "loss": 0.3212, + "step": 14728 + }, + { + "epoch": 0.43, + "grad_norm": 1.4894604169500263, + "learning_rate": 6.402720908161746e-06, + "loss": 0.3182, + "step": 14729 + }, + { + "epoch": 0.43, + "grad_norm": 1.3512682422530644, + "learning_rate": 6.402270057258297e-06, + "loss": 0.3311, + "step": 14730 + }, + { + "epoch": 0.43, + "grad_norm": 1.2342348629288615, + "learning_rate": 6.401819193979797e-06, + "loss": 0.3041, + "step": 14731 + }, + { + "epoch": 0.43, + "grad_norm": 1.3551262096558236, + "learning_rate": 6.4013683183302275e-06, + "loss": 0.3026, + "step": 14732 + }, + { + "epoch": 0.43, + "grad_norm": 1.2482345623901494, + "learning_rate": 6.400917430313568e-06, + "loss": 0.3204, + "step": 14733 + }, + { + "epoch": 0.43, + "grad_norm": 1.2574913761382862, + "learning_rate": 6.400466529933796e-06, + "loss": 0.3151, + "step": 14734 + }, + { + "epoch": 0.43, + "grad_norm": 1.496385705764465, + "learning_rate": 6.400015617194892e-06, + "loss": 0.347, + "step": 14735 + }, + { + "epoch": 0.43, + "grad_norm": 1.5191809058326242, + "learning_rate": 6.399564692100833e-06, + "loss": 0.338, + "step": 14736 + }, + { + "epoch": 0.43, + "grad_norm": 1.3603711087569907, + "learning_rate": 6.399113754655602e-06, + "loss": 0.3369, + "step": 14737 + }, + { + "epoch": 0.43, + "grad_norm": 1.5901961368212205, + "learning_rate": 6.398662804863176e-06, + "loss": 0.3144, + "step": 14738 + }, + { + "epoch": 0.43, + "grad_norm": 1.4123825754682586, + "learning_rate": 6.398211842727536e-06, + "loss": 0.3292, + "step": 14739 + }, + { + "epoch": 0.43, + "grad_norm": 1.4536103318032607, + "learning_rate": 6.39776086825266e-06, + "loss": 0.3386, + "step": 14740 + }, + { + "epoch": 0.43, + "grad_norm": 1.23598714411601, + "learning_rate": 6.397309881442528e-06, + "loss": 0.3163, + "step": 14741 + }, + { + "epoch": 0.43, + "grad_norm": 1.3748272405654098, + "learning_rate": 6.396858882301121e-06, + "loss": 0.3179, + "step": 14742 + }, + { + "epoch": 0.43, + "grad_norm": 2.119194928300888, + "learning_rate": 6.396407870832419e-06, + "loss": 0.3658, + "step": 14743 + }, + { + "epoch": 0.43, + "grad_norm": 2.0059494897451815, + "learning_rate": 6.395956847040404e-06, + "loss": 0.3376, + "step": 14744 + }, + { + "epoch": 0.43, + "grad_norm": 1.3407045702420703, + "learning_rate": 6.395505810929053e-06, + "loss": 0.339, + "step": 14745 + }, + { + "epoch": 0.43, + "grad_norm": 1.4011517069247967, + "learning_rate": 6.395054762502347e-06, + "loss": 0.3169, + "step": 14746 + }, + { + "epoch": 0.43, + "grad_norm": 1.497506823148616, + "learning_rate": 6.394603701764268e-06, + "loss": 0.3209, + "step": 14747 + }, + { + "epoch": 0.43, + "grad_norm": 1.471725545940272, + "learning_rate": 6.3941526287187975e-06, + "loss": 0.3375, + "step": 14748 + }, + { + "epoch": 0.43, + "grad_norm": 1.427897033995981, + "learning_rate": 6.3937015433699125e-06, + "loss": 0.3281, + "step": 14749 + }, + { + "epoch": 0.43, + "grad_norm": 1.2293278240969563, + "learning_rate": 6.393250445721595e-06, + "loss": 0.2952, + "step": 14750 + }, + { + "epoch": 0.43, + "grad_norm": 2.3689156990564726, + "learning_rate": 6.3927993357778285e-06, + "loss": 0.3501, + "step": 14751 + }, + { + "epoch": 0.43, + "grad_norm": 2.061607571614923, + "learning_rate": 6.392348213542592e-06, + "loss": 0.3339, + "step": 14752 + }, + { + "epoch": 0.43, + "grad_norm": 1.3356913950605476, + "learning_rate": 6.391897079019866e-06, + "loss": 0.3238, + "step": 14753 + }, + { + "epoch": 0.43, + "grad_norm": 1.2455776469794684, + "learning_rate": 6.391445932213635e-06, + "loss": 0.3103, + "step": 14754 + }, + { + "epoch": 0.43, + "grad_norm": 1.3504827081456208, + "learning_rate": 6.3909947731278764e-06, + "loss": 0.3353, + "step": 14755 + }, + { + "epoch": 0.43, + "grad_norm": 1.3350006926047848, + "learning_rate": 6.390543601766574e-06, + "loss": 0.323, + "step": 14756 + }, + { + "epoch": 0.43, + "grad_norm": 1.558633667457883, + "learning_rate": 6.3900924181337095e-06, + "loss": 0.3077, + "step": 14757 + }, + { + "epoch": 0.43, + "grad_norm": 1.5319142094137659, + "learning_rate": 6.389641222233265e-06, + "loss": 0.3141, + "step": 14758 + }, + { + "epoch": 0.43, + "grad_norm": 1.315716637383777, + "learning_rate": 6.38919001406922e-06, + "loss": 0.3039, + "step": 14759 + }, + { + "epoch": 0.43, + "grad_norm": 2.285648984079623, + "learning_rate": 6.388738793645557e-06, + "loss": 0.3304, + "step": 14760 + }, + { + "epoch": 0.43, + "grad_norm": 1.4013004141223404, + "learning_rate": 6.388287560966259e-06, + "loss": 0.3241, + "step": 14761 + }, + { + "epoch": 0.43, + "grad_norm": 1.6608756691451867, + "learning_rate": 6.3878363160353095e-06, + "loss": 0.3638, + "step": 14762 + }, + { + "epoch": 0.43, + "grad_norm": 1.4333160677771593, + "learning_rate": 6.387385058856688e-06, + "loss": 0.3104, + "step": 14763 + }, + { + "epoch": 0.43, + "grad_norm": 1.365909269754125, + "learning_rate": 6.3869337894343766e-06, + "loss": 0.323, + "step": 14764 + }, + { + "epoch": 0.43, + "grad_norm": 1.3482669631166115, + "learning_rate": 6.38648250777236e-06, + "loss": 0.326, + "step": 14765 + }, + { + "epoch": 0.43, + "grad_norm": 1.289084522181366, + "learning_rate": 6.386031213874622e-06, + "loss": 0.2971, + "step": 14766 + }, + { + "epoch": 0.43, + "grad_norm": 0.9182803159384958, + "learning_rate": 6.385579907745141e-06, + "loss": 0.591, + "step": 14767 + }, + { + "epoch": 0.43, + "grad_norm": 1.4219974738942482, + "learning_rate": 6.385128589387901e-06, + "loss": 0.349, + "step": 14768 + }, + { + "epoch": 0.43, + "grad_norm": 2.4520366852211515, + "learning_rate": 6.384677258806887e-06, + "loss": 0.3003, + "step": 14769 + }, + { + "epoch": 0.43, + "grad_norm": 2.0223520726260316, + "learning_rate": 6.384225916006082e-06, + "loss": 0.3281, + "step": 14770 + }, + { + "epoch": 0.43, + "grad_norm": 1.6704692534161274, + "learning_rate": 6.383774560989467e-06, + "loss": 0.3013, + "step": 14771 + }, + { + "epoch": 0.43, + "grad_norm": 1.4290623496240504, + "learning_rate": 6.383323193761026e-06, + "loss": 0.3183, + "step": 14772 + }, + { + "epoch": 0.43, + "grad_norm": 1.3654096395980526, + "learning_rate": 6.382871814324742e-06, + "loss": 0.345, + "step": 14773 + }, + { + "epoch": 0.43, + "grad_norm": 1.5716651557313268, + "learning_rate": 6.3824204226846e-06, + "loss": 0.3129, + "step": 14774 + }, + { + "epoch": 0.43, + "grad_norm": 1.2654382942173181, + "learning_rate": 6.381969018844582e-06, + "loss": 0.3366, + "step": 14775 + }, + { + "epoch": 0.43, + "grad_norm": 1.359641053725739, + "learning_rate": 6.3815176028086724e-06, + "loss": 0.3289, + "step": 14776 + }, + { + "epoch": 0.43, + "grad_norm": 0.9436098332316314, + "learning_rate": 6.381066174580855e-06, + "loss": 0.5624, + "step": 14777 + }, + { + "epoch": 0.43, + "grad_norm": 1.3426215477922026, + "learning_rate": 6.380614734165111e-06, + "loss": 0.3176, + "step": 14778 + }, + { + "epoch": 0.43, + "grad_norm": 2.1423547564895355, + "learning_rate": 6.380163281565428e-06, + "loss": 0.3054, + "step": 14779 + }, + { + "epoch": 0.43, + "grad_norm": 1.4707477049134028, + "learning_rate": 6.37971181678579e-06, + "loss": 0.2966, + "step": 14780 + }, + { + "epoch": 0.43, + "grad_norm": 1.8477299368838793, + "learning_rate": 6.379260339830178e-06, + "loss": 0.3241, + "step": 14781 + }, + { + "epoch": 0.43, + "grad_norm": 1.5483746871877517, + "learning_rate": 6.378808850702578e-06, + "loss": 0.3292, + "step": 14782 + }, + { + "epoch": 0.43, + "grad_norm": 1.5347277870848073, + "learning_rate": 6.3783573494069764e-06, + "loss": 0.3239, + "step": 14783 + }, + { + "epoch": 0.43, + "grad_norm": 1.336262233384088, + "learning_rate": 6.377905835947355e-06, + "loss": 0.3331, + "step": 14784 + }, + { + "epoch": 0.43, + "grad_norm": 2.8006190775479745, + "learning_rate": 6.377454310327701e-06, + "loss": 0.3337, + "step": 14785 + }, + { + "epoch": 0.43, + "grad_norm": 1.3058901312325182, + "learning_rate": 6.377002772551995e-06, + "loss": 0.3411, + "step": 14786 + }, + { + "epoch": 0.43, + "grad_norm": 1.4049791800490339, + "learning_rate": 6.3765512226242255e-06, + "loss": 0.3169, + "step": 14787 + }, + { + "epoch": 0.43, + "grad_norm": 1.3680668283518094, + "learning_rate": 6.376099660548375e-06, + "loss": 0.2999, + "step": 14788 + }, + { + "epoch": 0.43, + "grad_norm": 1.434723703396782, + "learning_rate": 6.375648086328431e-06, + "loss": 0.3385, + "step": 14789 + }, + { + "epoch": 0.43, + "grad_norm": 1.3530676534655206, + "learning_rate": 6.375196499968377e-06, + "loss": 0.3298, + "step": 14790 + }, + { + "epoch": 0.43, + "grad_norm": 1.3510238785640254, + "learning_rate": 6.374744901472198e-06, + "loss": 0.3233, + "step": 14791 + }, + { + "epoch": 0.43, + "grad_norm": 1.469799218668696, + "learning_rate": 6.37429329084388e-06, + "loss": 0.3128, + "step": 14792 + }, + { + "epoch": 0.43, + "grad_norm": 1.5888908284389545, + "learning_rate": 6.3738416680874085e-06, + "loss": 0.3534, + "step": 14793 + }, + { + "epoch": 0.43, + "grad_norm": 1.324727304793539, + "learning_rate": 6.37339003320677e-06, + "loss": 0.3142, + "step": 14794 + }, + { + "epoch": 0.43, + "grad_norm": 1.4599951221863001, + "learning_rate": 6.372938386205949e-06, + "loss": 0.3285, + "step": 14795 + }, + { + "epoch": 0.43, + "grad_norm": 0.9874173739376695, + "learning_rate": 6.372486727088931e-06, + "loss": 0.5722, + "step": 14796 + }, + { + "epoch": 0.43, + "grad_norm": 0.9398513962103066, + "learning_rate": 6.372035055859702e-06, + "loss": 0.6226, + "step": 14797 + }, + { + "epoch": 0.43, + "grad_norm": 1.3370432410638973, + "learning_rate": 6.3715833725222495e-06, + "loss": 0.32, + "step": 14798 + }, + { + "epoch": 0.43, + "grad_norm": 1.345827598506664, + "learning_rate": 6.37113167708056e-06, + "loss": 0.335, + "step": 14799 + }, + { + "epoch": 0.43, + "grad_norm": 1.5441649882307482, + "learning_rate": 6.370679969538616e-06, + "loss": 0.3389, + "step": 14800 + }, + { + "epoch": 0.43, + "grad_norm": 1.3660746130863577, + "learning_rate": 6.370228249900405e-06, + "loss": 0.2892, + "step": 14801 + }, + { + "epoch": 0.43, + "grad_norm": 1.3922973767923585, + "learning_rate": 6.369776518169917e-06, + "loss": 0.3162, + "step": 14802 + }, + { + "epoch": 0.43, + "grad_norm": 1.763121766042214, + "learning_rate": 6.369324774351135e-06, + "loss": 0.3234, + "step": 14803 + }, + { + "epoch": 0.43, + "grad_norm": 2.147183366072166, + "learning_rate": 6.368873018448046e-06, + "loss": 0.3242, + "step": 14804 + }, + { + "epoch": 0.43, + "grad_norm": 1.337350602642572, + "learning_rate": 6.368421250464638e-06, + "loss": 0.364, + "step": 14805 + }, + { + "epoch": 0.43, + "grad_norm": 2.0087992371786467, + "learning_rate": 6.367969470404896e-06, + "loss": 0.3099, + "step": 14806 + }, + { + "epoch": 0.43, + "grad_norm": 1.6656969857471253, + "learning_rate": 6.367517678272811e-06, + "loss": 0.3343, + "step": 14807 + }, + { + "epoch": 0.43, + "grad_norm": 1.5307757504538393, + "learning_rate": 6.3670658740723655e-06, + "loss": 0.3323, + "step": 14808 + }, + { + "epoch": 0.43, + "grad_norm": 1.370715389834046, + "learning_rate": 6.366614057807548e-06, + "loss": 0.3037, + "step": 14809 + }, + { + "epoch": 0.43, + "grad_norm": 1.282128810294746, + "learning_rate": 6.366162229482347e-06, + "loss": 0.316, + "step": 14810 + }, + { + "epoch": 0.43, + "grad_norm": 1.3502787148934812, + "learning_rate": 6.36571038910075e-06, + "loss": 0.3057, + "step": 14811 + }, + { + "epoch": 0.43, + "grad_norm": 1.2407585701798631, + "learning_rate": 6.365258536666743e-06, + "loss": 0.3168, + "step": 14812 + }, + { + "epoch": 0.43, + "grad_norm": 1.481852224995986, + "learning_rate": 6.364806672184316e-06, + "loss": 0.3148, + "step": 14813 + }, + { + "epoch": 0.43, + "grad_norm": 1.5965445993796574, + "learning_rate": 6.364354795657453e-06, + "loss": 0.3159, + "step": 14814 + }, + { + "epoch": 0.43, + "grad_norm": 1.516588912788498, + "learning_rate": 6.363902907090143e-06, + "loss": 0.3245, + "step": 14815 + }, + { + "epoch": 0.43, + "grad_norm": 1.898733953787802, + "learning_rate": 6.363451006486375e-06, + "loss": 0.3096, + "step": 14816 + }, + { + "epoch": 0.43, + "grad_norm": 1.3648330026645963, + "learning_rate": 6.362999093850138e-06, + "loss": 0.3245, + "step": 14817 + }, + { + "epoch": 0.43, + "grad_norm": 1.29869401840369, + "learning_rate": 6.362547169185419e-06, + "loss": 0.3177, + "step": 14818 + }, + { + "epoch": 0.43, + "grad_norm": 1.4356100955352757, + "learning_rate": 6.362095232496205e-06, + "loss": 0.3227, + "step": 14819 + }, + { + "epoch": 0.43, + "grad_norm": 1.4200502297484878, + "learning_rate": 6.361643283786485e-06, + "loss": 0.3095, + "step": 14820 + }, + { + "epoch": 0.43, + "grad_norm": 1.6474384040169705, + "learning_rate": 6.36119132306025e-06, + "loss": 0.3178, + "step": 14821 + }, + { + "epoch": 0.43, + "grad_norm": 3.2321098429767763, + "learning_rate": 6.360739350321485e-06, + "loss": 0.3282, + "step": 14822 + }, + { + "epoch": 0.43, + "grad_norm": 1.4118053797730983, + "learning_rate": 6.3602873655741805e-06, + "loss": 0.3009, + "step": 14823 + }, + { + "epoch": 0.43, + "grad_norm": 1.5378815076772225, + "learning_rate": 6.359835368822325e-06, + "loss": 0.3212, + "step": 14824 + }, + { + "epoch": 0.43, + "grad_norm": 2.186913242140831, + "learning_rate": 6.3593833600699076e-06, + "loss": 0.3173, + "step": 14825 + }, + { + "epoch": 0.43, + "grad_norm": 1.460077359351232, + "learning_rate": 6.358931339320916e-06, + "loss": 0.3381, + "step": 14826 + }, + { + "epoch": 0.43, + "grad_norm": 1.4411186322501042, + "learning_rate": 6.358479306579341e-06, + "loss": 0.3429, + "step": 14827 + }, + { + "epoch": 0.43, + "grad_norm": 1.3457702912228118, + "learning_rate": 6.358027261849171e-06, + "loss": 0.3194, + "step": 14828 + }, + { + "epoch": 0.43, + "grad_norm": 0.9538018770250714, + "learning_rate": 6.357575205134395e-06, + "loss": 0.6111, + "step": 14829 + }, + { + "epoch": 0.43, + "grad_norm": 1.9811581713597108, + "learning_rate": 6.3571231364390025e-06, + "loss": 0.3305, + "step": 14830 + }, + { + "epoch": 0.43, + "grad_norm": 1.2949602610265332, + "learning_rate": 6.356671055766984e-06, + "loss": 0.3325, + "step": 14831 + }, + { + "epoch": 0.43, + "grad_norm": 1.3244458542223547, + "learning_rate": 6.356218963122328e-06, + "loss": 0.3419, + "step": 14832 + }, + { + "epoch": 0.43, + "grad_norm": 1.4705294713825177, + "learning_rate": 6.3557668585090245e-06, + "loss": 0.3277, + "step": 14833 + }, + { + "epoch": 0.43, + "grad_norm": 1.4385088845450515, + "learning_rate": 6.355314741931064e-06, + "loss": 0.3314, + "step": 14834 + }, + { + "epoch": 0.43, + "grad_norm": 1.5927912010515142, + "learning_rate": 6.354862613392436e-06, + "loss": 0.3139, + "step": 14835 + }, + { + "epoch": 0.43, + "grad_norm": 1.3946063360210748, + "learning_rate": 6.3544104728971286e-06, + "loss": 0.3111, + "step": 14836 + }, + { + "epoch": 0.43, + "grad_norm": 1.393079544200009, + "learning_rate": 6.353958320449135e-06, + "loss": 0.376, + "step": 14837 + }, + { + "epoch": 0.43, + "grad_norm": 1.003790307148401, + "learning_rate": 6.353506156052445e-06, + "loss": 0.6783, + "step": 14838 + }, + { + "epoch": 0.43, + "grad_norm": 1.7198306306225475, + "learning_rate": 6.353053979711048e-06, + "loss": 0.2931, + "step": 14839 + }, + { + "epoch": 0.43, + "grad_norm": 1.7101678058163057, + "learning_rate": 6.352601791428934e-06, + "loss": 0.3487, + "step": 14840 + }, + { + "epoch": 0.43, + "grad_norm": 1.6399522567376354, + "learning_rate": 6.3521495912100935e-06, + "loss": 0.3226, + "step": 14841 + }, + { + "epoch": 0.43, + "grad_norm": 1.3764882023772336, + "learning_rate": 6.351697379058519e-06, + "loss": 0.3171, + "step": 14842 + }, + { + "epoch": 0.43, + "grad_norm": 1.394502007244546, + "learning_rate": 6.3512451549782e-06, + "loss": 0.3151, + "step": 14843 + }, + { + "epoch": 0.43, + "grad_norm": 1.438168999628036, + "learning_rate": 6.3507929189731275e-06, + "loss": 0.3044, + "step": 14844 + }, + { + "epoch": 0.43, + "grad_norm": 1.4815022311035637, + "learning_rate": 6.350340671047291e-06, + "loss": 0.3276, + "step": 14845 + }, + { + "epoch": 0.43, + "grad_norm": 1.5539734265520266, + "learning_rate": 6.349888411204685e-06, + "loss": 0.3302, + "step": 14846 + }, + { + "epoch": 0.43, + "grad_norm": 1.2797582696213694, + "learning_rate": 6.349436139449297e-06, + "loss": 0.3539, + "step": 14847 + }, + { + "epoch": 0.43, + "grad_norm": 1.3663726156274256, + "learning_rate": 6.348983855785122e-06, + "loss": 0.3274, + "step": 14848 + }, + { + "epoch": 0.43, + "grad_norm": 1.362425134788181, + "learning_rate": 6.348531560216149e-06, + "loss": 0.3224, + "step": 14849 + }, + { + "epoch": 0.43, + "grad_norm": 1.456078815383625, + "learning_rate": 6.34807925274637e-06, + "loss": 0.3242, + "step": 14850 + }, + { + "epoch": 0.43, + "grad_norm": 1.307567740176383, + "learning_rate": 6.347626933379776e-06, + "loss": 0.3245, + "step": 14851 + }, + { + "epoch": 0.43, + "grad_norm": 1.5386146424501197, + "learning_rate": 6.347174602120358e-06, + "loss": 0.3178, + "step": 14852 + }, + { + "epoch": 0.43, + "grad_norm": 1.5969644148195086, + "learning_rate": 6.346722258972113e-06, + "loss": 0.3324, + "step": 14853 + }, + { + "epoch": 0.43, + "grad_norm": 1.4570975938454478, + "learning_rate": 6.346269903939026e-06, + "loss": 0.3245, + "step": 14854 + }, + { + "epoch": 0.43, + "grad_norm": 1.3759856616590977, + "learning_rate": 6.345817537025094e-06, + "loss": 0.2968, + "step": 14855 + }, + { + "epoch": 0.43, + "grad_norm": 1.3248888669837893, + "learning_rate": 6.3453651582343066e-06, + "loss": 0.2907, + "step": 14856 + }, + { + "epoch": 0.43, + "grad_norm": 1.8001817245729803, + "learning_rate": 6.3449127675706565e-06, + "loss": 0.3175, + "step": 14857 + }, + { + "epoch": 0.43, + "grad_norm": 1.5521479192620087, + "learning_rate": 6.344460365038138e-06, + "loss": 0.3513, + "step": 14858 + }, + { + "epoch": 0.43, + "grad_norm": 1.6406875410030661, + "learning_rate": 6.344007950640739e-06, + "loss": 0.3318, + "step": 14859 + }, + { + "epoch": 0.43, + "grad_norm": 1.3273103658191803, + "learning_rate": 6.343555524382456e-06, + "loss": 0.3137, + "step": 14860 + }, + { + "epoch": 0.43, + "grad_norm": 1.341721135741882, + "learning_rate": 6.34310308626728e-06, + "loss": 0.3092, + "step": 14861 + }, + { + "epoch": 0.43, + "grad_norm": 1.406472683508919, + "learning_rate": 6.342650636299207e-06, + "loss": 0.3185, + "step": 14862 + }, + { + "epoch": 0.43, + "grad_norm": 1.3324606960412722, + "learning_rate": 6.342198174482225e-06, + "loss": 0.323, + "step": 14863 + }, + { + "epoch": 0.43, + "grad_norm": 1.3614722378515138, + "learning_rate": 6.34174570082033e-06, + "loss": 0.3428, + "step": 14864 + }, + { + "epoch": 0.43, + "grad_norm": 1.3907072642522125, + "learning_rate": 6.341293215317513e-06, + "loss": 0.3373, + "step": 14865 + }, + { + "epoch": 0.43, + "grad_norm": 1.2303587014023643, + "learning_rate": 6.34084071797777e-06, + "loss": 0.3083, + "step": 14866 + }, + { + "epoch": 0.43, + "grad_norm": 1.5934776242189772, + "learning_rate": 6.340388208805093e-06, + "loss": 0.3175, + "step": 14867 + }, + { + "epoch": 0.43, + "grad_norm": 2.0588186405805073, + "learning_rate": 6.339935687803473e-06, + "loss": 0.3227, + "step": 14868 + }, + { + "epoch": 0.43, + "grad_norm": 1.5157910632377347, + "learning_rate": 6.339483154976909e-06, + "loss": 0.3366, + "step": 14869 + }, + { + "epoch": 0.43, + "grad_norm": 1.3054994823894632, + "learning_rate": 6.339030610329389e-06, + "loss": 0.3069, + "step": 14870 + }, + { + "epoch": 0.43, + "grad_norm": 2.0846305235251994, + "learning_rate": 6.338578053864909e-06, + "loss": 0.3252, + "step": 14871 + }, + { + "epoch": 0.43, + "grad_norm": 1.2487859472107279, + "learning_rate": 6.338125485587464e-06, + "loss": 0.3082, + "step": 14872 + }, + { + "epoch": 0.43, + "grad_norm": 1.2691447851591098, + "learning_rate": 6.337672905501045e-06, + "loss": 0.3205, + "step": 14873 + }, + { + "epoch": 0.43, + "grad_norm": 1.7195929134962313, + "learning_rate": 6.337220313609649e-06, + "loss": 0.347, + "step": 14874 + }, + { + "epoch": 0.43, + "grad_norm": 1.4205123689257315, + "learning_rate": 6.336767709917269e-06, + "loss": 0.3094, + "step": 14875 + }, + { + "epoch": 0.43, + "grad_norm": 1.43047210676195, + "learning_rate": 6.3363150944278996e-06, + "loss": 0.3333, + "step": 14876 + }, + { + "epoch": 0.43, + "grad_norm": 1.5877374844815944, + "learning_rate": 6.335862467145533e-06, + "loss": 0.3181, + "step": 14877 + }, + { + "epoch": 0.43, + "grad_norm": 1.5388176608849684, + "learning_rate": 6.335409828074166e-06, + "loss": 0.3177, + "step": 14878 + }, + { + "epoch": 0.43, + "grad_norm": 1.2580696665245303, + "learning_rate": 6.334957177217793e-06, + "loss": 0.3337, + "step": 14879 + }, + { + "epoch": 0.43, + "grad_norm": 1.4775725152011614, + "learning_rate": 6.334504514580408e-06, + "loss": 0.3193, + "step": 14880 + }, + { + "epoch": 0.43, + "grad_norm": 1.437778660665227, + "learning_rate": 6.334051840166006e-06, + "loss": 0.3379, + "step": 14881 + }, + { + "epoch": 0.43, + "grad_norm": 1.3258671038127252, + "learning_rate": 6.33359915397858e-06, + "loss": 0.306, + "step": 14882 + }, + { + "epoch": 0.43, + "grad_norm": 1.3124609283341162, + "learning_rate": 6.333146456022129e-06, + "loss": 0.3086, + "step": 14883 + }, + { + "epoch": 0.43, + "grad_norm": 0.9183701792815571, + "learning_rate": 6.3326937463006445e-06, + "loss": 0.5963, + "step": 14884 + }, + { + "epoch": 0.43, + "grad_norm": 3.6101225101973453, + "learning_rate": 6.332241024818124e-06, + "loss": 0.3234, + "step": 14885 + }, + { + "epoch": 0.43, + "grad_norm": 1.4244274748537478, + "learning_rate": 6.33178829157856e-06, + "loss": 0.334, + "step": 14886 + }, + { + "epoch": 0.43, + "grad_norm": 1.2972686425430682, + "learning_rate": 6.33133554658595e-06, + "loss": 0.3169, + "step": 14887 + }, + { + "epoch": 0.43, + "grad_norm": 1.2507526665316286, + "learning_rate": 6.33088278984429e-06, + "loss": 0.3058, + "step": 14888 + }, + { + "epoch": 0.43, + "grad_norm": 1.6035596743674276, + "learning_rate": 6.330430021357573e-06, + "loss": 0.3679, + "step": 14889 + }, + { + "epoch": 0.43, + "grad_norm": 1.5725848538715583, + "learning_rate": 6.329977241129799e-06, + "loss": 0.3361, + "step": 14890 + }, + { + "epoch": 0.43, + "grad_norm": 1.4422489589439182, + "learning_rate": 6.329524449164959e-06, + "loss": 0.3182, + "step": 14891 + }, + { + "epoch": 0.43, + "grad_norm": 1.8136400361700749, + "learning_rate": 6.329071645467051e-06, + "loss": 0.328, + "step": 14892 + }, + { + "epoch": 0.43, + "grad_norm": 1.7022726408261937, + "learning_rate": 6.328618830040071e-06, + "loss": 0.3189, + "step": 14893 + }, + { + "epoch": 0.43, + "grad_norm": 1.3975471326710363, + "learning_rate": 6.328166002888017e-06, + "loss": 0.3188, + "step": 14894 + }, + { + "epoch": 0.43, + "grad_norm": 1.3168465907794151, + "learning_rate": 6.32771316401488e-06, + "loss": 0.3275, + "step": 14895 + }, + { + "epoch": 0.43, + "grad_norm": 1.3137103293180674, + "learning_rate": 6.3272603134246616e-06, + "loss": 0.312, + "step": 14896 + }, + { + "epoch": 0.43, + "grad_norm": 1.5403479476060176, + "learning_rate": 6.326807451121357e-06, + "loss": 0.3154, + "step": 14897 + }, + { + "epoch": 0.43, + "grad_norm": 1.2610534852391888, + "learning_rate": 6.326354577108961e-06, + "loss": 0.3037, + "step": 14898 + }, + { + "epoch": 0.43, + "grad_norm": 1.4720055480130747, + "learning_rate": 6.325901691391472e-06, + "loss": 0.3115, + "step": 14899 + }, + { + "epoch": 0.43, + "grad_norm": 1.4332600700496667, + "learning_rate": 6.325448793972883e-06, + "loss": 0.316, + "step": 14900 + }, + { + "epoch": 0.43, + "grad_norm": 1.3352365731794793, + "learning_rate": 6.324995884857197e-06, + "loss": 0.3168, + "step": 14901 + }, + { + "epoch": 0.43, + "grad_norm": 1.4150970707629686, + "learning_rate": 6.324542964048406e-06, + "loss": 0.3309, + "step": 14902 + }, + { + "epoch": 0.43, + "grad_norm": 1.6680384945786022, + "learning_rate": 6.32409003155051e-06, + "loss": 0.3293, + "step": 14903 + }, + { + "epoch": 0.43, + "grad_norm": 1.5130003748081189, + "learning_rate": 6.3236370873675025e-06, + "loss": 0.3189, + "step": 14904 + }, + { + "epoch": 0.43, + "grad_norm": 1.3702447067806756, + "learning_rate": 6.323184131503384e-06, + "loss": 0.3234, + "step": 14905 + }, + { + "epoch": 0.43, + "grad_norm": 1.4254002509749988, + "learning_rate": 6.32273116396215e-06, + "loss": 0.3236, + "step": 14906 + }, + { + "epoch": 0.43, + "grad_norm": 1.330428417281971, + "learning_rate": 6.3222781847478e-06, + "loss": 0.328, + "step": 14907 + }, + { + "epoch": 0.43, + "grad_norm": 2.568634487518333, + "learning_rate": 6.321825193864331e-06, + "loss": 0.3085, + "step": 14908 + }, + { + "epoch": 0.43, + "grad_norm": 1.5080832931831507, + "learning_rate": 6.321372191315739e-06, + "loss": 0.3413, + "step": 14909 + }, + { + "epoch": 0.43, + "grad_norm": 5.575829496165659, + "learning_rate": 6.320919177106022e-06, + "loss": 0.3171, + "step": 14910 + }, + { + "epoch": 0.43, + "grad_norm": 1.4054367016524194, + "learning_rate": 6.3204661512391805e-06, + "loss": 0.3145, + "step": 14911 + }, + { + "epoch": 0.43, + "grad_norm": 1.4088333865896956, + "learning_rate": 6.32001311371921e-06, + "loss": 0.3309, + "step": 14912 + }, + { + "epoch": 0.43, + "grad_norm": 1.6910987068594823, + "learning_rate": 6.3195600645501085e-06, + "loss": 0.3286, + "step": 14913 + }, + { + "epoch": 0.43, + "grad_norm": 1.1918569125382497, + "learning_rate": 6.3191070037358746e-06, + "loss": 0.3155, + "step": 14914 + }, + { + "epoch": 0.43, + "grad_norm": 2.1572832705365568, + "learning_rate": 6.318653931280508e-06, + "loss": 0.3097, + "step": 14915 + }, + { + "epoch": 0.43, + "grad_norm": 1.3939156165459936, + "learning_rate": 6.318200847188004e-06, + "loss": 0.3431, + "step": 14916 + }, + { + "epoch": 0.43, + "grad_norm": 1.394904255402135, + "learning_rate": 6.3177477514623646e-06, + "loss": 0.3129, + "step": 14917 + }, + { + "epoch": 0.43, + "grad_norm": 1.2824387110461868, + "learning_rate": 6.317294644107585e-06, + "loss": 0.3276, + "step": 14918 + }, + { + "epoch": 0.43, + "grad_norm": 1.3747625146946536, + "learning_rate": 6.316841525127667e-06, + "loss": 0.322, + "step": 14919 + }, + { + "epoch": 0.43, + "grad_norm": 1.3479641815100878, + "learning_rate": 6.3163883945266066e-06, + "loss": 0.3331, + "step": 14920 + }, + { + "epoch": 0.43, + "grad_norm": 1.3632523950976483, + "learning_rate": 6.315935252308406e-06, + "loss": 0.3072, + "step": 14921 + }, + { + "epoch": 0.43, + "grad_norm": 1.4678849256840172, + "learning_rate": 6.315482098477062e-06, + "loss": 0.3311, + "step": 14922 + }, + { + "epoch": 0.43, + "grad_norm": 1.3957265726979047, + "learning_rate": 6.315028933036572e-06, + "loss": 0.3172, + "step": 14923 + }, + { + "epoch": 0.43, + "grad_norm": 1.5008872676374143, + "learning_rate": 6.314575755990937e-06, + "loss": 0.3198, + "step": 14924 + }, + { + "epoch": 0.43, + "grad_norm": 1.588549742912622, + "learning_rate": 6.3141225673441584e-06, + "loss": 0.3212, + "step": 14925 + }, + { + "epoch": 0.43, + "grad_norm": 1.373194570394661, + "learning_rate": 6.3136693671002325e-06, + "loss": 0.3023, + "step": 14926 + }, + { + "epoch": 0.43, + "grad_norm": 1.4141648665299447, + "learning_rate": 6.313216155263161e-06, + "loss": 0.3071, + "step": 14927 + }, + { + "epoch": 0.43, + "grad_norm": 1.4530569200128782, + "learning_rate": 6.312762931836941e-06, + "loss": 0.3153, + "step": 14928 + }, + { + "epoch": 0.43, + "grad_norm": 1.2765570972069256, + "learning_rate": 6.3123096968255735e-06, + "loss": 0.2971, + "step": 14929 + }, + { + "epoch": 0.43, + "grad_norm": 1.4792702292075552, + "learning_rate": 6.3118564502330584e-06, + "loss": 0.3065, + "step": 14930 + }, + { + "epoch": 0.43, + "grad_norm": 1.3379079507633984, + "learning_rate": 6.311403192063397e-06, + "loss": 0.3153, + "step": 14931 + }, + { + "epoch": 0.43, + "grad_norm": 1.6372882631232186, + "learning_rate": 6.310949922320586e-06, + "loss": 0.3231, + "step": 14932 + }, + { + "epoch": 0.43, + "grad_norm": 1.210407846217899, + "learning_rate": 6.310496641008628e-06, + "loss": 0.3075, + "step": 14933 + }, + { + "epoch": 0.43, + "grad_norm": 1.5872973378804096, + "learning_rate": 6.310043348131523e-06, + "loss": 0.3232, + "step": 14934 + }, + { + "epoch": 0.43, + "grad_norm": 1.3979719426495636, + "learning_rate": 6.309590043693271e-06, + "loss": 0.3155, + "step": 14935 + }, + { + "epoch": 0.43, + "grad_norm": 1.537063168779964, + "learning_rate": 6.309136727697872e-06, + "loss": 0.3433, + "step": 14936 + }, + { + "epoch": 0.43, + "grad_norm": 1.6089775168448008, + "learning_rate": 6.308683400149327e-06, + "loss": 0.3055, + "step": 14937 + }, + { + "epoch": 0.43, + "grad_norm": 1.5173616749654482, + "learning_rate": 6.308230061051636e-06, + "loss": 0.3069, + "step": 14938 + }, + { + "epoch": 0.43, + "grad_norm": 1.3329985931338793, + "learning_rate": 6.307776710408801e-06, + "loss": 0.3208, + "step": 14939 + }, + { + "epoch": 0.43, + "grad_norm": 1.453761722798387, + "learning_rate": 6.307323348224823e-06, + "loss": 0.3286, + "step": 14940 + }, + { + "epoch": 0.43, + "grad_norm": 1.338791126014944, + "learning_rate": 6.3068699745037e-06, + "loss": 0.3398, + "step": 14941 + }, + { + "epoch": 0.43, + "grad_norm": 1.5300256736726843, + "learning_rate": 6.306416589249435e-06, + "loss": 0.3607, + "step": 14942 + }, + { + "epoch": 0.43, + "grad_norm": 1.2732372937096763, + "learning_rate": 6.305963192466029e-06, + "loss": 0.2992, + "step": 14943 + }, + { + "epoch": 0.43, + "grad_norm": 1.8690729442740317, + "learning_rate": 6.3055097841574845e-06, + "loss": 0.3239, + "step": 14944 + }, + { + "epoch": 0.43, + "grad_norm": 1.3325268611709262, + "learning_rate": 6.3050563643278014e-06, + "loss": 0.3385, + "step": 14945 + }, + { + "epoch": 0.43, + "grad_norm": 1.4350072752058929, + "learning_rate": 6.304602932980981e-06, + "loss": 0.329, + "step": 14946 + }, + { + "epoch": 0.43, + "grad_norm": 1.348112269407357, + "learning_rate": 6.304149490121025e-06, + "loss": 0.3571, + "step": 14947 + }, + { + "epoch": 0.43, + "grad_norm": 1.405719356559921, + "learning_rate": 6.303696035751936e-06, + "loss": 0.3526, + "step": 14948 + }, + { + "epoch": 0.43, + "grad_norm": 1.2862584646800213, + "learning_rate": 6.303242569877715e-06, + "loss": 0.3198, + "step": 14949 + }, + { + "epoch": 0.43, + "grad_norm": 1.845410573333724, + "learning_rate": 6.302789092502364e-06, + "loss": 0.3206, + "step": 14950 + }, + { + "epoch": 0.43, + "grad_norm": 0.9445175075029872, + "learning_rate": 6.302335603629884e-06, + "loss": 0.5997, + "step": 14951 + }, + { + "epoch": 0.43, + "grad_norm": 1.4590722288828522, + "learning_rate": 6.301882103264277e-06, + "loss": 0.3197, + "step": 14952 + }, + { + "epoch": 0.43, + "grad_norm": 1.2424840870932934, + "learning_rate": 6.3014285914095485e-06, + "loss": 0.3091, + "step": 14953 + }, + { + "epoch": 0.43, + "grad_norm": 1.2369677502238778, + "learning_rate": 6.300975068069698e-06, + "loss": 0.3014, + "step": 14954 + }, + { + "epoch": 0.43, + "grad_norm": 1.5817458727154843, + "learning_rate": 6.300521533248727e-06, + "loss": 0.34, + "step": 14955 + }, + { + "epoch": 0.43, + "grad_norm": 1.5036402100160997, + "learning_rate": 6.3000679869506375e-06, + "loss": 0.2991, + "step": 14956 + }, + { + "epoch": 0.43, + "grad_norm": 1.2849689501701453, + "learning_rate": 6.299614429179436e-06, + "loss": 0.3098, + "step": 14957 + }, + { + "epoch": 0.43, + "grad_norm": 1.5600948271277535, + "learning_rate": 6.299160859939122e-06, + "loss": 0.3416, + "step": 14958 + }, + { + "epoch": 0.43, + "grad_norm": 1.4228335331274595, + "learning_rate": 6.298707279233699e-06, + "loss": 0.3182, + "step": 14959 + }, + { + "epoch": 0.43, + "grad_norm": 1.2047405652181757, + "learning_rate": 6.2982536870671694e-06, + "loss": 0.3059, + "step": 14960 + }, + { + "epoch": 0.43, + "grad_norm": 1.2348455580286537, + "learning_rate": 6.2978000834435374e-06, + "loss": 0.3187, + "step": 14961 + }, + { + "epoch": 0.43, + "grad_norm": 1.3293248903621864, + "learning_rate": 6.297346468366804e-06, + "loss": 0.3178, + "step": 14962 + }, + { + "epoch": 0.43, + "grad_norm": 1.2389818393818264, + "learning_rate": 6.296892841840976e-06, + "loss": 0.3099, + "step": 14963 + }, + { + "epoch": 0.43, + "grad_norm": 1.4120140774230132, + "learning_rate": 6.2964392038700515e-06, + "loss": 0.3174, + "step": 14964 + }, + { + "epoch": 0.43, + "grad_norm": 1.349143497255582, + "learning_rate": 6.2959855544580376e-06, + "loss": 0.2998, + "step": 14965 + }, + { + "epoch": 0.43, + "grad_norm": 1.3474414540959587, + "learning_rate": 6.295531893608937e-06, + "loss": 0.3056, + "step": 14966 + }, + { + "epoch": 0.43, + "grad_norm": 1.3053380254963936, + "learning_rate": 6.295078221326754e-06, + "loss": 0.3271, + "step": 14967 + }, + { + "epoch": 0.43, + "grad_norm": 1.5703029688568735, + "learning_rate": 6.294624537615492e-06, + "loss": 0.3236, + "step": 14968 + }, + { + "epoch": 0.43, + "grad_norm": 7.032797768903464, + "learning_rate": 6.294170842479152e-06, + "loss": 0.3292, + "step": 14969 + }, + { + "epoch": 0.43, + "grad_norm": 1.3954973546412612, + "learning_rate": 6.29371713592174e-06, + "loss": 0.3279, + "step": 14970 + }, + { + "epoch": 0.43, + "grad_norm": 1.2210755155508026, + "learning_rate": 6.29326341794726e-06, + "loss": 0.3093, + "step": 14971 + }, + { + "epoch": 0.43, + "grad_norm": 1.3219832256476112, + "learning_rate": 6.292809688559718e-06, + "loss": 0.308, + "step": 14972 + }, + { + "epoch": 0.43, + "grad_norm": 1.3833703263043808, + "learning_rate": 6.292355947763114e-06, + "loss": 0.3065, + "step": 14973 + }, + { + "epoch": 0.43, + "grad_norm": 1.432568515708903, + "learning_rate": 6.291902195561454e-06, + "loss": 0.3245, + "step": 14974 + }, + { + "epoch": 0.43, + "grad_norm": 0.97061309418976, + "learning_rate": 6.291448431958744e-06, + "loss": 0.664, + "step": 14975 + }, + { + "epoch": 0.43, + "grad_norm": 1.4824362494903693, + "learning_rate": 6.290994656958988e-06, + "loss": 0.3397, + "step": 14976 + }, + { + "epoch": 0.43, + "grad_norm": 1.3647952953042173, + "learning_rate": 6.2905408705661886e-06, + "loss": 0.3072, + "step": 14977 + }, + { + "epoch": 0.43, + "grad_norm": 1.6436201341283214, + "learning_rate": 6.290087072784352e-06, + "loss": 0.3425, + "step": 14978 + }, + { + "epoch": 0.43, + "grad_norm": 1.3310648778251806, + "learning_rate": 6.289633263617483e-06, + "loss": 0.3013, + "step": 14979 + }, + { + "epoch": 0.43, + "grad_norm": 1.652400832599998, + "learning_rate": 6.289179443069585e-06, + "loss": 0.3313, + "step": 14980 + }, + { + "epoch": 0.43, + "grad_norm": 1.6235456409547213, + "learning_rate": 6.288725611144666e-06, + "loss": 0.3486, + "step": 14981 + }, + { + "epoch": 0.43, + "grad_norm": 1.2632105403521177, + "learning_rate": 6.288271767846729e-06, + "loss": 0.3265, + "step": 14982 + }, + { + "epoch": 0.43, + "grad_norm": 1.243816812316724, + "learning_rate": 6.287817913179778e-06, + "loss": 0.3085, + "step": 14983 + }, + { + "epoch": 0.43, + "grad_norm": 1.5612282282458114, + "learning_rate": 6.287364047147819e-06, + "loss": 0.3477, + "step": 14984 + }, + { + "epoch": 0.43, + "grad_norm": 1.4105090423259015, + "learning_rate": 6.2869101697548595e-06, + "loss": 0.3514, + "step": 14985 + }, + { + "epoch": 0.43, + "grad_norm": 1.3545292714623594, + "learning_rate": 6.286456281004904e-06, + "loss": 0.325, + "step": 14986 + }, + { + "epoch": 0.43, + "grad_norm": 1.4227070573176943, + "learning_rate": 6.2860023809019565e-06, + "loss": 0.3574, + "step": 14987 + }, + { + "epoch": 0.43, + "grad_norm": 1.2575793237881059, + "learning_rate": 6.285548469450023e-06, + "loss": 0.3236, + "step": 14988 + }, + { + "epoch": 0.43, + "grad_norm": 1.2736170880963797, + "learning_rate": 6.285094546653111e-06, + "loss": 0.3199, + "step": 14989 + }, + { + "epoch": 0.43, + "grad_norm": 1.6195948973069154, + "learning_rate": 6.284640612515226e-06, + "loss": 0.3241, + "step": 14990 + }, + { + "epoch": 0.43, + "grad_norm": 1.4417166673154984, + "learning_rate": 6.284186667040373e-06, + "loss": 0.3311, + "step": 14991 + }, + { + "epoch": 0.43, + "grad_norm": 1.3838019290983714, + "learning_rate": 6.283732710232557e-06, + "loss": 0.3141, + "step": 14992 + }, + { + "epoch": 0.43, + "grad_norm": 1.4494543847684611, + "learning_rate": 6.2832787420957865e-06, + "loss": 0.3043, + "step": 14993 + }, + { + "epoch": 0.43, + "grad_norm": 1.4321367284491977, + "learning_rate": 6.282824762634068e-06, + "loss": 0.3051, + "step": 14994 + }, + { + "epoch": 0.43, + "grad_norm": 1.420736855695865, + "learning_rate": 6.282370771851406e-06, + "loss": 0.3255, + "step": 14995 + }, + { + "epoch": 0.43, + "grad_norm": 1.450493118386548, + "learning_rate": 6.281916769751808e-06, + "loss": 0.3133, + "step": 14996 + }, + { + "epoch": 0.43, + "grad_norm": 2.6079261061588688, + "learning_rate": 6.281462756339278e-06, + "loss": 0.3385, + "step": 14997 + }, + { + "epoch": 0.44, + "grad_norm": 1.3519526916472797, + "learning_rate": 6.281008731617827e-06, + "loss": 0.3076, + "step": 14998 + }, + { + "epoch": 0.44, + "grad_norm": 1.317778850939494, + "learning_rate": 6.2805546955914595e-06, + "loss": 0.33, + "step": 14999 + }, + { + "epoch": 0.44, + "grad_norm": 1.456269907840816, + "learning_rate": 6.280100648264184e-06, + "loss": 0.3203, + "step": 15000 + }, + { + "epoch": 0.44, + "grad_norm": 1.2595030665480338, + "learning_rate": 6.279646589640004e-06, + "loss": 0.3027, + "step": 15001 + }, + { + "epoch": 0.44, + "grad_norm": 1.28727300808051, + "learning_rate": 6.279192519722929e-06, + "loss": 0.3173, + "step": 15002 + }, + { + "epoch": 0.44, + "grad_norm": 1.3381605034821311, + "learning_rate": 6.278738438516965e-06, + "loss": 0.3194, + "step": 15003 + }, + { + "epoch": 0.44, + "grad_norm": 1.4652580374800654, + "learning_rate": 6.278284346026122e-06, + "loss": 0.3326, + "step": 15004 + }, + { + "epoch": 0.44, + "grad_norm": 1.270903390469074, + "learning_rate": 6.277830242254404e-06, + "loss": 0.2974, + "step": 15005 + }, + { + "epoch": 0.44, + "grad_norm": 1.0619060363749313, + "learning_rate": 6.277376127205821e-06, + "loss": 0.6114, + "step": 15006 + }, + { + "epoch": 0.44, + "grad_norm": 1.567432740853554, + "learning_rate": 6.276922000884379e-06, + "loss": 0.3248, + "step": 15007 + }, + { + "epoch": 0.44, + "grad_norm": 1.5630354066177319, + "learning_rate": 6.276467863294086e-06, + "loss": 0.314, + "step": 15008 + }, + { + "epoch": 0.44, + "grad_norm": 1.4387531341782256, + "learning_rate": 6.276013714438951e-06, + "loss": 0.3158, + "step": 15009 + }, + { + "epoch": 0.44, + "grad_norm": 1.3879819113567204, + "learning_rate": 6.2755595543229806e-06, + "loss": 0.3451, + "step": 15010 + }, + { + "epoch": 0.44, + "grad_norm": 1.4554556343048475, + "learning_rate": 6.275105382950182e-06, + "loss": 0.3339, + "step": 15011 + }, + { + "epoch": 0.44, + "grad_norm": 1.3157863565589525, + "learning_rate": 6.274651200324564e-06, + "loss": 0.3071, + "step": 15012 + }, + { + "epoch": 0.44, + "grad_norm": 1.6240966695834582, + "learning_rate": 6.2741970064501365e-06, + "loss": 0.342, + "step": 15013 + }, + { + "epoch": 0.44, + "grad_norm": 1.3510758076050817, + "learning_rate": 6.273742801330906e-06, + "loss": 0.3433, + "step": 15014 + }, + { + "epoch": 0.44, + "grad_norm": 1.434813069815278, + "learning_rate": 6.27328858497088e-06, + "loss": 0.3692, + "step": 15015 + }, + { + "epoch": 0.44, + "grad_norm": 1.301808803959588, + "learning_rate": 6.272834357374068e-06, + "loss": 0.3265, + "step": 15016 + }, + { + "epoch": 0.44, + "grad_norm": 1.2740250964289415, + "learning_rate": 6.272380118544479e-06, + "loss": 0.2968, + "step": 15017 + }, + { + "epoch": 0.44, + "grad_norm": 1.9395349257996892, + "learning_rate": 6.271925868486123e-06, + "loss": 0.3173, + "step": 15018 + }, + { + "epoch": 0.44, + "grad_norm": 1.3728103191489203, + "learning_rate": 6.271471607203006e-06, + "loss": 0.3073, + "step": 15019 + }, + { + "epoch": 0.44, + "grad_norm": 1.4213806530492088, + "learning_rate": 6.271017334699137e-06, + "loss": 0.3302, + "step": 15020 + }, + { + "epoch": 0.44, + "grad_norm": 1.2336974617215049, + "learning_rate": 6.2705630509785265e-06, + "loss": 0.3116, + "step": 15021 + }, + { + "epoch": 0.44, + "grad_norm": 1.5284786111213862, + "learning_rate": 6.270108756045184e-06, + "loss": 0.3116, + "step": 15022 + }, + { + "epoch": 0.44, + "grad_norm": 1.4234444170868144, + "learning_rate": 6.269654449903116e-06, + "loss": 0.3442, + "step": 15023 + }, + { + "epoch": 0.44, + "grad_norm": 1.4199302681940307, + "learning_rate": 6.269200132556335e-06, + "loss": 0.3194, + "step": 15024 + }, + { + "epoch": 0.44, + "grad_norm": 1.3391770205876896, + "learning_rate": 6.268745804008848e-06, + "loss": 0.3093, + "step": 15025 + }, + { + "epoch": 0.44, + "grad_norm": 1.2244807498099615, + "learning_rate": 6.268291464264664e-06, + "loss": 0.2983, + "step": 15026 + }, + { + "epoch": 0.44, + "grad_norm": 1.2653394517280132, + "learning_rate": 6.267837113327797e-06, + "loss": 0.3353, + "step": 15027 + }, + { + "epoch": 0.44, + "grad_norm": 1.6182352722528641, + "learning_rate": 6.267382751202251e-06, + "loss": 0.324, + "step": 15028 + }, + { + "epoch": 0.44, + "grad_norm": 1.8943929456899316, + "learning_rate": 6.2669283778920375e-06, + "loss": 0.333, + "step": 15029 + }, + { + "epoch": 0.44, + "grad_norm": 1.1869642014937525, + "learning_rate": 6.2664739934011676e-06, + "loss": 0.3157, + "step": 15030 + }, + { + "epoch": 0.44, + "grad_norm": 1.2510408112015945, + "learning_rate": 6.266019597733652e-06, + "loss": 0.3252, + "step": 15031 + }, + { + "epoch": 0.44, + "grad_norm": 2.107472823314279, + "learning_rate": 6.2655651908934986e-06, + "loss": 0.3302, + "step": 15032 + }, + { + "epoch": 0.44, + "grad_norm": 1.3591729600870142, + "learning_rate": 6.2651107728847175e-06, + "loss": 0.2946, + "step": 15033 + }, + { + "epoch": 0.44, + "grad_norm": 1.281027981972997, + "learning_rate": 6.2646563437113205e-06, + "loss": 0.3222, + "step": 15034 + }, + { + "epoch": 0.44, + "grad_norm": 0.9823031026968255, + "learning_rate": 6.2642019033773175e-06, + "loss": 0.6357, + "step": 15035 + }, + { + "epoch": 0.44, + "grad_norm": 1.557990005292187, + "learning_rate": 6.263747451886719e-06, + "loss": 0.3165, + "step": 15036 + }, + { + "epoch": 0.44, + "grad_norm": 1.3650425034350169, + "learning_rate": 6.263292989243534e-06, + "loss": 0.3255, + "step": 15037 + }, + { + "epoch": 0.44, + "grad_norm": 1.255881792066974, + "learning_rate": 6.262838515451776e-06, + "loss": 0.3249, + "step": 15038 + }, + { + "epoch": 0.44, + "grad_norm": 1.2991699670876131, + "learning_rate": 6.262384030515453e-06, + "loss": 0.3516, + "step": 15039 + }, + { + "epoch": 0.44, + "grad_norm": 1.3047208909491514, + "learning_rate": 6.261929534438577e-06, + "loss": 0.328, + "step": 15040 + }, + { + "epoch": 0.44, + "grad_norm": 4.213881013066445, + "learning_rate": 6.261475027225159e-06, + "loss": 0.3463, + "step": 15041 + }, + { + "epoch": 0.44, + "grad_norm": 0.9281905059084441, + "learning_rate": 6.26102050887921e-06, + "loss": 0.5519, + "step": 15042 + }, + { + "epoch": 0.44, + "grad_norm": 1.2442755080476973, + "learning_rate": 6.2605659794047405e-06, + "loss": 0.3297, + "step": 15043 + }, + { + "epoch": 0.44, + "grad_norm": 1.3436824925258224, + "learning_rate": 6.260111438805762e-06, + "loss": 0.3139, + "step": 15044 + }, + { + "epoch": 0.44, + "grad_norm": 1.4275760156710164, + "learning_rate": 6.259656887086288e-06, + "loss": 0.3284, + "step": 15045 + }, + { + "epoch": 0.44, + "grad_norm": 1.5686688040343573, + "learning_rate": 6.259202324250325e-06, + "loss": 0.3333, + "step": 15046 + }, + { + "epoch": 0.44, + "grad_norm": 1.234279885944767, + "learning_rate": 6.258747750301889e-06, + "loss": 0.3218, + "step": 15047 + }, + { + "epoch": 0.44, + "grad_norm": 1.7652571156825405, + "learning_rate": 6.258293165244991e-06, + "loss": 0.3209, + "step": 15048 + }, + { + "epoch": 0.44, + "grad_norm": 1.2012849607496865, + "learning_rate": 6.25783856908364e-06, + "loss": 0.3031, + "step": 15049 + }, + { + "epoch": 0.44, + "grad_norm": 1.5202124001273434, + "learning_rate": 6.2573839618218525e-06, + "loss": 0.3246, + "step": 15050 + }, + { + "epoch": 0.44, + "grad_norm": 1.2927382254847182, + "learning_rate": 6.256929343463635e-06, + "loss": 0.3092, + "step": 15051 + }, + { + "epoch": 0.44, + "grad_norm": 1.3430952169485664, + "learning_rate": 6.256474714013003e-06, + "loss": 0.3249, + "step": 15052 + }, + { + "epoch": 0.44, + "grad_norm": 0.9364806062141331, + "learning_rate": 6.2560200734739685e-06, + "loss": 0.6229, + "step": 15053 + }, + { + "epoch": 0.44, + "grad_norm": 1.3964056104206712, + "learning_rate": 6.255565421850543e-06, + "loss": 0.3423, + "step": 15054 + }, + { + "epoch": 0.44, + "grad_norm": 1.277228936787834, + "learning_rate": 6.255110759146738e-06, + "loss": 0.32, + "step": 15055 + }, + { + "epoch": 0.44, + "grad_norm": 1.5346541900344348, + "learning_rate": 6.254656085366565e-06, + "loss": 0.3304, + "step": 15056 + }, + { + "epoch": 0.44, + "grad_norm": 1.9079233047668422, + "learning_rate": 6.254201400514039e-06, + "loss": 0.3313, + "step": 15057 + }, + { + "epoch": 0.44, + "grad_norm": 1.512331469079698, + "learning_rate": 6.253746704593173e-06, + "loss": 0.3348, + "step": 15058 + }, + { + "epoch": 0.44, + "grad_norm": 1.268336087305319, + "learning_rate": 6.253291997607978e-06, + "loss": 0.3168, + "step": 15059 + }, + { + "epoch": 0.44, + "grad_norm": 1.2081863595410236, + "learning_rate": 6.252837279562467e-06, + "loss": 0.3234, + "step": 15060 + }, + { + "epoch": 0.44, + "grad_norm": 1.2832573431939711, + "learning_rate": 6.252382550460653e-06, + "loss": 0.3008, + "step": 15061 + }, + { + "epoch": 0.44, + "grad_norm": 1.4247500869015512, + "learning_rate": 6.251927810306548e-06, + "loss": 0.3202, + "step": 15062 + }, + { + "epoch": 0.44, + "grad_norm": 1.2866063200643634, + "learning_rate": 6.251473059104168e-06, + "loss": 0.313, + "step": 15063 + }, + { + "epoch": 0.44, + "grad_norm": 1.3562482224074963, + "learning_rate": 6.251018296857524e-06, + "loss": 0.3594, + "step": 15064 + }, + { + "epoch": 0.44, + "grad_norm": 0.9895280426352612, + "learning_rate": 6.25056352357063e-06, + "loss": 0.5954, + "step": 15065 + }, + { + "epoch": 0.44, + "grad_norm": 1.4428400313148981, + "learning_rate": 6.250108739247498e-06, + "loss": 0.3179, + "step": 15066 + }, + { + "epoch": 0.44, + "grad_norm": 1.5022464291261919, + "learning_rate": 6.249653943892143e-06, + "loss": 0.3394, + "step": 15067 + }, + { + "epoch": 0.44, + "grad_norm": 1.5926256630876006, + "learning_rate": 6.249199137508579e-06, + "loss": 0.3098, + "step": 15068 + }, + { + "epoch": 0.44, + "grad_norm": 1.021923286620172, + "learning_rate": 6.248744320100817e-06, + "loss": 0.6234, + "step": 15069 + }, + { + "epoch": 0.44, + "grad_norm": 1.3351617830901061, + "learning_rate": 6.248289491672873e-06, + "loss": 0.3131, + "step": 15070 + }, + { + "epoch": 0.44, + "grad_norm": 1.3908391172749848, + "learning_rate": 6.247834652228761e-06, + "loss": 0.3171, + "step": 15071 + }, + { + "epoch": 0.44, + "grad_norm": 1.3290570980971212, + "learning_rate": 6.247379801772495e-06, + "loss": 0.3143, + "step": 15072 + }, + { + "epoch": 0.44, + "grad_norm": 1.3298504872032135, + "learning_rate": 6.246924940308087e-06, + "loss": 0.316, + "step": 15073 + }, + { + "epoch": 0.44, + "grad_norm": 1.6296800895294663, + "learning_rate": 6.246470067839553e-06, + "loss": 0.3299, + "step": 15074 + }, + { + "epoch": 0.44, + "grad_norm": 1.1732732533927328, + "learning_rate": 6.2460151843709064e-06, + "loss": 0.3078, + "step": 15075 + }, + { + "epoch": 0.44, + "grad_norm": 0.950946633507072, + "learning_rate": 6.245560289906162e-06, + "loss": 0.6433, + "step": 15076 + }, + { + "epoch": 0.44, + "grad_norm": 1.3497237674767741, + "learning_rate": 6.245105384449335e-06, + "loss": 0.3473, + "step": 15077 + }, + { + "epoch": 0.44, + "grad_norm": 1.2729970765520817, + "learning_rate": 6.2446504680044385e-06, + "loss": 0.3353, + "step": 15078 + }, + { + "epoch": 0.44, + "grad_norm": 1.2788620420876475, + "learning_rate": 6.244195540575488e-06, + "loss": 0.3126, + "step": 15079 + }, + { + "epoch": 0.44, + "grad_norm": 1.2431092132418304, + "learning_rate": 6.243740602166499e-06, + "loss": 0.3051, + "step": 15080 + }, + { + "epoch": 0.44, + "grad_norm": 1.2559338670663442, + "learning_rate": 6.2432856527814845e-06, + "loss": 0.3197, + "step": 15081 + }, + { + "epoch": 0.44, + "grad_norm": 1.6140253662242299, + "learning_rate": 6.242830692424461e-06, + "loss": 0.3308, + "step": 15082 + }, + { + "epoch": 0.44, + "grad_norm": 1.4757777371231438, + "learning_rate": 6.242375721099442e-06, + "loss": 0.3778, + "step": 15083 + }, + { + "epoch": 0.44, + "grad_norm": 1.3379650725262549, + "learning_rate": 6.241920738810443e-06, + "loss": 0.3247, + "step": 15084 + }, + { + "epoch": 0.44, + "grad_norm": 1.4237668789007605, + "learning_rate": 6.24146574556148e-06, + "loss": 0.3071, + "step": 15085 + }, + { + "epoch": 0.44, + "grad_norm": 1.4092127992407568, + "learning_rate": 6.24101074135657e-06, + "loss": 0.3152, + "step": 15086 + }, + { + "epoch": 0.44, + "grad_norm": 1.36099255450015, + "learning_rate": 6.240555726199724e-06, + "loss": 0.3155, + "step": 15087 + }, + { + "epoch": 0.44, + "grad_norm": 2.2417699167279097, + "learning_rate": 6.240100700094961e-06, + "loss": 0.3266, + "step": 15088 + }, + { + "epoch": 0.44, + "grad_norm": 1.2106412145550502, + "learning_rate": 6.239645663046294e-06, + "loss": 0.3162, + "step": 15089 + }, + { + "epoch": 0.44, + "grad_norm": 1.2533902527544627, + "learning_rate": 6.239190615057743e-06, + "loss": 0.3135, + "step": 15090 + }, + { + "epoch": 0.44, + "grad_norm": 1.560615353212869, + "learning_rate": 6.2387355561333195e-06, + "loss": 0.3425, + "step": 15091 + }, + { + "epoch": 0.44, + "grad_norm": 1.3674192042484579, + "learning_rate": 6.2382804862770405e-06, + "loss": 0.3438, + "step": 15092 + }, + { + "epoch": 0.44, + "grad_norm": 1.200879399118896, + "learning_rate": 6.237825405492924e-06, + "loss": 0.2974, + "step": 15093 + }, + { + "epoch": 0.44, + "grad_norm": 2.5235202967202417, + "learning_rate": 6.2373703137849835e-06, + "loss": 0.3256, + "step": 15094 + }, + { + "epoch": 0.44, + "grad_norm": 1.7182641559369694, + "learning_rate": 6.236915211157237e-06, + "loss": 0.3056, + "step": 15095 + }, + { + "epoch": 0.44, + "grad_norm": 1.2813339986555154, + "learning_rate": 6.2364600976137e-06, + "loss": 0.3182, + "step": 15096 + }, + { + "epoch": 0.44, + "grad_norm": 1.529573154537243, + "learning_rate": 6.236004973158389e-06, + "loss": 0.3337, + "step": 15097 + }, + { + "epoch": 0.44, + "grad_norm": 1.48618997097231, + "learning_rate": 6.235549837795318e-06, + "loss": 0.3201, + "step": 15098 + }, + { + "epoch": 0.44, + "grad_norm": 1.3485635562589693, + "learning_rate": 6.235094691528508e-06, + "loss": 0.319, + "step": 15099 + }, + { + "epoch": 0.44, + "grad_norm": 2.6092588454636862, + "learning_rate": 6.2346395343619745e-06, + "loss": 0.3081, + "step": 15100 + }, + { + "epoch": 0.44, + "grad_norm": 0.9256815416622342, + "learning_rate": 6.2341843662997326e-06, + "loss": 0.6201, + "step": 15101 + }, + { + "epoch": 0.44, + "grad_norm": 1.3601150419505417, + "learning_rate": 6.2337291873458e-06, + "loss": 0.2941, + "step": 15102 + }, + { + "epoch": 0.44, + "grad_norm": 1.2948369936097417, + "learning_rate": 6.233273997504193e-06, + "loss": 0.3294, + "step": 15103 + }, + { + "epoch": 0.44, + "grad_norm": 1.6943892268767031, + "learning_rate": 6.23281879677893e-06, + "loss": 0.3414, + "step": 15104 + }, + { + "epoch": 0.44, + "grad_norm": 1.5693579054844484, + "learning_rate": 6.232363585174027e-06, + "loss": 0.3394, + "step": 15105 + }, + { + "epoch": 0.44, + "grad_norm": 1.398538757641976, + "learning_rate": 6.2319083626935015e-06, + "loss": 0.3469, + "step": 15106 + }, + { + "epoch": 0.44, + "grad_norm": 1.1492765788736712, + "learning_rate": 6.2314531293413705e-06, + "loss": 0.3111, + "step": 15107 + }, + { + "epoch": 0.44, + "grad_norm": 1.2872019626193423, + "learning_rate": 6.230997885121653e-06, + "loss": 0.3282, + "step": 15108 + }, + { + "epoch": 0.44, + "grad_norm": 1.301819052584575, + "learning_rate": 6.230542630038368e-06, + "loss": 0.296, + "step": 15109 + }, + { + "epoch": 0.44, + "grad_norm": 1.272616628732808, + "learning_rate": 6.230087364095527e-06, + "loss": 0.3372, + "step": 15110 + }, + { + "epoch": 0.44, + "grad_norm": 1.5216196409930383, + "learning_rate": 6.2296320872971515e-06, + "loss": 0.325, + "step": 15111 + }, + { + "epoch": 0.44, + "grad_norm": 1.591021843986547, + "learning_rate": 6.22917679964726e-06, + "loss": 0.3286, + "step": 15112 + }, + { + "epoch": 0.44, + "grad_norm": 1.3562527844657024, + "learning_rate": 6.228721501149869e-06, + "loss": 0.3154, + "step": 15113 + }, + { + "epoch": 0.44, + "grad_norm": 1.2949944214795839, + "learning_rate": 6.2282661918089995e-06, + "loss": 0.3171, + "step": 15114 + }, + { + "epoch": 0.44, + "grad_norm": 1.6808460222891188, + "learning_rate": 6.2278108716286645e-06, + "loss": 0.3063, + "step": 15115 + }, + { + "epoch": 0.44, + "grad_norm": 2.660113038234755, + "learning_rate": 6.227355540612886e-06, + "loss": 0.3103, + "step": 15116 + }, + { + "epoch": 0.44, + "grad_norm": 0.9473700039269357, + "learning_rate": 6.226900198765681e-06, + "loss": 0.5897, + "step": 15117 + }, + { + "epoch": 0.44, + "grad_norm": 1.4352550055374966, + "learning_rate": 6.226444846091068e-06, + "loss": 0.3196, + "step": 15118 + }, + { + "epoch": 0.44, + "grad_norm": 1.2423134802714124, + "learning_rate": 6.225989482593067e-06, + "loss": 0.3231, + "step": 15119 + }, + { + "epoch": 0.44, + "grad_norm": 1.2938085215446744, + "learning_rate": 6.2255341082756936e-06, + "loss": 0.2989, + "step": 15120 + }, + { + "epoch": 0.44, + "grad_norm": 1.3249332610712599, + "learning_rate": 6.225078723142968e-06, + "loss": 0.3197, + "step": 15121 + }, + { + "epoch": 0.44, + "grad_norm": 2.168573704303746, + "learning_rate": 6.2246233271989105e-06, + "loss": 0.2984, + "step": 15122 + }, + { + "epoch": 0.44, + "grad_norm": 1.2337758546414488, + "learning_rate": 6.224167920447539e-06, + "loss": 0.3055, + "step": 15123 + }, + { + "epoch": 0.44, + "grad_norm": 1.5718933788781186, + "learning_rate": 6.22371250289287e-06, + "loss": 0.3539, + "step": 15124 + }, + { + "epoch": 0.44, + "grad_norm": 1.637368586050357, + "learning_rate": 6.223257074538926e-06, + "loss": 0.3183, + "step": 15125 + }, + { + "epoch": 0.44, + "grad_norm": 1.466258346716507, + "learning_rate": 6.222801635389723e-06, + "loss": 0.343, + "step": 15126 + }, + { + "epoch": 0.44, + "grad_norm": 1.6219535896114377, + "learning_rate": 6.222346185449284e-06, + "loss": 0.3332, + "step": 15127 + }, + { + "epoch": 0.44, + "grad_norm": 1.6381852477873233, + "learning_rate": 6.221890724721625e-06, + "loss": 0.2939, + "step": 15128 + }, + { + "epoch": 0.44, + "grad_norm": 1.5668625034826145, + "learning_rate": 6.221435253210768e-06, + "loss": 0.3188, + "step": 15129 + }, + { + "epoch": 0.44, + "grad_norm": 1.2974704792373337, + "learning_rate": 6.220979770920729e-06, + "loss": 0.3311, + "step": 15130 + }, + { + "epoch": 0.44, + "grad_norm": 1.3189518935799396, + "learning_rate": 6.220524277855532e-06, + "loss": 0.3464, + "step": 15131 + }, + { + "epoch": 0.44, + "grad_norm": 1.7430416680026117, + "learning_rate": 6.220068774019196e-06, + "loss": 0.3199, + "step": 15132 + }, + { + "epoch": 0.44, + "grad_norm": 1.5650315293179078, + "learning_rate": 6.219613259415737e-06, + "loss": 0.3187, + "step": 15133 + }, + { + "epoch": 0.44, + "grad_norm": 1.1947068446529985, + "learning_rate": 6.219157734049179e-06, + "loss": 0.3063, + "step": 15134 + }, + { + "epoch": 0.44, + "grad_norm": 1.3096218101997787, + "learning_rate": 6.218702197923538e-06, + "loss": 0.3113, + "step": 15135 + }, + { + "epoch": 0.44, + "grad_norm": 1.7201965699629334, + "learning_rate": 6.2182466510428395e-06, + "loss": 0.3244, + "step": 15136 + }, + { + "epoch": 0.44, + "grad_norm": 1.3214256197014829, + "learning_rate": 6.2177910934111004e-06, + "loss": 0.3182, + "step": 15137 + }, + { + "epoch": 0.44, + "grad_norm": 1.798762870584179, + "learning_rate": 6.217335525032341e-06, + "loss": 0.3242, + "step": 15138 + }, + { + "epoch": 0.44, + "grad_norm": 1.7805452843675043, + "learning_rate": 6.216879945910581e-06, + "loss": 0.3271, + "step": 15139 + }, + { + "epoch": 0.44, + "grad_norm": 1.3723774357793115, + "learning_rate": 6.216424356049843e-06, + "loss": 0.3181, + "step": 15140 + }, + { + "epoch": 0.44, + "grad_norm": 1.2921347128864815, + "learning_rate": 6.215968755454146e-06, + "loss": 0.3329, + "step": 15141 + }, + { + "epoch": 0.44, + "grad_norm": 1.3304802308150685, + "learning_rate": 6.215513144127512e-06, + "loss": 0.3058, + "step": 15142 + }, + { + "epoch": 0.44, + "grad_norm": 2.7230518126844476, + "learning_rate": 6.215057522073959e-06, + "loss": 0.3474, + "step": 15143 + }, + { + "epoch": 0.44, + "grad_norm": 1.2616754268120938, + "learning_rate": 6.214601889297512e-06, + "loss": 0.3014, + "step": 15144 + }, + { + "epoch": 0.44, + "grad_norm": 1.4993785341489436, + "learning_rate": 6.2141462458021905e-06, + "loss": 0.3388, + "step": 15145 + }, + { + "epoch": 0.44, + "grad_norm": 1.8853439625204742, + "learning_rate": 6.213690591592013e-06, + "loss": 0.3453, + "step": 15146 + }, + { + "epoch": 0.44, + "grad_norm": 1.5766645611924093, + "learning_rate": 6.213234926671003e-06, + "loss": 0.3075, + "step": 15147 + }, + { + "epoch": 0.44, + "grad_norm": 1.2995817186318783, + "learning_rate": 6.212779251043182e-06, + "loss": 0.3194, + "step": 15148 + }, + { + "epoch": 0.44, + "grad_norm": 1.7304291811899044, + "learning_rate": 6.212323564712569e-06, + "loss": 0.2959, + "step": 15149 + }, + { + "epoch": 0.44, + "grad_norm": 1.6483642646037844, + "learning_rate": 6.211867867683189e-06, + "loss": 0.3298, + "step": 15150 + }, + { + "epoch": 0.44, + "grad_norm": 1.2932936489327287, + "learning_rate": 6.211412159959062e-06, + "loss": 0.3394, + "step": 15151 + }, + { + "epoch": 0.44, + "grad_norm": 1.4373237772692307, + "learning_rate": 6.210956441544208e-06, + "loss": 0.331, + "step": 15152 + }, + { + "epoch": 0.44, + "grad_norm": 1.3237289172321909, + "learning_rate": 6.210500712442649e-06, + "loss": 0.339, + "step": 15153 + }, + { + "epoch": 0.44, + "grad_norm": 1.3155591338017565, + "learning_rate": 6.210044972658409e-06, + "loss": 0.318, + "step": 15154 + }, + { + "epoch": 0.44, + "grad_norm": 1.3784580081203552, + "learning_rate": 6.20958922219551e-06, + "loss": 0.3362, + "step": 15155 + }, + { + "epoch": 0.44, + "grad_norm": 1.351277668456139, + "learning_rate": 6.2091334610579715e-06, + "loss": 0.3179, + "step": 15156 + }, + { + "epoch": 0.44, + "grad_norm": 1.2372285085196395, + "learning_rate": 6.208677689249816e-06, + "loss": 0.3096, + "step": 15157 + }, + { + "epoch": 0.44, + "grad_norm": 1.6238617605215595, + "learning_rate": 6.208221906775067e-06, + "loss": 0.3072, + "step": 15158 + }, + { + "epoch": 0.44, + "grad_norm": 1.5148980580922784, + "learning_rate": 6.207766113637747e-06, + "loss": 0.3251, + "step": 15159 + }, + { + "epoch": 0.44, + "grad_norm": 1.3869470967317696, + "learning_rate": 6.207310309841876e-06, + "loss": 0.3298, + "step": 15160 + }, + { + "epoch": 0.44, + "grad_norm": 1.3164222959141045, + "learning_rate": 6.206854495391479e-06, + "loss": 0.3281, + "step": 15161 + }, + { + "epoch": 0.44, + "grad_norm": 1.4940285491009062, + "learning_rate": 6.206398670290577e-06, + "loss": 0.3171, + "step": 15162 + }, + { + "epoch": 0.44, + "grad_norm": 1.2667666350870708, + "learning_rate": 6.205942834543196e-06, + "loss": 0.3085, + "step": 15163 + }, + { + "epoch": 0.44, + "grad_norm": 1.3879324950998746, + "learning_rate": 6.205486988153355e-06, + "loss": 0.338, + "step": 15164 + }, + { + "epoch": 0.44, + "grad_norm": 1.264133246698289, + "learning_rate": 6.2050311311250765e-06, + "loss": 0.3073, + "step": 15165 + }, + { + "epoch": 0.44, + "grad_norm": 1.580831431270314, + "learning_rate": 6.2045752634623856e-06, + "loss": 0.3158, + "step": 15166 + }, + { + "epoch": 0.44, + "grad_norm": 1.353744457664195, + "learning_rate": 6.204119385169305e-06, + "loss": 0.3249, + "step": 15167 + }, + { + "epoch": 0.44, + "grad_norm": 1.435402566436635, + "learning_rate": 6.203663496249856e-06, + "loss": 0.3121, + "step": 15168 + }, + { + "epoch": 0.44, + "grad_norm": 1.5340593675643561, + "learning_rate": 6.203207596708066e-06, + "loss": 0.3122, + "step": 15169 + }, + { + "epoch": 0.44, + "grad_norm": 1.5196396925084232, + "learning_rate": 6.202751686547953e-06, + "loss": 0.3288, + "step": 15170 + }, + { + "epoch": 0.44, + "grad_norm": 1.5365343682176427, + "learning_rate": 6.202295765773545e-06, + "loss": 0.3191, + "step": 15171 + }, + { + "epoch": 0.44, + "grad_norm": 1.402999153333557, + "learning_rate": 6.201839834388862e-06, + "loss": 0.2989, + "step": 15172 + }, + { + "epoch": 0.44, + "grad_norm": 1.1828422970640298, + "learning_rate": 6.201383892397929e-06, + "loss": 0.2951, + "step": 15173 + }, + { + "epoch": 0.44, + "grad_norm": 1.3808531757118685, + "learning_rate": 6.200927939804772e-06, + "loss": 0.323, + "step": 15174 + }, + { + "epoch": 0.44, + "grad_norm": 1.3269687065046087, + "learning_rate": 6.20047197661341e-06, + "loss": 0.3241, + "step": 15175 + }, + { + "epoch": 0.44, + "grad_norm": 1.404419148174067, + "learning_rate": 6.200016002827871e-06, + "loss": 0.3311, + "step": 15176 + }, + { + "epoch": 0.44, + "grad_norm": 1.4197447623392123, + "learning_rate": 6.199560018452178e-06, + "loss": 0.3408, + "step": 15177 + }, + { + "epoch": 0.44, + "grad_norm": 1.4644382064255637, + "learning_rate": 6.1991040234903545e-06, + "loss": 0.3526, + "step": 15178 + }, + { + "epoch": 0.44, + "grad_norm": 1.3284594192955417, + "learning_rate": 6.198648017946424e-06, + "loss": 0.3146, + "step": 15179 + }, + { + "epoch": 0.44, + "grad_norm": 1.3714239612887198, + "learning_rate": 6.19819200182441e-06, + "loss": 0.3247, + "step": 15180 + }, + { + "epoch": 0.44, + "grad_norm": 1.796885834860289, + "learning_rate": 6.19773597512834e-06, + "loss": 0.3553, + "step": 15181 + }, + { + "epoch": 0.44, + "grad_norm": 1.4279146063071786, + "learning_rate": 6.1972799378622375e-06, + "loss": 0.3223, + "step": 15182 + }, + { + "epoch": 0.44, + "grad_norm": 1.4365027014908227, + "learning_rate": 6.196823890030124e-06, + "loss": 0.3172, + "step": 15183 + }, + { + "epoch": 0.44, + "grad_norm": 1.4204724570227611, + "learning_rate": 6.196367831636027e-06, + "loss": 0.3056, + "step": 15184 + }, + { + "epoch": 0.44, + "grad_norm": 1.2313260272638789, + "learning_rate": 6.19591176268397e-06, + "loss": 0.3028, + "step": 15185 + }, + { + "epoch": 0.44, + "grad_norm": 1.3938167424217234, + "learning_rate": 6.19545568317798e-06, + "loss": 0.3499, + "step": 15186 + }, + { + "epoch": 0.44, + "grad_norm": 1.2757151975559278, + "learning_rate": 6.194999593122081e-06, + "loss": 0.3186, + "step": 15187 + }, + { + "epoch": 0.44, + "grad_norm": 1.2858212177983497, + "learning_rate": 6.1945434925202954e-06, + "loss": 0.3191, + "step": 15188 + }, + { + "epoch": 0.44, + "grad_norm": 1.5659267357491458, + "learning_rate": 6.194087381376651e-06, + "loss": 0.3121, + "step": 15189 + }, + { + "epoch": 0.44, + "grad_norm": 1.3694615475596383, + "learning_rate": 6.193631259695172e-06, + "loss": 0.3395, + "step": 15190 + }, + { + "epoch": 0.44, + "grad_norm": 2.0028762440496224, + "learning_rate": 6.193175127479884e-06, + "loss": 0.3092, + "step": 15191 + }, + { + "epoch": 0.44, + "grad_norm": 1.5755383767196398, + "learning_rate": 6.192718984734815e-06, + "loss": 0.345, + "step": 15192 + }, + { + "epoch": 0.44, + "grad_norm": 2.6880786911804937, + "learning_rate": 6.192262831463985e-06, + "loss": 0.301, + "step": 15193 + }, + { + "epoch": 0.44, + "grad_norm": 0.9444085672887367, + "learning_rate": 6.1918066676714205e-06, + "loss": 0.6145, + "step": 15194 + }, + { + "epoch": 0.44, + "grad_norm": 1.274103792942147, + "learning_rate": 6.1913504933611516e-06, + "loss": 0.3218, + "step": 15195 + }, + { + "epoch": 0.44, + "grad_norm": 1.5943158910045958, + "learning_rate": 6.1908943085372006e-06, + "loss": 0.3283, + "step": 15196 + }, + { + "epoch": 0.44, + "grad_norm": 1.286342207695946, + "learning_rate": 6.190438113203594e-06, + "loss": 0.2941, + "step": 15197 + }, + { + "epoch": 0.44, + "grad_norm": 1.3614317518145191, + "learning_rate": 6.189981907364358e-06, + "loss": 0.2965, + "step": 15198 + }, + { + "epoch": 0.44, + "grad_norm": 1.6125834993617092, + "learning_rate": 6.189525691023518e-06, + "loss": 0.324, + "step": 15199 + }, + { + "epoch": 0.44, + "grad_norm": 1.9448289589128172, + "learning_rate": 6.189069464185102e-06, + "loss": 0.3051, + "step": 15200 + }, + { + "epoch": 0.44, + "grad_norm": 1.4142945722644946, + "learning_rate": 6.188613226853133e-06, + "loss": 0.322, + "step": 15201 + }, + { + "epoch": 0.44, + "grad_norm": 1.3099650338933237, + "learning_rate": 6.1881569790316394e-06, + "loss": 0.3458, + "step": 15202 + }, + { + "epoch": 0.44, + "grad_norm": 1.3131686823298963, + "learning_rate": 6.187700720724648e-06, + "loss": 0.3189, + "step": 15203 + }, + { + "epoch": 0.44, + "grad_norm": 1.2666342541024425, + "learning_rate": 6.187244451936184e-06, + "loss": 0.3071, + "step": 15204 + }, + { + "epoch": 0.44, + "grad_norm": 1.2969040758817612, + "learning_rate": 6.186788172670276e-06, + "loss": 0.3052, + "step": 15205 + }, + { + "epoch": 0.44, + "grad_norm": 1.3283194334162671, + "learning_rate": 6.186331882930949e-06, + "loss": 0.2952, + "step": 15206 + }, + { + "epoch": 0.44, + "grad_norm": 1.477827675173501, + "learning_rate": 6.185875582722229e-06, + "loss": 0.3308, + "step": 15207 + }, + { + "epoch": 0.44, + "grad_norm": 1.855701164315608, + "learning_rate": 6.185419272048143e-06, + "loss": 0.3149, + "step": 15208 + }, + { + "epoch": 0.44, + "grad_norm": 1.2833067462149699, + "learning_rate": 6.184962950912719e-06, + "loss": 0.317, + "step": 15209 + }, + { + "epoch": 0.44, + "grad_norm": 1.6598420696638179, + "learning_rate": 6.1845066193199856e-06, + "loss": 0.3595, + "step": 15210 + }, + { + "epoch": 0.44, + "grad_norm": 1.4228933098722973, + "learning_rate": 6.1840502772739675e-06, + "loss": 0.3119, + "step": 15211 + }, + { + "epoch": 0.44, + "grad_norm": 1.7474817885589748, + "learning_rate": 6.183593924778692e-06, + "loss": 0.3341, + "step": 15212 + }, + { + "epoch": 0.44, + "grad_norm": 1.3586534709599232, + "learning_rate": 6.183137561838186e-06, + "loss": 0.3226, + "step": 15213 + }, + { + "epoch": 0.44, + "grad_norm": 1.6252029857576362, + "learning_rate": 6.18268118845648e-06, + "loss": 0.3298, + "step": 15214 + }, + { + "epoch": 0.44, + "grad_norm": 1.8788335177418156, + "learning_rate": 6.182224804637598e-06, + "loss": 0.3114, + "step": 15215 + }, + { + "epoch": 0.44, + "grad_norm": 2.122218167166875, + "learning_rate": 6.181768410385569e-06, + "loss": 0.2891, + "step": 15216 + }, + { + "epoch": 0.44, + "grad_norm": 1.7130674538738386, + "learning_rate": 6.18131200570442e-06, + "loss": 0.3201, + "step": 15217 + }, + { + "epoch": 0.44, + "grad_norm": 1.2126198493620477, + "learning_rate": 6.180855590598182e-06, + "loss": 0.3186, + "step": 15218 + }, + { + "epoch": 0.44, + "grad_norm": 0.9977949595885474, + "learning_rate": 6.1803991650708784e-06, + "loss": 0.5861, + "step": 15219 + }, + { + "epoch": 0.44, + "grad_norm": 1.747757582092717, + "learning_rate": 6.17994272912654e-06, + "loss": 0.3556, + "step": 15220 + }, + { + "epoch": 0.44, + "grad_norm": 1.8160037954357657, + "learning_rate": 6.179486282769194e-06, + "loss": 0.2993, + "step": 15221 + }, + { + "epoch": 0.44, + "grad_norm": 1.6275567358529541, + "learning_rate": 6.179029826002867e-06, + "loss": 0.3338, + "step": 15222 + }, + { + "epoch": 0.44, + "grad_norm": 1.5336849048546501, + "learning_rate": 6.1785733588315896e-06, + "loss": 0.3176, + "step": 15223 + }, + { + "epoch": 0.44, + "grad_norm": 1.7898492634580354, + "learning_rate": 6.178116881259389e-06, + "loss": 0.3145, + "step": 15224 + }, + { + "epoch": 0.44, + "grad_norm": 1.1586250817997428, + "learning_rate": 6.177660393290294e-06, + "loss": 0.3041, + "step": 15225 + }, + { + "epoch": 0.44, + "grad_norm": 1.4473726423612518, + "learning_rate": 6.177203894928333e-06, + "loss": 0.3225, + "step": 15226 + }, + { + "epoch": 0.44, + "grad_norm": 1.3136420712263805, + "learning_rate": 6.176747386177534e-06, + "loss": 0.3466, + "step": 15227 + }, + { + "epoch": 0.44, + "grad_norm": 1.4035051647213388, + "learning_rate": 6.176290867041928e-06, + "loss": 0.3246, + "step": 15228 + }, + { + "epoch": 0.44, + "grad_norm": 1.3843870059415135, + "learning_rate": 6.1758343375255405e-06, + "loss": 0.294, + "step": 15229 + }, + { + "epoch": 0.44, + "grad_norm": 1.9114308093015089, + "learning_rate": 6.1753777976324024e-06, + "loss": 0.3245, + "step": 15230 + }, + { + "epoch": 0.44, + "grad_norm": 1.2660822260513038, + "learning_rate": 6.174921247366541e-06, + "loss": 0.311, + "step": 15231 + }, + { + "epoch": 0.44, + "grad_norm": 1.4134902587583207, + "learning_rate": 6.174464686731989e-06, + "loss": 0.3278, + "step": 15232 + }, + { + "epoch": 0.44, + "grad_norm": 1.7549733146491664, + "learning_rate": 6.174008115732771e-06, + "loss": 0.309, + "step": 15233 + }, + { + "epoch": 0.44, + "grad_norm": 1.5117717630169734, + "learning_rate": 6.17355153437292e-06, + "loss": 0.3354, + "step": 15234 + }, + { + "epoch": 0.44, + "grad_norm": 1.2896651575822657, + "learning_rate": 6.173094942656462e-06, + "loss": 0.3188, + "step": 15235 + }, + { + "epoch": 0.44, + "grad_norm": 1.396806752140163, + "learning_rate": 6.172638340587429e-06, + "loss": 0.364, + "step": 15236 + }, + { + "epoch": 0.44, + "grad_norm": 1.5644050306551847, + "learning_rate": 6.17218172816985e-06, + "loss": 0.3311, + "step": 15237 + }, + { + "epoch": 0.44, + "grad_norm": 1.6118618943012897, + "learning_rate": 6.1717251054077545e-06, + "loss": 0.3225, + "step": 15238 + }, + { + "epoch": 0.44, + "grad_norm": 1.7310325221454266, + "learning_rate": 6.17126847230517e-06, + "loss": 0.32, + "step": 15239 + }, + { + "epoch": 0.44, + "grad_norm": 1.3738162266018055, + "learning_rate": 6.17081182886613e-06, + "loss": 0.3254, + "step": 15240 + }, + { + "epoch": 0.44, + "grad_norm": 2.4002036578469483, + "learning_rate": 6.1703551750946635e-06, + "loss": 0.3192, + "step": 15241 + }, + { + "epoch": 0.44, + "grad_norm": 2.1946409627390984, + "learning_rate": 6.169898510994797e-06, + "loss": 0.3303, + "step": 15242 + }, + { + "epoch": 0.44, + "grad_norm": 1.285351875730475, + "learning_rate": 6.169441836570564e-06, + "loss": 0.2997, + "step": 15243 + }, + { + "epoch": 0.44, + "grad_norm": 1.3383860430591819, + "learning_rate": 6.168985151825995e-06, + "loss": 0.3008, + "step": 15244 + }, + { + "epoch": 0.44, + "grad_norm": 1.4814806937949208, + "learning_rate": 6.1685284567651174e-06, + "loss": 0.3471, + "step": 15245 + }, + { + "epoch": 0.44, + "grad_norm": 1.4342387372148897, + "learning_rate": 6.168071751391965e-06, + "loss": 0.3014, + "step": 15246 + }, + { + "epoch": 0.44, + "grad_norm": 2.687112345758234, + "learning_rate": 6.1676150357105646e-06, + "loss": 0.3014, + "step": 15247 + }, + { + "epoch": 0.44, + "grad_norm": 1.3215840979465858, + "learning_rate": 6.167158309724951e-06, + "loss": 0.3099, + "step": 15248 + }, + { + "epoch": 0.44, + "grad_norm": 1.573232881648053, + "learning_rate": 6.16670157343915e-06, + "loss": 0.3367, + "step": 15249 + }, + { + "epoch": 0.44, + "grad_norm": 1.800625249539677, + "learning_rate": 6.166244826857195e-06, + "loss": 0.3267, + "step": 15250 + }, + { + "epoch": 0.44, + "grad_norm": 1.3618500779356324, + "learning_rate": 6.165788069983118e-06, + "loss": 0.3341, + "step": 15251 + }, + { + "epoch": 0.44, + "grad_norm": 1.2724691669803618, + "learning_rate": 6.165331302820947e-06, + "loss": 0.3164, + "step": 15252 + }, + { + "epoch": 0.44, + "grad_norm": 1.3766801174966348, + "learning_rate": 6.1648745253747145e-06, + "loss": 0.3339, + "step": 15253 + }, + { + "epoch": 0.44, + "grad_norm": 1.3291280718223673, + "learning_rate": 6.164417737648451e-06, + "loss": 0.3235, + "step": 15254 + }, + { + "epoch": 0.44, + "grad_norm": 1.269803669569713, + "learning_rate": 6.16396093964619e-06, + "loss": 0.3116, + "step": 15255 + }, + { + "epoch": 0.44, + "grad_norm": 1.5303012605526742, + "learning_rate": 6.163504131371959e-06, + "loss": 0.3092, + "step": 15256 + }, + { + "epoch": 0.44, + "grad_norm": 1.1726473653959908, + "learning_rate": 6.1630473128297905e-06, + "loss": 0.3005, + "step": 15257 + }, + { + "epoch": 0.44, + "grad_norm": 1.312426820294714, + "learning_rate": 6.1625904840237185e-06, + "loss": 0.3058, + "step": 15258 + }, + { + "epoch": 0.44, + "grad_norm": 1.9533282842581203, + "learning_rate": 6.162133644957772e-06, + "loss": 0.3202, + "step": 15259 + }, + { + "epoch": 0.44, + "grad_norm": 2.806863244024192, + "learning_rate": 6.161676795635982e-06, + "loss": 0.304, + "step": 15260 + }, + { + "epoch": 0.44, + "grad_norm": 1.2194002828662884, + "learning_rate": 6.161219936062382e-06, + "loss": 0.3118, + "step": 15261 + }, + { + "epoch": 0.44, + "grad_norm": 1.2626545565219218, + "learning_rate": 6.160763066241005e-06, + "loss": 0.3152, + "step": 15262 + }, + { + "epoch": 0.44, + "grad_norm": 1.3780052049214364, + "learning_rate": 6.160306186175878e-06, + "loss": 0.3193, + "step": 15263 + }, + { + "epoch": 0.44, + "grad_norm": 1.3700943668550751, + "learning_rate": 6.1598492958710375e-06, + "loss": 0.3275, + "step": 15264 + }, + { + "epoch": 0.44, + "grad_norm": 1.4902466516846187, + "learning_rate": 6.159392395330516e-06, + "loss": 0.3067, + "step": 15265 + }, + { + "epoch": 0.44, + "grad_norm": 1.820268374093929, + "learning_rate": 6.158935484558341e-06, + "loss": 0.3331, + "step": 15266 + }, + { + "epoch": 0.44, + "grad_norm": 1.3984540030843968, + "learning_rate": 6.158478563558548e-06, + "loss": 0.3389, + "step": 15267 + }, + { + "epoch": 0.44, + "grad_norm": 1.5431261548316837, + "learning_rate": 6.158021632335169e-06, + "loss": 0.3172, + "step": 15268 + }, + { + "epoch": 0.44, + "grad_norm": 1.4620482834150028, + "learning_rate": 6.157564690892237e-06, + "loss": 0.3096, + "step": 15269 + }, + { + "epoch": 0.44, + "grad_norm": 1.338742548896661, + "learning_rate": 6.157107739233783e-06, + "loss": 0.3182, + "step": 15270 + }, + { + "epoch": 0.44, + "grad_norm": 1.182444435459953, + "learning_rate": 6.156650777363841e-06, + "loss": 0.3032, + "step": 15271 + }, + { + "epoch": 0.44, + "grad_norm": 1.39390999708508, + "learning_rate": 6.156193805286442e-06, + "loss": 0.3294, + "step": 15272 + }, + { + "epoch": 0.44, + "grad_norm": 1.5344454310915732, + "learning_rate": 6.155736823005621e-06, + "loss": 0.3357, + "step": 15273 + }, + { + "epoch": 0.44, + "grad_norm": 1.648314974587282, + "learning_rate": 6.1552798305254095e-06, + "loss": 0.3167, + "step": 15274 + }, + { + "epoch": 0.44, + "grad_norm": 1.348158009037716, + "learning_rate": 6.15482282784984e-06, + "loss": 0.3007, + "step": 15275 + }, + { + "epoch": 0.44, + "grad_norm": 1.9576557519844546, + "learning_rate": 6.154365814982947e-06, + "loss": 0.3083, + "step": 15276 + }, + { + "epoch": 0.44, + "grad_norm": 1.4553967281915, + "learning_rate": 6.153908791928762e-06, + "loss": 0.322, + "step": 15277 + }, + { + "epoch": 0.44, + "grad_norm": 1.340308506320888, + "learning_rate": 6.15345175869132e-06, + "loss": 0.3008, + "step": 15278 + }, + { + "epoch": 0.44, + "grad_norm": 1.8852696398999538, + "learning_rate": 6.152994715274653e-06, + "loss": 0.3239, + "step": 15279 + }, + { + "epoch": 0.44, + "grad_norm": 1.3127265481177706, + "learning_rate": 6.152537661682795e-06, + "loss": 0.3102, + "step": 15280 + }, + { + "epoch": 0.44, + "grad_norm": 1.7118760419117418, + "learning_rate": 6.1520805979197784e-06, + "loss": 0.3366, + "step": 15281 + }, + { + "epoch": 0.44, + "grad_norm": 1.3374584472516626, + "learning_rate": 6.151623523989638e-06, + "loss": 0.3015, + "step": 15282 + }, + { + "epoch": 0.44, + "grad_norm": 1.42394260760135, + "learning_rate": 6.151166439896409e-06, + "loss": 0.3102, + "step": 15283 + }, + { + "epoch": 0.44, + "grad_norm": 1.3505046060852208, + "learning_rate": 6.1507093456441226e-06, + "loss": 0.3133, + "step": 15284 + }, + { + "epoch": 0.44, + "grad_norm": 1.2807107122989725, + "learning_rate": 6.150252241236813e-06, + "loss": 0.342, + "step": 15285 + }, + { + "epoch": 0.44, + "grad_norm": 1.3744763004962381, + "learning_rate": 6.1497951266785135e-06, + "loss": 0.3154, + "step": 15286 + }, + { + "epoch": 0.44, + "grad_norm": 1.4549784893612576, + "learning_rate": 6.149338001973262e-06, + "loss": 0.3596, + "step": 15287 + }, + { + "epoch": 0.44, + "grad_norm": 3.7434450954049416, + "learning_rate": 6.148880867125086e-06, + "loss": 0.3309, + "step": 15288 + }, + { + "epoch": 0.44, + "grad_norm": 0.9518797850187388, + "learning_rate": 6.148423722138027e-06, + "loss": 0.6108, + "step": 15289 + }, + { + "epoch": 0.44, + "grad_norm": 1.6301725799044864, + "learning_rate": 6.147966567016115e-06, + "loss": 0.3246, + "step": 15290 + }, + { + "epoch": 0.44, + "grad_norm": 2.1424559537448338, + "learning_rate": 6.147509401763385e-06, + "loss": 0.3246, + "step": 15291 + }, + { + "epoch": 0.44, + "grad_norm": 1.5082314191846464, + "learning_rate": 6.147052226383873e-06, + "loss": 0.323, + "step": 15292 + }, + { + "epoch": 0.44, + "grad_norm": 1.7802212475554524, + "learning_rate": 6.146595040881611e-06, + "loss": 0.308, + "step": 15293 + }, + { + "epoch": 0.44, + "grad_norm": 1.9992158519695948, + "learning_rate": 6.146137845260635e-06, + "loss": 0.3126, + "step": 15294 + }, + { + "epoch": 0.44, + "grad_norm": 1.4623184452527325, + "learning_rate": 6.14568063952498e-06, + "loss": 0.309, + "step": 15295 + }, + { + "epoch": 0.44, + "grad_norm": 1.5461445483729603, + "learning_rate": 6.145223423678681e-06, + "loss": 0.3211, + "step": 15296 + }, + { + "epoch": 0.44, + "grad_norm": 1.3250442738759363, + "learning_rate": 6.144766197725772e-06, + "loss": 0.3208, + "step": 15297 + }, + { + "epoch": 0.44, + "grad_norm": 1.267973653340088, + "learning_rate": 6.144308961670289e-06, + "loss": 0.3055, + "step": 15298 + }, + { + "epoch": 0.44, + "grad_norm": 1.2121817214111428, + "learning_rate": 6.143851715516267e-06, + "loss": 0.3024, + "step": 15299 + }, + { + "epoch": 0.44, + "grad_norm": 0.9606549185876818, + "learning_rate": 6.143394459267739e-06, + "loss": 0.6388, + "step": 15300 + }, + { + "epoch": 0.44, + "grad_norm": 1.4038793427157623, + "learning_rate": 6.142937192928745e-06, + "loss": 0.319, + "step": 15301 + }, + { + "epoch": 0.44, + "grad_norm": 1.2967601215989788, + "learning_rate": 6.142479916503317e-06, + "loss": 0.3149, + "step": 15302 + }, + { + "epoch": 0.44, + "grad_norm": 1.4965884059858123, + "learning_rate": 6.14202262999549e-06, + "loss": 0.3073, + "step": 15303 + }, + { + "epoch": 0.44, + "grad_norm": 1.1909714171716739, + "learning_rate": 6.141565333409301e-06, + "loss": 0.3099, + "step": 15304 + }, + { + "epoch": 0.44, + "grad_norm": 0.9851371446277782, + "learning_rate": 6.141108026748785e-06, + "loss": 0.6216, + "step": 15305 + }, + { + "epoch": 0.44, + "grad_norm": 1.4595958153083155, + "learning_rate": 6.1406507100179805e-06, + "loss": 0.2994, + "step": 15306 + }, + { + "epoch": 0.44, + "grad_norm": 1.9557973680655545, + "learning_rate": 6.140193383220918e-06, + "loss": 0.3195, + "step": 15307 + }, + { + "epoch": 0.44, + "grad_norm": 1.289509118865858, + "learning_rate": 6.139736046361636e-06, + "loss": 0.3094, + "step": 15308 + }, + { + "epoch": 0.44, + "grad_norm": 1.2422099643036322, + "learning_rate": 6.139278699444172e-06, + "loss": 0.3224, + "step": 15309 + }, + { + "epoch": 0.44, + "grad_norm": 1.2785758249508095, + "learning_rate": 6.138821342472561e-06, + "loss": 0.3399, + "step": 15310 + }, + { + "epoch": 0.44, + "grad_norm": 1.2444442967533502, + "learning_rate": 6.138363975450838e-06, + "loss": 0.2952, + "step": 15311 + }, + { + "epoch": 0.44, + "grad_norm": 1.2515148775448828, + "learning_rate": 6.137906598383041e-06, + "loss": 0.3261, + "step": 15312 + }, + { + "epoch": 0.44, + "grad_norm": 1.7536992858580434, + "learning_rate": 6.137449211273205e-06, + "loss": 0.3316, + "step": 15313 + }, + { + "epoch": 0.44, + "grad_norm": 1.3359183003797361, + "learning_rate": 6.1369918141253685e-06, + "loss": 0.3559, + "step": 15314 + }, + { + "epoch": 0.44, + "grad_norm": 1.3134932565443573, + "learning_rate": 6.136534406943565e-06, + "loss": 0.3161, + "step": 15315 + }, + { + "epoch": 0.44, + "grad_norm": 1.2782720949410202, + "learning_rate": 6.136076989731834e-06, + "loss": 0.322, + "step": 15316 + }, + { + "epoch": 0.44, + "grad_norm": 1.2093169403593185, + "learning_rate": 6.13561956249421e-06, + "loss": 0.3034, + "step": 15317 + }, + { + "epoch": 0.44, + "grad_norm": 1.8783159534484133, + "learning_rate": 6.1351621252347305e-06, + "loss": 0.3425, + "step": 15318 + }, + { + "epoch": 0.44, + "grad_norm": 1.2494677284934599, + "learning_rate": 6.134704677957435e-06, + "loss": 0.2873, + "step": 15319 + }, + { + "epoch": 0.44, + "grad_norm": 1.3769978311010171, + "learning_rate": 6.134247220666356e-06, + "loss": 0.3318, + "step": 15320 + }, + { + "epoch": 0.44, + "grad_norm": 1.690137011069191, + "learning_rate": 6.133789753365533e-06, + "loss": 0.3182, + "step": 15321 + }, + { + "epoch": 0.44, + "grad_norm": 1.3018457643280397, + "learning_rate": 6.1333322760590034e-06, + "loss": 0.3041, + "step": 15322 + }, + { + "epoch": 0.44, + "grad_norm": 1.253821305120646, + "learning_rate": 6.132874788750803e-06, + "loss": 0.313, + "step": 15323 + }, + { + "epoch": 0.44, + "grad_norm": 1.3453443133359724, + "learning_rate": 6.132417291444971e-06, + "loss": 0.3132, + "step": 15324 + }, + { + "epoch": 0.44, + "grad_norm": 1.2693682532504107, + "learning_rate": 6.131959784145544e-06, + "loss": 0.335, + "step": 15325 + }, + { + "epoch": 0.44, + "grad_norm": 2.060575150705426, + "learning_rate": 6.131502266856557e-06, + "loss": 0.3085, + "step": 15326 + }, + { + "epoch": 0.44, + "grad_norm": 1.2448795322945776, + "learning_rate": 6.131044739582051e-06, + "loss": 0.3155, + "step": 15327 + }, + { + "epoch": 0.44, + "grad_norm": 1.2826388061327347, + "learning_rate": 6.130587202326065e-06, + "loss": 0.3168, + "step": 15328 + }, + { + "epoch": 0.44, + "grad_norm": 1.5191708204052206, + "learning_rate": 6.130129655092633e-06, + "loss": 0.2975, + "step": 15329 + }, + { + "epoch": 0.44, + "grad_norm": 1.2954824954464037, + "learning_rate": 6.129672097885793e-06, + "loss": 0.3191, + "step": 15330 + }, + { + "epoch": 0.44, + "grad_norm": 1.8638016617045987, + "learning_rate": 6.1292145307095854e-06, + "loss": 0.305, + "step": 15331 + }, + { + "epoch": 0.44, + "grad_norm": 1.3500523370115645, + "learning_rate": 6.128756953568047e-06, + "loss": 0.3073, + "step": 15332 + }, + { + "epoch": 0.44, + "grad_norm": 1.2623967565208156, + "learning_rate": 6.128299366465217e-06, + "loss": 0.3132, + "step": 15333 + }, + { + "epoch": 0.44, + "grad_norm": 1.3539728136902223, + "learning_rate": 6.127841769405131e-06, + "loss": 0.3275, + "step": 15334 + }, + { + "epoch": 0.44, + "grad_norm": 1.2813910338913843, + "learning_rate": 6.127384162391828e-06, + "loss": 0.3033, + "step": 15335 + }, + { + "epoch": 0.44, + "grad_norm": 1.1638859799568941, + "learning_rate": 6.126926545429349e-06, + "loss": 0.2957, + "step": 15336 + }, + { + "epoch": 0.44, + "grad_norm": 1.2206188102479254, + "learning_rate": 6.12646891852173e-06, + "loss": 0.3025, + "step": 15337 + }, + { + "epoch": 0.44, + "grad_norm": 1.187597344844902, + "learning_rate": 6.126011281673011e-06, + "loss": 0.311, + "step": 15338 + }, + { + "epoch": 0.44, + "grad_norm": 1.5649938063250093, + "learning_rate": 6.125553634887229e-06, + "loss": 0.3255, + "step": 15339 + }, + { + "epoch": 0.44, + "grad_norm": 1.302639754701037, + "learning_rate": 6.125095978168424e-06, + "loss": 0.3292, + "step": 15340 + }, + { + "epoch": 0.44, + "grad_norm": 1.3326678626524453, + "learning_rate": 6.124638311520634e-06, + "loss": 0.331, + "step": 15341 + }, + { + "epoch": 0.44, + "grad_norm": 2.4329471357265473, + "learning_rate": 6.124180634947901e-06, + "loss": 0.3131, + "step": 15342 + }, + { + "epoch": 0.45, + "grad_norm": 1.3906260445160317, + "learning_rate": 6.123722948454258e-06, + "loss": 0.3269, + "step": 15343 + }, + { + "epoch": 0.45, + "grad_norm": 1.4394183435175754, + "learning_rate": 6.1232652520437485e-06, + "loss": 0.3196, + "step": 15344 + }, + { + "epoch": 0.45, + "grad_norm": 1.2153086046068493, + "learning_rate": 6.1228075457204115e-06, + "loss": 0.3239, + "step": 15345 + }, + { + "epoch": 0.45, + "grad_norm": 1.446283034342574, + "learning_rate": 6.122349829488286e-06, + "loss": 0.3286, + "step": 15346 + }, + { + "epoch": 0.45, + "grad_norm": 1.321185088116691, + "learning_rate": 6.12189210335141e-06, + "loss": 0.329, + "step": 15347 + }, + { + "epoch": 0.45, + "grad_norm": 1.6208219614056034, + "learning_rate": 6.121434367313823e-06, + "loss": 0.3125, + "step": 15348 + }, + { + "epoch": 0.45, + "grad_norm": 1.3133432407834997, + "learning_rate": 6.120976621379567e-06, + "loss": 0.3148, + "step": 15349 + }, + { + "epoch": 0.45, + "grad_norm": 1.284674529916636, + "learning_rate": 6.120518865552678e-06, + "loss": 0.322, + "step": 15350 + }, + { + "epoch": 0.45, + "grad_norm": 1.3053743691490938, + "learning_rate": 6.1200610998371986e-06, + "loss": 0.3285, + "step": 15351 + }, + { + "epoch": 0.45, + "grad_norm": 1.5358769723411374, + "learning_rate": 6.119603324237168e-06, + "loss": 0.3419, + "step": 15352 + }, + { + "epoch": 0.45, + "grad_norm": 1.8583482872918997, + "learning_rate": 6.119145538756624e-06, + "loss": 0.3472, + "step": 15353 + }, + { + "epoch": 0.45, + "grad_norm": 1.3450000471404673, + "learning_rate": 6.1186877433996095e-06, + "loss": 0.3086, + "step": 15354 + }, + { + "epoch": 0.45, + "grad_norm": 1.294073847142471, + "learning_rate": 6.118229938170162e-06, + "loss": 0.3319, + "step": 15355 + }, + { + "epoch": 0.45, + "grad_norm": 1.3631192498103206, + "learning_rate": 6.1177721230723255e-06, + "loss": 0.3261, + "step": 15356 + }, + { + "epoch": 0.45, + "grad_norm": 1.2862015942436837, + "learning_rate": 6.117314298110135e-06, + "loss": 0.3035, + "step": 15357 + }, + { + "epoch": 0.45, + "grad_norm": 1.2226856872989902, + "learning_rate": 6.116856463287635e-06, + "loss": 0.331, + "step": 15358 + }, + { + "epoch": 0.45, + "grad_norm": 1.3524262447930908, + "learning_rate": 6.116398618608863e-06, + "loss": 0.3191, + "step": 15359 + }, + { + "epoch": 0.45, + "grad_norm": 1.2309572992287074, + "learning_rate": 6.115940764077862e-06, + "loss": 0.2965, + "step": 15360 + }, + { + "epoch": 0.45, + "grad_norm": 1.34334636031044, + "learning_rate": 6.115482899698673e-06, + "loss": 0.3027, + "step": 15361 + }, + { + "epoch": 0.45, + "grad_norm": 1.1809422621367576, + "learning_rate": 6.1150250254753315e-06, + "loss": 0.2887, + "step": 15362 + }, + { + "epoch": 0.45, + "grad_norm": 1.2823143692064938, + "learning_rate": 6.114567141411883e-06, + "loss": 0.294, + "step": 15363 + }, + { + "epoch": 0.45, + "grad_norm": 1.2184236041797225, + "learning_rate": 6.1141092475123675e-06, + "loss": 0.3215, + "step": 15364 + }, + { + "epoch": 0.45, + "grad_norm": 1.273578558176107, + "learning_rate": 6.113651343780825e-06, + "loss": 0.3277, + "step": 15365 + }, + { + "epoch": 0.45, + "grad_norm": 1.3665673243014171, + "learning_rate": 6.113193430221298e-06, + "loss": 0.3122, + "step": 15366 + }, + { + "epoch": 0.45, + "grad_norm": 1.487541641929984, + "learning_rate": 6.1127355068378255e-06, + "loss": 0.3382, + "step": 15367 + }, + { + "epoch": 0.45, + "grad_norm": 1.4709901937932608, + "learning_rate": 6.112277573634451e-06, + "loss": 0.319, + "step": 15368 + }, + { + "epoch": 0.45, + "grad_norm": 1.5524197557445742, + "learning_rate": 6.111819630615214e-06, + "loss": 0.3353, + "step": 15369 + }, + { + "epoch": 0.45, + "grad_norm": 1.198074183219199, + "learning_rate": 6.111361677784155e-06, + "loss": 0.3157, + "step": 15370 + }, + { + "epoch": 0.45, + "grad_norm": 1.5472863564770358, + "learning_rate": 6.110903715145319e-06, + "loss": 0.2978, + "step": 15371 + }, + { + "epoch": 0.45, + "grad_norm": 1.5449460833797455, + "learning_rate": 6.110445742702744e-06, + "loss": 0.3041, + "step": 15372 + }, + { + "epoch": 0.45, + "grad_norm": 1.3502061704508577, + "learning_rate": 6.109987760460474e-06, + "loss": 0.2994, + "step": 15373 + }, + { + "epoch": 0.45, + "grad_norm": 1.4776066009674667, + "learning_rate": 6.109529768422549e-06, + "loss": 0.3092, + "step": 15374 + }, + { + "epoch": 0.45, + "grad_norm": 1.1409739064988174, + "learning_rate": 6.109071766593014e-06, + "loss": 0.3022, + "step": 15375 + }, + { + "epoch": 0.45, + "grad_norm": 1.3254947793017855, + "learning_rate": 6.108613754975905e-06, + "loss": 0.2964, + "step": 15376 + }, + { + "epoch": 0.45, + "grad_norm": 1.485142588015279, + "learning_rate": 6.108155733575268e-06, + "loss": 0.3203, + "step": 15377 + }, + { + "epoch": 0.45, + "grad_norm": 1.4198060084239852, + "learning_rate": 6.107697702395145e-06, + "loss": 0.3438, + "step": 15378 + }, + { + "epoch": 0.45, + "grad_norm": 1.7777254160361058, + "learning_rate": 6.107239661439578e-06, + "loss": 0.3507, + "step": 15379 + }, + { + "epoch": 0.45, + "grad_norm": 1.254120438162681, + "learning_rate": 6.106781610712608e-06, + "loss": 0.3092, + "step": 15380 + }, + { + "epoch": 0.45, + "grad_norm": 1.443186197007777, + "learning_rate": 6.106323550218277e-06, + "loss": 0.2998, + "step": 15381 + }, + { + "epoch": 0.45, + "grad_norm": 1.2676622975633454, + "learning_rate": 6.1058654799606295e-06, + "loss": 0.3329, + "step": 15382 + }, + { + "epoch": 0.45, + "grad_norm": 1.2981099410427215, + "learning_rate": 6.105407399943708e-06, + "loss": 0.3263, + "step": 15383 + }, + { + "epoch": 0.45, + "grad_norm": 1.2201457607367316, + "learning_rate": 6.104949310171551e-06, + "loss": 0.3065, + "step": 15384 + }, + { + "epoch": 0.45, + "grad_norm": 1.3293148813786555, + "learning_rate": 6.104491210648207e-06, + "loss": 0.3321, + "step": 15385 + }, + { + "epoch": 0.45, + "grad_norm": 1.2203517061069185, + "learning_rate": 6.104033101377713e-06, + "loss": 0.3486, + "step": 15386 + }, + { + "epoch": 0.45, + "grad_norm": 1.3131082372390892, + "learning_rate": 6.103574982364118e-06, + "loss": 0.3231, + "step": 15387 + }, + { + "epoch": 0.45, + "grad_norm": 1.2616916503114497, + "learning_rate": 6.10311685361146e-06, + "loss": 0.318, + "step": 15388 + }, + { + "epoch": 0.45, + "grad_norm": 1.3567068900401984, + "learning_rate": 6.102658715123784e-06, + "loss": 0.3301, + "step": 15389 + }, + { + "epoch": 0.45, + "grad_norm": 1.3952799888189718, + "learning_rate": 6.102200566905131e-06, + "loss": 0.3244, + "step": 15390 + }, + { + "epoch": 0.45, + "grad_norm": 0.9498893583909384, + "learning_rate": 6.101742408959547e-06, + "loss": 0.5957, + "step": 15391 + }, + { + "epoch": 0.45, + "grad_norm": 1.347589750309982, + "learning_rate": 6.101284241291075e-06, + "loss": 0.3125, + "step": 15392 + }, + { + "epoch": 0.45, + "grad_norm": 1.375935911889421, + "learning_rate": 6.100826063903756e-06, + "loss": 0.3405, + "step": 15393 + }, + { + "epoch": 0.45, + "grad_norm": 1.2385171569151758, + "learning_rate": 6.100367876801633e-06, + "loss": 0.3167, + "step": 15394 + }, + { + "epoch": 0.45, + "grad_norm": 1.3154163017172655, + "learning_rate": 6.099909679988753e-06, + "loss": 0.3244, + "step": 15395 + }, + { + "epoch": 0.45, + "grad_norm": 1.3338997572012594, + "learning_rate": 6.099451473469158e-06, + "loss": 0.3311, + "step": 15396 + }, + { + "epoch": 0.45, + "grad_norm": 1.2808892007223671, + "learning_rate": 6.098993257246892e-06, + "loss": 0.2961, + "step": 15397 + }, + { + "epoch": 0.45, + "grad_norm": 1.4171450469674385, + "learning_rate": 6.098535031325997e-06, + "loss": 0.3325, + "step": 15398 + }, + { + "epoch": 0.45, + "grad_norm": 1.404243684588283, + "learning_rate": 6.098076795710519e-06, + "loss": 0.3118, + "step": 15399 + }, + { + "epoch": 0.45, + "grad_norm": 1.4053363998128936, + "learning_rate": 6.097618550404502e-06, + "loss": 0.3272, + "step": 15400 + }, + { + "epoch": 0.45, + "grad_norm": 2.5666521870430254, + "learning_rate": 6.097160295411988e-06, + "loss": 0.3186, + "step": 15401 + }, + { + "epoch": 0.45, + "grad_norm": 1.6756888991868732, + "learning_rate": 6.096702030737023e-06, + "loss": 0.3111, + "step": 15402 + }, + { + "epoch": 0.45, + "grad_norm": 1.265500075427137, + "learning_rate": 6.096243756383649e-06, + "loss": 0.3254, + "step": 15403 + }, + { + "epoch": 0.45, + "grad_norm": 1.0542098566883973, + "learning_rate": 6.095785472355913e-06, + "loss": 0.6846, + "step": 15404 + }, + { + "epoch": 0.45, + "grad_norm": 1.1785666701500204, + "learning_rate": 6.0953271786578575e-06, + "loss": 0.308, + "step": 15405 + }, + { + "epoch": 0.45, + "grad_norm": 1.2384086793433804, + "learning_rate": 6.094868875293528e-06, + "loss": 0.3109, + "step": 15406 + }, + { + "epoch": 0.45, + "grad_norm": 1.4244567177357959, + "learning_rate": 6.094410562266968e-06, + "loss": 0.314, + "step": 15407 + }, + { + "epoch": 0.45, + "grad_norm": 1.5168184924774049, + "learning_rate": 6.093952239582223e-06, + "loss": 0.3117, + "step": 15408 + }, + { + "epoch": 0.45, + "grad_norm": 1.6429553635830736, + "learning_rate": 6.093493907243337e-06, + "loss": 0.3349, + "step": 15409 + }, + { + "epoch": 0.45, + "grad_norm": 1.5064789183808978, + "learning_rate": 6.093035565254356e-06, + "loss": 0.3201, + "step": 15410 + }, + { + "epoch": 0.45, + "grad_norm": 1.3961046878422188, + "learning_rate": 6.092577213619323e-06, + "loss": 0.3259, + "step": 15411 + }, + { + "epoch": 0.45, + "grad_norm": 1.303451992815379, + "learning_rate": 6.092118852342283e-06, + "loss": 0.3309, + "step": 15412 + }, + { + "epoch": 0.45, + "grad_norm": 1.2833921299288598, + "learning_rate": 6.091660481427284e-06, + "loss": 0.3094, + "step": 15413 + }, + { + "epoch": 0.45, + "grad_norm": 1.2522880281304447, + "learning_rate": 6.091202100878367e-06, + "loss": 0.2994, + "step": 15414 + }, + { + "epoch": 0.45, + "grad_norm": 1.3145093475963308, + "learning_rate": 6.090743710699583e-06, + "loss": 0.3247, + "step": 15415 + }, + { + "epoch": 0.45, + "grad_norm": 1.2905650486925035, + "learning_rate": 6.09028531089497e-06, + "loss": 0.3216, + "step": 15416 + }, + { + "epoch": 0.45, + "grad_norm": 1.2097096021859193, + "learning_rate": 6.089826901468579e-06, + "loss": 0.3163, + "step": 15417 + }, + { + "epoch": 0.45, + "grad_norm": 1.3177755439739034, + "learning_rate": 6.089368482424454e-06, + "loss": 0.3315, + "step": 15418 + }, + { + "epoch": 0.45, + "grad_norm": 1.4216780349047602, + "learning_rate": 6.088910053766638e-06, + "loss": 0.3077, + "step": 15419 + }, + { + "epoch": 0.45, + "grad_norm": 1.433669523528054, + "learning_rate": 6.08845161549918e-06, + "loss": 0.3017, + "step": 15420 + }, + { + "epoch": 0.45, + "grad_norm": 2.440502739757709, + "learning_rate": 6.087993167626124e-06, + "loss": 0.3474, + "step": 15421 + }, + { + "epoch": 0.45, + "grad_norm": 1.3488234427104577, + "learning_rate": 6.087534710151516e-06, + "loss": 0.3261, + "step": 15422 + }, + { + "epoch": 0.45, + "grad_norm": 1.544427514433614, + "learning_rate": 6.087076243079403e-06, + "loss": 0.3156, + "step": 15423 + }, + { + "epoch": 0.45, + "grad_norm": 1.3969516977146574, + "learning_rate": 6.0866177664138295e-06, + "loss": 0.3108, + "step": 15424 + }, + { + "epoch": 0.45, + "grad_norm": 1.2356202589242173, + "learning_rate": 6.086159280158842e-06, + "loss": 0.3121, + "step": 15425 + }, + { + "epoch": 0.45, + "grad_norm": 1.296119062384223, + "learning_rate": 6.085700784318486e-06, + "loss": 0.3104, + "step": 15426 + }, + { + "epoch": 0.45, + "grad_norm": 1.429513037812635, + "learning_rate": 6.0852422788968106e-06, + "loss": 0.3309, + "step": 15427 + }, + { + "epoch": 0.45, + "grad_norm": 1.3494641903010691, + "learning_rate": 6.084783763897859e-06, + "loss": 0.3492, + "step": 15428 + }, + { + "epoch": 0.45, + "grad_norm": 1.2728807392746972, + "learning_rate": 6.084325239325679e-06, + "loss": 0.3245, + "step": 15429 + }, + { + "epoch": 0.45, + "grad_norm": 1.3992175415653656, + "learning_rate": 6.083866705184316e-06, + "loss": 0.3149, + "step": 15430 + }, + { + "epoch": 0.45, + "grad_norm": 1.638581423910013, + "learning_rate": 6.083408161477818e-06, + "loss": 0.3601, + "step": 15431 + }, + { + "epoch": 0.45, + "grad_norm": 2.0840194629643265, + "learning_rate": 6.082949608210231e-06, + "loss": 0.3462, + "step": 15432 + }, + { + "epoch": 0.45, + "grad_norm": 1.3313509352478485, + "learning_rate": 6.082491045385601e-06, + "loss": 0.3004, + "step": 15433 + }, + { + "epoch": 0.45, + "grad_norm": 1.397288681436795, + "learning_rate": 6.082032473007976e-06, + "loss": 0.3075, + "step": 15434 + }, + { + "epoch": 0.45, + "grad_norm": 1.216443714470475, + "learning_rate": 6.081573891081403e-06, + "loss": 0.3226, + "step": 15435 + }, + { + "epoch": 0.45, + "grad_norm": 1.2765718377592694, + "learning_rate": 6.081115299609926e-06, + "loss": 0.3261, + "step": 15436 + }, + { + "epoch": 0.45, + "grad_norm": 1.3618411229588343, + "learning_rate": 6.0806566985975965e-06, + "loss": 0.3307, + "step": 15437 + }, + { + "epoch": 0.45, + "grad_norm": 0.9458549229508921, + "learning_rate": 6.08019808804846e-06, + "loss": 0.5699, + "step": 15438 + }, + { + "epoch": 0.45, + "grad_norm": 1.2174662555219868, + "learning_rate": 6.079739467966563e-06, + "loss": 0.3181, + "step": 15439 + }, + { + "epoch": 0.45, + "grad_norm": 1.4286290045859575, + "learning_rate": 6.079280838355952e-06, + "loss": 0.3203, + "step": 15440 + }, + { + "epoch": 0.45, + "grad_norm": 1.4176222078524945, + "learning_rate": 6.0788221992206765e-06, + "loss": 0.3071, + "step": 15441 + }, + { + "epoch": 0.45, + "grad_norm": 1.277032471281228, + "learning_rate": 6.078363550564784e-06, + "loss": 0.2981, + "step": 15442 + }, + { + "epoch": 0.45, + "grad_norm": 1.5248154180372393, + "learning_rate": 6.07790489239232e-06, + "loss": 0.3133, + "step": 15443 + }, + { + "epoch": 0.45, + "grad_norm": 1.1645692666866743, + "learning_rate": 6.077446224707334e-06, + "loss": 0.2963, + "step": 15444 + }, + { + "epoch": 0.45, + "grad_norm": 1.195401973416383, + "learning_rate": 6.076987547513873e-06, + "loss": 0.317, + "step": 15445 + }, + { + "epoch": 0.45, + "grad_norm": 2.680952600397322, + "learning_rate": 6.076528860815984e-06, + "loss": 0.3321, + "step": 15446 + }, + { + "epoch": 0.45, + "grad_norm": 1.23558259257064, + "learning_rate": 6.076070164617718e-06, + "loss": 0.2999, + "step": 15447 + }, + { + "epoch": 0.45, + "grad_norm": 1.2582481260178215, + "learning_rate": 6.07561145892312e-06, + "loss": 0.3268, + "step": 15448 + }, + { + "epoch": 0.45, + "grad_norm": 1.249876139235157, + "learning_rate": 6.075152743736238e-06, + "loss": 0.3056, + "step": 15449 + }, + { + "epoch": 0.45, + "grad_norm": 1.2440138064826018, + "learning_rate": 6.074694019061121e-06, + "loss": 0.3142, + "step": 15450 + }, + { + "epoch": 0.45, + "grad_norm": 4.612267174854832, + "learning_rate": 6.074235284901817e-06, + "loss": 0.3268, + "step": 15451 + }, + { + "epoch": 0.45, + "grad_norm": 1.311817694810524, + "learning_rate": 6.0737765412623775e-06, + "loss": 0.3151, + "step": 15452 + }, + { + "epoch": 0.45, + "grad_norm": 1.8722491467455669, + "learning_rate": 6.0733177881468455e-06, + "loss": 0.3214, + "step": 15453 + }, + { + "epoch": 0.45, + "grad_norm": 1.276113371270528, + "learning_rate": 6.072859025559272e-06, + "loss": 0.2946, + "step": 15454 + }, + { + "epoch": 0.45, + "grad_norm": 1.7955971779260163, + "learning_rate": 6.072400253503707e-06, + "loss": 0.3406, + "step": 15455 + }, + { + "epoch": 0.45, + "grad_norm": 1.3657878605216498, + "learning_rate": 6.0719414719841985e-06, + "loss": 0.3435, + "step": 15456 + }, + { + "epoch": 0.45, + "grad_norm": 1.2437983018041003, + "learning_rate": 6.071482681004793e-06, + "loss": 0.3131, + "step": 15457 + }, + { + "epoch": 0.45, + "grad_norm": 1.4462508781475647, + "learning_rate": 6.071023880569541e-06, + "loss": 0.3258, + "step": 15458 + }, + { + "epoch": 0.45, + "grad_norm": 1.626739234493426, + "learning_rate": 6.070565070682492e-06, + "loss": 0.3255, + "step": 15459 + }, + { + "epoch": 0.45, + "grad_norm": 1.6621409864354997, + "learning_rate": 6.070106251347695e-06, + "loss": 0.3012, + "step": 15460 + }, + { + "epoch": 0.45, + "grad_norm": 1.2801825628026176, + "learning_rate": 6.069647422569198e-06, + "loss": 0.2971, + "step": 15461 + }, + { + "epoch": 0.45, + "grad_norm": 1.2580904267007693, + "learning_rate": 6.06918858435105e-06, + "loss": 0.3055, + "step": 15462 + }, + { + "epoch": 0.45, + "grad_norm": 0.8944442601494738, + "learning_rate": 6.068729736697301e-06, + "loss": 0.5484, + "step": 15463 + }, + { + "epoch": 0.45, + "grad_norm": 1.237088932910555, + "learning_rate": 6.068270879612001e-06, + "loss": 0.3214, + "step": 15464 + }, + { + "epoch": 0.45, + "grad_norm": 1.5365841188529457, + "learning_rate": 6.067812013099198e-06, + "loss": 0.303, + "step": 15465 + }, + { + "epoch": 0.45, + "grad_norm": 1.5914068537694765, + "learning_rate": 6.067353137162942e-06, + "loss": 0.3014, + "step": 15466 + }, + { + "epoch": 0.45, + "grad_norm": 1.3152089536260894, + "learning_rate": 6.0668942518072824e-06, + "loss": 0.3222, + "step": 15467 + }, + { + "epoch": 0.45, + "grad_norm": 1.4089499261971636, + "learning_rate": 6.06643535703627e-06, + "loss": 0.3259, + "step": 15468 + }, + { + "epoch": 0.45, + "grad_norm": 1.9290888570861995, + "learning_rate": 6.065976452853953e-06, + "loss": 0.3266, + "step": 15469 + }, + { + "epoch": 0.45, + "grad_norm": 1.209755437426685, + "learning_rate": 6.065517539264383e-06, + "loss": 0.3052, + "step": 15470 + }, + { + "epoch": 0.45, + "grad_norm": 1.5376651898839813, + "learning_rate": 6.065058616271608e-06, + "loss": 0.3196, + "step": 15471 + }, + { + "epoch": 0.45, + "grad_norm": 1.2142552637767754, + "learning_rate": 6.0645996838796794e-06, + "loss": 0.3159, + "step": 15472 + }, + { + "epoch": 0.45, + "grad_norm": 1.2711592358735349, + "learning_rate": 6.064140742092648e-06, + "loss": 0.3667, + "step": 15473 + }, + { + "epoch": 0.45, + "grad_norm": 1.6410858730075102, + "learning_rate": 6.06368179091456e-06, + "loss": 0.3301, + "step": 15474 + }, + { + "epoch": 0.45, + "grad_norm": 1.343383429197141, + "learning_rate": 6.063222830349471e-06, + "loss": 0.3363, + "step": 15475 + }, + { + "epoch": 0.45, + "grad_norm": 1.28934803379444, + "learning_rate": 6.062763860401427e-06, + "loss": 0.3244, + "step": 15476 + }, + { + "epoch": 0.45, + "grad_norm": 1.397151797980091, + "learning_rate": 6.062304881074481e-06, + "loss": 0.3157, + "step": 15477 + }, + { + "epoch": 0.45, + "grad_norm": 1.2881959038463977, + "learning_rate": 6.061845892372682e-06, + "loss": 0.3216, + "step": 15478 + }, + { + "epoch": 0.45, + "grad_norm": 1.2712468646440829, + "learning_rate": 6.061386894300082e-06, + "loss": 0.3156, + "step": 15479 + }, + { + "epoch": 0.45, + "grad_norm": 1.4333138794626508, + "learning_rate": 6.06092788686073e-06, + "loss": 0.3175, + "step": 15480 + }, + { + "epoch": 0.45, + "grad_norm": 1.159225190621533, + "learning_rate": 6.060468870058678e-06, + "loss": 0.3245, + "step": 15481 + }, + { + "epoch": 0.45, + "grad_norm": 1.3780872989771837, + "learning_rate": 6.060009843897976e-06, + "loss": 0.2957, + "step": 15482 + }, + { + "epoch": 0.45, + "grad_norm": 1.2740770258238348, + "learning_rate": 6.059550808382676e-06, + "loss": 0.3446, + "step": 15483 + }, + { + "epoch": 0.45, + "grad_norm": 1.3002073226616147, + "learning_rate": 6.059091763516829e-06, + "loss": 0.301, + "step": 15484 + }, + { + "epoch": 0.45, + "grad_norm": 0.9143083128051662, + "learning_rate": 6.058632709304484e-06, + "loss": 0.5825, + "step": 15485 + }, + { + "epoch": 0.45, + "grad_norm": 1.3579203271770766, + "learning_rate": 6.0581736457496935e-06, + "loss": 0.3397, + "step": 15486 + }, + { + "epoch": 0.45, + "grad_norm": 0.9277624881327153, + "learning_rate": 6.05771457285651e-06, + "loss": 0.6014, + "step": 15487 + }, + { + "epoch": 0.45, + "grad_norm": 1.2364309600807253, + "learning_rate": 6.0572554906289835e-06, + "loss": 0.3197, + "step": 15488 + }, + { + "epoch": 0.45, + "grad_norm": 5.516574181569822, + "learning_rate": 6.056796399071166e-06, + "loss": 0.3224, + "step": 15489 + }, + { + "epoch": 0.45, + "grad_norm": 1.5828550900979967, + "learning_rate": 6.056337298187107e-06, + "loss": 0.3216, + "step": 15490 + }, + { + "epoch": 0.45, + "grad_norm": 1.4997205330382721, + "learning_rate": 6.05587818798086e-06, + "loss": 0.2932, + "step": 15491 + }, + { + "epoch": 0.45, + "grad_norm": 1.2820837516435903, + "learning_rate": 6.055419068456478e-06, + "loss": 0.3182, + "step": 15492 + }, + { + "epoch": 0.45, + "grad_norm": 1.345218775267753, + "learning_rate": 6.054959939618011e-06, + "loss": 0.3183, + "step": 15493 + }, + { + "epoch": 0.45, + "grad_norm": 1.2959748144945482, + "learning_rate": 6.054500801469509e-06, + "loss": 0.3105, + "step": 15494 + }, + { + "epoch": 0.45, + "grad_norm": 1.4551009781209179, + "learning_rate": 6.054041654015025e-06, + "loss": 0.3333, + "step": 15495 + }, + { + "epoch": 0.45, + "grad_norm": 1.2999358222327426, + "learning_rate": 6.053582497258613e-06, + "loss": 0.2952, + "step": 15496 + }, + { + "epoch": 0.45, + "grad_norm": 1.4700283683373205, + "learning_rate": 6.053123331204324e-06, + "loss": 0.3161, + "step": 15497 + }, + { + "epoch": 0.45, + "grad_norm": 1.6672125580545376, + "learning_rate": 6.052664155856209e-06, + "loss": 0.3033, + "step": 15498 + }, + { + "epoch": 0.45, + "grad_norm": 1.2465388700126943, + "learning_rate": 6.0522049712183225e-06, + "loss": 0.3399, + "step": 15499 + }, + { + "epoch": 0.45, + "grad_norm": 1.263318672211034, + "learning_rate": 6.0517457772947145e-06, + "loss": 0.3113, + "step": 15500 + }, + { + "epoch": 0.45, + "grad_norm": 1.2808832606734968, + "learning_rate": 6.051286574089439e-06, + "loss": 0.3029, + "step": 15501 + }, + { + "epoch": 0.45, + "grad_norm": 1.1922002508397265, + "learning_rate": 6.050827361606549e-06, + "loss": 0.2994, + "step": 15502 + }, + { + "epoch": 0.45, + "grad_norm": 1.3971118974643904, + "learning_rate": 6.0503681398500936e-06, + "loss": 0.3076, + "step": 15503 + }, + { + "epoch": 0.45, + "grad_norm": 1.729951687365892, + "learning_rate": 6.049908908824128e-06, + "loss": 0.3244, + "step": 15504 + }, + { + "epoch": 0.45, + "grad_norm": 1.2508626414942816, + "learning_rate": 6.049449668532705e-06, + "loss": 0.3345, + "step": 15505 + }, + { + "epoch": 0.45, + "grad_norm": 2.825834966498015, + "learning_rate": 6.048990418979878e-06, + "loss": 0.3155, + "step": 15506 + }, + { + "epoch": 0.45, + "grad_norm": 1.3854618985888993, + "learning_rate": 6.048531160169698e-06, + "loss": 0.3471, + "step": 15507 + }, + { + "epoch": 0.45, + "grad_norm": 1.307341463052194, + "learning_rate": 6.048071892106219e-06, + "loss": 0.3142, + "step": 15508 + }, + { + "epoch": 0.45, + "grad_norm": 1.4520250203637448, + "learning_rate": 6.047612614793493e-06, + "loss": 0.3012, + "step": 15509 + }, + { + "epoch": 0.45, + "grad_norm": 1.5667532948960832, + "learning_rate": 6.0471533282355755e-06, + "loss": 0.3322, + "step": 15510 + }, + { + "epoch": 0.45, + "grad_norm": 1.2580106007217473, + "learning_rate": 6.046694032436517e-06, + "loss": 0.3114, + "step": 15511 + }, + { + "epoch": 0.45, + "grad_norm": 1.4381019907026662, + "learning_rate": 6.046234727400373e-06, + "loss": 0.3049, + "step": 15512 + }, + { + "epoch": 0.45, + "grad_norm": 1.454231777957459, + "learning_rate": 6.045775413131196e-06, + "loss": 0.3253, + "step": 15513 + }, + { + "epoch": 0.45, + "grad_norm": 1.286993292132473, + "learning_rate": 6.045316089633038e-06, + "loss": 0.3117, + "step": 15514 + }, + { + "epoch": 0.45, + "grad_norm": 1.3221856932596296, + "learning_rate": 6.044856756909956e-06, + "loss": 0.2937, + "step": 15515 + }, + { + "epoch": 0.45, + "grad_norm": 1.5560783985398265, + "learning_rate": 6.0443974149660015e-06, + "loss": 0.3094, + "step": 15516 + }, + { + "epoch": 0.45, + "grad_norm": 1.5796077114274603, + "learning_rate": 6.043938063805227e-06, + "loss": 0.3131, + "step": 15517 + }, + { + "epoch": 0.45, + "grad_norm": 1.4012268149004041, + "learning_rate": 6.043478703431688e-06, + "loss": 0.3338, + "step": 15518 + }, + { + "epoch": 0.45, + "grad_norm": 1.2745900712669098, + "learning_rate": 6.043019333849437e-06, + "loss": 0.3271, + "step": 15519 + }, + { + "epoch": 0.45, + "grad_norm": 1.5274601425599637, + "learning_rate": 6.042559955062531e-06, + "loss": 0.3551, + "step": 15520 + }, + { + "epoch": 0.45, + "grad_norm": 1.367977773360943, + "learning_rate": 6.0421005670750185e-06, + "loss": 0.3188, + "step": 15521 + }, + { + "epoch": 0.45, + "grad_norm": 1.2014214685463602, + "learning_rate": 6.041641169890958e-06, + "loss": 0.3201, + "step": 15522 + }, + { + "epoch": 0.45, + "grad_norm": 1.314265372118674, + "learning_rate": 6.041181763514403e-06, + "loss": 0.3028, + "step": 15523 + }, + { + "epoch": 0.45, + "grad_norm": 1.2672945912562934, + "learning_rate": 6.040722347949408e-06, + "loss": 0.3181, + "step": 15524 + }, + { + "epoch": 0.45, + "grad_norm": 1.468703061452083, + "learning_rate": 6.0402629232000275e-06, + "loss": 0.3112, + "step": 15525 + }, + { + "epoch": 0.45, + "grad_norm": 1.300364084878531, + "learning_rate": 6.039803489270314e-06, + "loss": 0.3282, + "step": 15526 + }, + { + "epoch": 0.45, + "grad_norm": 1.7503654991265556, + "learning_rate": 6.0393440461643225e-06, + "loss": 0.3141, + "step": 15527 + }, + { + "epoch": 0.45, + "grad_norm": 1.2686653007491853, + "learning_rate": 6.0388845938861085e-06, + "loss": 0.305, + "step": 15528 + }, + { + "epoch": 0.45, + "grad_norm": 1.2883974283511073, + "learning_rate": 6.0384251324397284e-06, + "loss": 0.3068, + "step": 15529 + }, + { + "epoch": 0.45, + "grad_norm": 1.2442312805388898, + "learning_rate": 6.037965661829234e-06, + "loss": 0.2918, + "step": 15530 + }, + { + "epoch": 0.45, + "grad_norm": 1.2114655395299059, + "learning_rate": 6.03750618205868e-06, + "loss": 0.2965, + "step": 15531 + }, + { + "epoch": 0.45, + "grad_norm": 1.3626941974714355, + "learning_rate": 6.0370466931321226e-06, + "loss": 0.3391, + "step": 15532 + }, + { + "epoch": 0.45, + "grad_norm": 1.2408191084752977, + "learning_rate": 6.036587195053617e-06, + "loss": 0.3139, + "step": 15533 + }, + { + "epoch": 0.45, + "grad_norm": 1.306027120959316, + "learning_rate": 6.036127687827219e-06, + "loss": 0.3092, + "step": 15534 + }, + { + "epoch": 0.45, + "grad_norm": 1.4676975781427153, + "learning_rate": 6.0356681714569806e-06, + "loss": 0.3102, + "step": 15535 + }, + { + "epoch": 0.45, + "grad_norm": 1.287417990654896, + "learning_rate": 6.03520864594696e-06, + "loss": 0.3085, + "step": 15536 + }, + { + "epoch": 0.45, + "grad_norm": 1.451832315053698, + "learning_rate": 6.0347491113012125e-06, + "loss": 0.2959, + "step": 15537 + }, + { + "epoch": 0.45, + "grad_norm": 1.4312179468592288, + "learning_rate": 6.034289567523792e-06, + "loss": 0.346, + "step": 15538 + }, + { + "epoch": 0.45, + "grad_norm": 2.0681126992912184, + "learning_rate": 6.0338300146187546e-06, + "loss": 0.3426, + "step": 15539 + }, + { + "epoch": 0.45, + "grad_norm": 1.2787810017056156, + "learning_rate": 6.033370452590155e-06, + "loss": 0.3172, + "step": 15540 + }, + { + "epoch": 0.45, + "grad_norm": 1.417609612069778, + "learning_rate": 6.03291088144205e-06, + "loss": 0.3294, + "step": 15541 + }, + { + "epoch": 0.45, + "grad_norm": 1.907008533313508, + "learning_rate": 6.032451301178496e-06, + "loss": 0.3115, + "step": 15542 + }, + { + "epoch": 0.45, + "grad_norm": 1.4513748655656853, + "learning_rate": 6.031991711803547e-06, + "loss": 0.3218, + "step": 15543 + }, + { + "epoch": 0.45, + "grad_norm": 1.2686649866222688, + "learning_rate": 6.0315321133212615e-06, + "loss": 0.3237, + "step": 15544 + }, + { + "epoch": 0.45, + "grad_norm": 1.462232345537497, + "learning_rate": 6.031072505735691e-06, + "loss": 0.2911, + "step": 15545 + }, + { + "epoch": 0.45, + "grad_norm": 1.2316175212100255, + "learning_rate": 6.030612889050896e-06, + "loss": 0.2993, + "step": 15546 + }, + { + "epoch": 0.45, + "grad_norm": 1.3387404563088787, + "learning_rate": 6.030153263270929e-06, + "loss": 0.304, + "step": 15547 + }, + { + "epoch": 0.45, + "grad_norm": 1.3036486600460828, + "learning_rate": 6.029693628399851e-06, + "loss": 0.3683, + "step": 15548 + }, + { + "epoch": 0.45, + "grad_norm": 1.0863976565342328, + "learning_rate": 6.029233984441713e-06, + "loss": 0.6031, + "step": 15549 + }, + { + "epoch": 0.45, + "grad_norm": 1.4988096706386835, + "learning_rate": 6.028774331400573e-06, + "loss": 0.3397, + "step": 15550 + }, + { + "epoch": 0.45, + "grad_norm": 1.417592529385708, + "learning_rate": 6.028314669280489e-06, + "loss": 0.3353, + "step": 15551 + }, + { + "epoch": 0.45, + "grad_norm": 5.34158785943009, + "learning_rate": 6.027854998085518e-06, + "loss": 0.3385, + "step": 15552 + }, + { + "epoch": 0.45, + "grad_norm": 1.553851399949654, + "learning_rate": 6.027395317819714e-06, + "loss": 0.3231, + "step": 15553 + }, + { + "epoch": 0.45, + "grad_norm": 1.292712614850813, + "learning_rate": 6.026935628487135e-06, + "loss": 0.3084, + "step": 15554 + }, + { + "epoch": 0.45, + "grad_norm": 1.4744735518549017, + "learning_rate": 6.026475930091837e-06, + "loss": 0.3202, + "step": 15555 + }, + { + "epoch": 0.45, + "grad_norm": 2.982472420759004, + "learning_rate": 6.0260162226378795e-06, + "loss": 0.3545, + "step": 15556 + }, + { + "epoch": 0.45, + "grad_norm": 1.4065428654148255, + "learning_rate": 6.0255565061293165e-06, + "loss": 0.3477, + "step": 15557 + }, + { + "epoch": 0.45, + "grad_norm": 1.254680693529171, + "learning_rate": 6.0250967805702055e-06, + "loss": 0.333, + "step": 15558 + }, + { + "epoch": 0.45, + "grad_norm": 0.9666280792652316, + "learning_rate": 6.024637045964604e-06, + "loss": 0.6207, + "step": 15559 + }, + { + "epoch": 0.45, + "grad_norm": 1.44122256795176, + "learning_rate": 6.024177302316569e-06, + "loss": 0.3138, + "step": 15560 + }, + { + "epoch": 0.45, + "grad_norm": 1.2347374589240285, + "learning_rate": 6.023717549630158e-06, + "loss": 0.3168, + "step": 15561 + }, + { + "epoch": 0.45, + "grad_norm": 1.326224026961701, + "learning_rate": 6.02325778790943e-06, + "loss": 0.3037, + "step": 15562 + }, + { + "epoch": 0.45, + "grad_norm": 1.4213046838858883, + "learning_rate": 6.0227980171584384e-06, + "loss": 0.3233, + "step": 15563 + }, + { + "epoch": 0.45, + "grad_norm": 1.385067826668809, + "learning_rate": 6.022338237381243e-06, + "loss": 0.3344, + "step": 15564 + }, + { + "epoch": 0.45, + "grad_norm": 1.3179368077368192, + "learning_rate": 6.021878448581903e-06, + "loss": 0.3021, + "step": 15565 + }, + { + "epoch": 0.45, + "grad_norm": 1.2608707852108691, + "learning_rate": 6.0214186507644745e-06, + "loss": 0.3037, + "step": 15566 + }, + { + "epoch": 0.45, + "grad_norm": 1.811672460838261, + "learning_rate": 6.020958843933014e-06, + "loss": 0.3049, + "step": 15567 + }, + { + "epoch": 0.45, + "grad_norm": 1.3083015833501757, + "learning_rate": 6.02049902809158e-06, + "loss": 0.3436, + "step": 15568 + }, + { + "epoch": 0.45, + "grad_norm": 1.3368683038821487, + "learning_rate": 6.02003920324423e-06, + "loss": 0.3008, + "step": 15569 + }, + { + "epoch": 0.45, + "grad_norm": 1.3119974297183792, + "learning_rate": 6.019579369395026e-06, + "loss": 0.3154, + "step": 15570 + }, + { + "epoch": 0.45, + "grad_norm": 1.2914310919796799, + "learning_rate": 6.01911952654802e-06, + "loss": 0.3206, + "step": 15571 + }, + { + "epoch": 0.45, + "grad_norm": 1.2074135047321901, + "learning_rate": 6.018659674707273e-06, + "loss": 0.2996, + "step": 15572 + }, + { + "epoch": 0.45, + "grad_norm": 1.238361734603944, + "learning_rate": 6.018199813876844e-06, + "loss": 0.3322, + "step": 15573 + }, + { + "epoch": 0.45, + "grad_norm": 2.121812725689242, + "learning_rate": 6.01773994406079e-06, + "loss": 0.3035, + "step": 15574 + }, + { + "epoch": 0.45, + "grad_norm": 1.8125415888704308, + "learning_rate": 6.0172800652631706e-06, + "loss": 0.333, + "step": 15575 + }, + { + "epoch": 0.45, + "grad_norm": 1.3333527107320924, + "learning_rate": 6.016820177488042e-06, + "loss": 0.3276, + "step": 15576 + }, + { + "epoch": 0.45, + "grad_norm": 1.261952023064421, + "learning_rate": 6.016360280739464e-06, + "loss": 0.2959, + "step": 15577 + }, + { + "epoch": 0.45, + "grad_norm": 1.4734259237432128, + "learning_rate": 6.015900375021495e-06, + "loss": 0.3349, + "step": 15578 + }, + { + "epoch": 0.45, + "grad_norm": 1.8082670051654024, + "learning_rate": 6.015440460338195e-06, + "loss": 0.3149, + "step": 15579 + }, + { + "epoch": 0.45, + "grad_norm": 1.2283592052084242, + "learning_rate": 6.014980536693622e-06, + "loss": 0.2982, + "step": 15580 + }, + { + "epoch": 0.45, + "grad_norm": 1.4502472610898007, + "learning_rate": 6.014520604091834e-06, + "loss": 0.3011, + "step": 15581 + }, + { + "epoch": 0.45, + "grad_norm": 1.4640361289031878, + "learning_rate": 6.014060662536889e-06, + "loss": 0.3081, + "step": 15582 + }, + { + "epoch": 0.45, + "grad_norm": 1.702545241243656, + "learning_rate": 6.013600712032849e-06, + "loss": 0.3124, + "step": 15583 + }, + { + "epoch": 0.45, + "grad_norm": 1.2131050249095827, + "learning_rate": 6.013140752583771e-06, + "loss": 0.2982, + "step": 15584 + }, + { + "epoch": 0.45, + "grad_norm": 1.263149538615144, + "learning_rate": 6.012680784193715e-06, + "loss": 0.3477, + "step": 15585 + }, + { + "epoch": 0.45, + "grad_norm": 1.4016807491208259, + "learning_rate": 6.012220806866741e-06, + "loss": 0.3205, + "step": 15586 + }, + { + "epoch": 0.45, + "grad_norm": 1.4422205541296542, + "learning_rate": 6.0117608206069046e-06, + "loss": 0.3577, + "step": 15587 + }, + { + "epoch": 0.45, + "grad_norm": 1.3659407667644237, + "learning_rate": 6.011300825418269e-06, + "loss": 0.3381, + "step": 15588 + }, + { + "epoch": 0.45, + "grad_norm": 1.4670830464705358, + "learning_rate": 6.010840821304893e-06, + "loss": 0.3325, + "step": 15589 + }, + { + "epoch": 0.45, + "grad_norm": 1.4801609569810275, + "learning_rate": 6.0103808082708345e-06, + "loss": 0.3225, + "step": 15590 + }, + { + "epoch": 0.45, + "grad_norm": 1.0000384052285858, + "learning_rate": 6.0099207863201535e-06, + "loss": 0.5923, + "step": 15591 + }, + { + "epoch": 0.45, + "grad_norm": 1.2623510951644035, + "learning_rate": 6.0094607554569114e-06, + "loss": 0.3194, + "step": 15592 + }, + { + "epoch": 0.45, + "grad_norm": 1.4083257697540505, + "learning_rate": 6.009000715685168e-06, + "loss": 0.3356, + "step": 15593 + }, + { + "epoch": 0.45, + "grad_norm": 1.3856706330159911, + "learning_rate": 6.008540667008981e-06, + "loss": 0.3037, + "step": 15594 + }, + { + "epoch": 0.45, + "grad_norm": 1.4134670068348423, + "learning_rate": 6.008080609432411e-06, + "loss": 0.3172, + "step": 15595 + }, + { + "epoch": 0.45, + "grad_norm": 1.2701106551056267, + "learning_rate": 6.007620542959518e-06, + "loss": 0.3079, + "step": 15596 + }, + { + "epoch": 0.45, + "grad_norm": 1.8765836752977312, + "learning_rate": 6.007160467594363e-06, + "loss": 0.3203, + "step": 15597 + }, + { + "epoch": 0.45, + "grad_norm": 1.4535699184646673, + "learning_rate": 6.006700383341007e-06, + "loss": 0.3809, + "step": 15598 + }, + { + "epoch": 0.45, + "grad_norm": 1.519105434550696, + "learning_rate": 6.006240290203508e-06, + "loss": 0.3138, + "step": 15599 + }, + { + "epoch": 0.45, + "grad_norm": 1.2608507027178615, + "learning_rate": 6.005780188185928e-06, + "loss": 0.3125, + "step": 15600 + }, + { + "epoch": 0.45, + "grad_norm": 1.3664805104803608, + "learning_rate": 6.005320077292326e-06, + "loss": 0.3194, + "step": 15601 + }, + { + "epoch": 0.45, + "grad_norm": 2.041608511284444, + "learning_rate": 6.0048599575267615e-06, + "loss": 0.324, + "step": 15602 + }, + { + "epoch": 0.45, + "grad_norm": 1.3256691022146825, + "learning_rate": 6.004399828893301e-06, + "loss": 0.3171, + "step": 15603 + }, + { + "epoch": 0.45, + "grad_norm": 1.36758894925091, + "learning_rate": 6.003939691395997e-06, + "loss": 0.3074, + "step": 15604 + }, + { + "epoch": 0.45, + "grad_norm": 1.6118588117804156, + "learning_rate": 6.003479545038915e-06, + "loss": 0.3284, + "step": 15605 + }, + { + "epoch": 0.45, + "grad_norm": 1.280066426473836, + "learning_rate": 6.0030193898261145e-06, + "loss": 0.3004, + "step": 15606 + }, + { + "epoch": 0.45, + "grad_norm": 2.5071243416979296, + "learning_rate": 6.0025592257616585e-06, + "loss": 0.3317, + "step": 15607 + }, + { + "epoch": 0.45, + "grad_norm": 1.7070708102332206, + "learning_rate": 6.002099052849605e-06, + "loss": 0.3166, + "step": 15608 + }, + { + "epoch": 0.45, + "grad_norm": 1.7494049665394003, + "learning_rate": 6.001638871094015e-06, + "loss": 0.3277, + "step": 15609 + }, + { + "epoch": 0.45, + "grad_norm": 1.3862861185451163, + "learning_rate": 6.001178680498952e-06, + "loss": 0.3019, + "step": 15610 + }, + { + "epoch": 0.45, + "grad_norm": 1.5893490395021597, + "learning_rate": 6.000718481068478e-06, + "loss": 0.395, + "step": 15611 + }, + { + "epoch": 0.45, + "grad_norm": 1.3005212364298901, + "learning_rate": 6.000258272806651e-06, + "loss": 0.3177, + "step": 15612 + }, + { + "epoch": 0.45, + "grad_norm": 1.3545232884609937, + "learning_rate": 5.999798055717532e-06, + "loss": 0.3082, + "step": 15613 + }, + { + "epoch": 0.45, + "grad_norm": 1.3003631422813462, + "learning_rate": 5.999337829805186e-06, + "loss": 0.319, + "step": 15614 + }, + { + "epoch": 0.45, + "grad_norm": 1.8398844840924433, + "learning_rate": 5.998877595073672e-06, + "loss": 0.3248, + "step": 15615 + }, + { + "epoch": 0.45, + "grad_norm": 1.6129466352811774, + "learning_rate": 5.998417351527053e-06, + "loss": 0.3246, + "step": 15616 + }, + { + "epoch": 0.45, + "grad_norm": 1.2599514519901986, + "learning_rate": 5.997957099169388e-06, + "loss": 0.3069, + "step": 15617 + }, + { + "epoch": 0.45, + "grad_norm": 1.2711245297900562, + "learning_rate": 5.997496838004741e-06, + "loss": 0.3039, + "step": 15618 + }, + { + "epoch": 0.45, + "grad_norm": 1.431643852039149, + "learning_rate": 5.997036568037174e-06, + "loss": 0.3294, + "step": 15619 + }, + { + "epoch": 0.45, + "grad_norm": 1.5433862290106732, + "learning_rate": 5.996576289270747e-06, + "loss": 0.3135, + "step": 15620 + }, + { + "epoch": 0.45, + "grad_norm": 1.3770133780054756, + "learning_rate": 5.996116001709525e-06, + "loss": 0.292, + "step": 15621 + }, + { + "epoch": 0.45, + "grad_norm": 1.2858774134943303, + "learning_rate": 5.995655705357567e-06, + "loss": 0.3138, + "step": 15622 + }, + { + "epoch": 0.45, + "grad_norm": 1.3765771599706749, + "learning_rate": 5.995195400218936e-06, + "loss": 0.2991, + "step": 15623 + }, + { + "epoch": 0.45, + "grad_norm": 1.2593468645813883, + "learning_rate": 5.994735086297695e-06, + "loss": 0.3184, + "step": 15624 + }, + { + "epoch": 0.45, + "grad_norm": 2.195785684728921, + "learning_rate": 5.994274763597907e-06, + "loss": 0.3129, + "step": 15625 + }, + { + "epoch": 0.45, + "grad_norm": 1.4581101649327983, + "learning_rate": 5.993814432123633e-06, + "loss": 0.3368, + "step": 15626 + }, + { + "epoch": 0.45, + "grad_norm": 1.8751110418626227, + "learning_rate": 5.9933540918789336e-06, + "loss": 0.2969, + "step": 15627 + }, + { + "epoch": 0.45, + "grad_norm": 1.4610689494476872, + "learning_rate": 5.992893742867876e-06, + "loss": 0.3205, + "step": 15628 + }, + { + "epoch": 0.45, + "grad_norm": 1.4716929189975774, + "learning_rate": 5.992433385094519e-06, + "loss": 0.3006, + "step": 15629 + }, + { + "epoch": 0.45, + "grad_norm": 1.3742599391539039, + "learning_rate": 5.991973018562925e-06, + "loss": 0.3298, + "step": 15630 + }, + { + "epoch": 0.45, + "grad_norm": 1.3854869911817758, + "learning_rate": 5.99151264327716e-06, + "loss": 0.315, + "step": 15631 + }, + { + "epoch": 0.45, + "grad_norm": 1.7813332167076585, + "learning_rate": 5.991052259241283e-06, + "loss": 0.3386, + "step": 15632 + }, + { + "epoch": 0.45, + "grad_norm": 1.2478249362885347, + "learning_rate": 5.99059186645936e-06, + "loss": 0.3232, + "step": 15633 + }, + { + "epoch": 0.45, + "grad_norm": 1.3606476122061861, + "learning_rate": 5.990131464935453e-06, + "loss": 0.3215, + "step": 15634 + }, + { + "epoch": 0.45, + "grad_norm": 1.5787181455835786, + "learning_rate": 5.989671054673625e-06, + "loss": 0.3047, + "step": 15635 + }, + { + "epoch": 0.45, + "grad_norm": 1.4025727691149932, + "learning_rate": 5.989210635677938e-06, + "loss": 0.3263, + "step": 15636 + }, + { + "epoch": 0.45, + "grad_norm": 1.2827293418369903, + "learning_rate": 5.988750207952456e-06, + "loss": 0.3049, + "step": 15637 + }, + { + "epoch": 0.45, + "grad_norm": 1.4573385085546957, + "learning_rate": 5.988289771501243e-06, + "loss": 0.2955, + "step": 15638 + }, + { + "epoch": 0.45, + "grad_norm": 1.5232143646405472, + "learning_rate": 5.987829326328363e-06, + "loss": 0.3397, + "step": 15639 + }, + { + "epoch": 0.45, + "grad_norm": 1.4736052607006571, + "learning_rate": 5.9873688724378764e-06, + "loss": 0.3103, + "step": 15640 + }, + { + "epoch": 0.45, + "grad_norm": 1.3077213970712849, + "learning_rate": 5.9869084098338494e-06, + "loss": 0.3053, + "step": 15641 + }, + { + "epoch": 0.45, + "grad_norm": 1.2934395144258193, + "learning_rate": 5.986447938520344e-06, + "loss": 0.2906, + "step": 15642 + }, + { + "epoch": 0.45, + "grad_norm": 1.3496162104266785, + "learning_rate": 5.985987458501426e-06, + "loss": 0.3196, + "step": 15643 + }, + { + "epoch": 0.45, + "grad_norm": 1.4153564630848847, + "learning_rate": 5.985526969781158e-06, + "loss": 0.3238, + "step": 15644 + }, + { + "epoch": 0.45, + "grad_norm": 1.2431728786561924, + "learning_rate": 5.9850664723636025e-06, + "loss": 0.3219, + "step": 15645 + }, + { + "epoch": 0.45, + "grad_norm": 1.4612855444352335, + "learning_rate": 5.984605966252823e-06, + "loss": 0.3129, + "step": 15646 + }, + { + "epoch": 0.45, + "grad_norm": 1.167515573941983, + "learning_rate": 5.984145451452886e-06, + "loss": 0.2851, + "step": 15647 + }, + { + "epoch": 0.45, + "grad_norm": 1.1518499992789077, + "learning_rate": 5.983684927967856e-06, + "loss": 0.2968, + "step": 15648 + }, + { + "epoch": 0.45, + "grad_norm": 1.2303902193273888, + "learning_rate": 5.983224395801794e-06, + "loss": 0.3353, + "step": 15649 + }, + { + "epoch": 0.45, + "grad_norm": 1.3165954825689212, + "learning_rate": 5.982763854958766e-06, + "loss": 0.3141, + "step": 15650 + }, + { + "epoch": 0.45, + "grad_norm": 1.4946137120958152, + "learning_rate": 5.9823033054428355e-06, + "loss": 0.317, + "step": 15651 + }, + { + "epoch": 0.45, + "grad_norm": 1.6987977083734356, + "learning_rate": 5.98184274725807e-06, + "loss": 0.3125, + "step": 15652 + }, + { + "epoch": 0.45, + "grad_norm": 2.113286528488373, + "learning_rate": 5.981382180408529e-06, + "loss": 0.3147, + "step": 15653 + }, + { + "epoch": 0.45, + "grad_norm": 0.9315370043067933, + "learning_rate": 5.98092160489828e-06, + "loss": 0.5925, + "step": 15654 + }, + { + "epoch": 0.45, + "grad_norm": 1.3831546631820353, + "learning_rate": 5.980461020731386e-06, + "loss": 0.3427, + "step": 15655 + }, + { + "epoch": 0.45, + "grad_norm": 1.3131428716280975, + "learning_rate": 5.980000427911915e-06, + "loss": 0.3017, + "step": 15656 + }, + { + "epoch": 0.45, + "grad_norm": 1.7234687301063036, + "learning_rate": 5.979539826443928e-06, + "loss": 0.3206, + "step": 15657 + }, + { + "epoch": 0.45, + "grad_norm": 1.4422510230935395, + "learning_rate": 5.9790792163314915e-06, + "loss": 0.3168, + "step": 15658 + }, + { + "epoch": 0.45, + "grad_norm": 1.3848337815380933, + "learning_rate": 5.9786185975786694e-06, + "loss": 0.2999, + "step": 15659 + }, + { + "epoch": 0.45, + "grad_norm": 2.5737225878053533, + "learning_rate": 5.9781579701895274e-06, + "loss": 0.3217, + "step": 15660 + }, + { + "epoch": 0.45, + "grad_norm": 1.6356468638931096, + "learning_rate": 5.9776973341681295e-06, + "loss": 0.3281, + "step": 15661 + }, + { + "epoch": 0.45, + "grad_norm": 1.574729138627238, + "learning_rate": 5.977236689518544e-06, + "loss": 0.336, + "step": 15662 + }, + { + "epoch": 0.45, + "grad_norm": 1.5960686353114317, + "learning_rate": 5.976776036244833e-06, + "loss": 0.3248, + "step": 15663 + }, + { + "epoch": 0.45, + "grad_norm": 1.660377789752301, + "learning_rate": 5.976315374351061e-06, + "loss": 0.291, + "step": 15664 + }, + { + "epoch": 0.45, + "grad_norm": 1.3117102730533727, + "learning_rate": 5.975854703841297e-06, + "loss": 0.3425, + "step": 15665 + }, + { + "epoch": 0.45, + "grad_norm": 1.3985221581496123, + "learning_rate": 5.975394024719605e-06, + "loss": 0.3323, + "step": 15666 + }, + { + "epoch": 0.45, + "grad_norm": 1.503792578497194, + "learning_rate": 5.9749333369900485e-06, + "loss": 0.3456, + "step": 15667 + }, + { + "epoch": 0.45, + "grad_norm": 1.9671090681341281, + "learning_rate": 5.9744726406566945e-06, + "loss": 0.334, + "step": 15668 + }, + { + "epoch": 0.45, + "grad_norm": 1.2838041956087047, + "learning_rate": 5.974011935723608e-06, + "loss": 0.3217, + "step": 15669 + }, + { + "epoch": 0.45, + "grad_norm": 1.8641773079072628, + "learning_rate": 5.9735512221948576e-06, + "loss": 0.3098, + "step": 15670 + }, + { + "epoch": 0.45, + "grad_norm": 1.2483584891830284, + "learning_rate": 5.973090500074505e-06, + "loss": 0.3015, + "step": 15671 + }, + { + "epoch": 0.45, + "grad_norm": 1.2620754311320672, + "learning_rate": 5.972629769366619e-06, + "loss": 0.3121, + "step": 15672 + }, + { + "epoch": 0.45, + "grad_norm": 1.6337493576167772, + "learning_rate": 5.972169030075263e-06, + "loss": 0.3152, + "step": 15673 + }, + { + "epoch": 0.45, + "grad_norm": 1.4969087507574557, + "learning_rate": 5.971708282204506e-06, + "loss": 0.3278, + "step": 15674 + }, + { + "epoch": 0.45, + "grad_norm": 1.2610192853230195, + "learning_rate": 5.971247525758411e-06, + "loss": 0.3049, + "step": 15675 + }, + { + "epoch": 0.45, + "grad_norm": 1.348911020106274, + "learning_rate": 5.970786760741049e-06, + "loss": 0.3194, + "step": 15676 + }, + { + "epoch": 0.45, + "grad_norm": 1.2705991917938722, + "learning_rate": 5.97032598715648e-06, + "loss": 0.3207, + "step": 15677 + }, + { + "epoch": 0.45, + "grad_norm": 1.440534887934335, + "learning_rate": 5.969865205008774e-06, + "loss": 0.3091, + "step": 15678 + }, + { + "epoch": 0.45, + "grad_norm": 1.439100580930771, + "learning_rate": 5.969404414301997e-06, + "loss": 0.3257, + "step": 15679 + }, + { + "epoch": 0.45, + "grad_norm": 1.3127897514343947, + "learning_rate": 5.9689436150402155e-06, + "loss": 0.3423, + "step": 15680 + }, + { + "epoch": 0.45, + "grad_norm": 1.2889582142145541, + "learning_rate": 5.968482807227496e-06, + "loss": 0.3043, + "step": 15681 + }, + { + "epoch": 0.45, + "grad_norm": 1.427674443692183, + "learning_rate": 5.9680219908679045e-06, + "loss": 0.3282, + "step": 15682 + }, + { + "epoch": 0.45, + "grad_norm": 1.2452300488989738, + "learning_rate": 5.967561165965509e-06, + "loss": 0.3065, + "step": 15683 + }, + { + "epoch": 0.45, + "grad_norm": 1.2595405763418968, + "learning_rate": 5.967100332524375e-06, + "loss": 0.2893, + "step": 15684 + }, + { + "epoch": 0.45, + "grad_norm": 1.39216220295501, + "learning_rate": 5.96663949054857e-06, + "loss": 0.3069, + "step": 15685 + }, + { + "epoch": 0.45, + "grad_norm": 1.2406630050391672, + "learning_rate": 5.96617864004216e-06, + "loss": 0.2962, + "step": 15686 + }, + { + "epoch": 0.45, + "grad_norm": 1.3763857589657267, + "learning_rate": 5.9657177810092135e-06, + "loss": 0.3143, + "step": 15687 + }, + { + "epoch": 0.46, + "grad_norm": 1.3370166098460752, + "learning_rate": 5.965256913453797e-06, + "loss": 0.3593, + "step": 15688 + }, + { + "epoch": 0.46, + "grad_norm": 1.2403969316717989, + "learning_rate": 5.964796037379978e-06, + "loss": 0.3378, + "step": 15689 + }, + { + "epoch": 0.46, + "grad_norm": 1.4264219815667551, + "learning_rate": 5.964335152791823e-06, + "loss": 0.3233, + "step": 15690 + }, + { + "epoch": 0.46, + "grad_norm": 1.2957137177468288, + "learning_rate": 5.963874259693399e-06, + "loss": 0.3213, + "step": 15691 + }, + { + "epoch": 0.46, + "grad_norm": 1.3366016068001056, + "learning_rate": 5.963413358088773e-06, + "loss": 0.31, + "step": 15692 + }, + { + "epoch": 0.46, + "grad_norm": 1.0090033244100736, + "learning_rate": 5.962952447982015e-06, + "loss": 0.6287, + "step": 15693 + }, + { + "epoch": 0.46, + "grad_norm": 1.2901248254263982, + "learning_rate": 5.96249152937719e-06, + "loss": 0.3203, + "step": 15694 + }, + { + "epoch": 0.46, + "grad_norm": 1.316388695180822, + "learning_rate": 5.962030602278367e-06, + "loss": 0.2925, + "step": 15695 + }, + { + "epoch": 0.46, + "grad_norm": 1.288863562248574, + "learning_rate": 5.961569666689612e-06, + "loss": 0.2955, + "step": 15696 + }, + { + "epoch": 0.46, + "grad_norm": 1.4090766645128572, + "learning_rate": 5.961108722614996e-06, + "loss": 0.3042, + "step": 15697 + }, + { + "epoch": 0.46, + "grad_norm": 1.3602501499571493, + "learning_rate": 5.9606477700585855e-06, + "loss": 0.3331, + "step": 15698 + }, + { + "epoch": 0.46, + "grad_norm": 1.3649673037917676, + "learning_rate": 5.960186809024446e-06, + "loss": 0.3164, + "step": 15699 + }, + { + "epoch": 0.46, + "grad_norm": 1.2663974448419726, + "learning_rate": 5.959725839516648e-06, + "loss": 0.3426, + "step": 15700 + }, + { + "epoch": 0.46, + "grad_norm": 1.3316380065926117, + "learning_rate": 5.959264861539258e-06, + "loss": 0.3284, + "step": 15701 + }, + { + "epoch": 0.46, + "grad_norm": 1.3516857266275584, + "learning_rate": 5.958803875096346e-06, + "loss": 0.3265, + "step": 15702 + }, + { + "epoch": 0.46, + "grad_norm": 1.2852276476862918, + "learning_rate": 5.958342880191978e-06, + "loss": 0.3073, + "step": 15703 + }, + { + "epoch": 0.46, + "grad_norm": 1.2622443341265954, + "learning_rate": 5.9578818768302245e-06, + "loss": 0.3043, + "step": 15704 + }, + { + "epoch": 0.46, + "grad_norm": 1.3265700652564365, + "learning_rate": 5.957420865015152e-06, + "loss": 0.3125, + "step": 15705 + }, + { + "epoch": 0.46, + "grad_norm": 1.2654653256302313, + "learning_rate": 5.956959844750831e-06, + "loss": 0.3294, + "step": 15706 + }, + { + "epoch": 0.46, + "grad_norm": 1.796058920897199, + "learning_rate": 5.956498816041328e-06, + "loss": 0.3175, + "step": 15707 + }, + { + "epoch": 0.46, + "grad_norm": 1.3997311169363837, + "learning_rate": 5.956037778890712e-06, + "loss": 0.2951, + "step": 15708 + }, + { + "epoch": 0.46, + "grad_norm": 1.4439045330010019, + "learning_rate": 5.955576733303053e-06, + "loss": 0.32, + "step": 15709 + }, + { + "epoch": 0.46, + "grad_norm": 1.4960791800523763, + "learning_rate": 5.9551156792824175e-06, + "loss": 0.3274, + "step": 15710 + }, + { + "epoch": 0.46, + "grad_norm": 1.5567358489940575, + "learning_rate": 5.954654616832874e-06, + "loss": 0.3178, + "step": 15711 + }, + { + "epoch": 0.46, + "grad_norm": 1.3980865676874452, + "learning_rate": 5.954193545958497e-06, + "loss": 0.3104, + "step": 15712 + }, + { + "epoch": 0.46, + "grad_norm": 1.3667115643866161, + "learning_rate": 5.95373246666335e-06, + "loss": 0.3216, + "step": 15713 + }, + { + "epoch": 0.46, + "grad_norm": 1.332657134440066, + "learning_rate": 5.953271378951502e-06, + "loss": 0.3014, + "step": 15714 + }, + { + "epoch": 0.46, + "grad_norm": 1.301110154412222, + "learning_rate": 5.952810282827024e-06, + "loss": 0.3241, + "step": 15715 + }, + { + "epoch": 0.46, + "grad_norm": 1.5623838913934776, + "learning_rate": 5.952349178293985e-06, + "loss": 0.3442, + "step": 15716 + }, + { + "epoch": 0.46, + "grad_norm": 0.9475288763167484, + "learning_rate": 5.9518880653564534e-06, + "loss": 0.5616, + "step": 15717 + }, + { + "epoch": 0.46, + "grad_norm": 1.2703745277014042, + "learning_rate": 5.9514269440185e-06, + "loss": 0.2989, + "step": 15718 + }, + { + "epoch": 0.46, + "grad_norm": 1.3767730376182075, + "learning_rate": 5.9509658142841916e-06, + "loss": 0.325, + "step": 15719 + }, + { + "epoch": 0.46, + "grad_norm": 1.3580740928728634, + "learning_rate": 5.950504676157601e-06, + "loss": 0.3074, + "step": 15720 + }, + { + "epoch": 0.46, + "grad_norm": 1.2581165273276231, + "learning_rate": 5.950043529642796e-06, + "loss": 0.3243, + "step": 15721 + }, + { + "epoch": 0.46, + "grad_norm": 1.6488377835217383, + "learning_rate": 5.949582374743846e-06, + "loss": 0.3381, + "step": 15722 + }, + { + "epoch": 0.46, + "grad_norm": 1.382729290332561, + "learning_rate": 5.9491212114648215e-06, + "loss": 0.3342, + "step": 15723 + }, + { + "epoch": 0.46, + "grad_norm": 1.5938076050690142, + "learning_rate": 5.9486600398097905e-06, + "loss": 0.3342, + "step": 15724 + }, + { + "epoch": 0.46, + "grad_norm": 1.6581697448290575, + "learning_rate": 5.948198859782826e-06, + "loss": 0.3195, + "step": 15725 + }, + { + "epoch": 0.46, + "grad_norm": 1.2267011321218713, + "learning_rate": 5.947737671387994e-06, + "loss": 0.347, + "step": 15726 + }, + { + "epoch": 0.46, + "grad_norm": 1.4431682411551607, + "learning_rate": 5.947276474629367e-06, + "loss": 0.3235, + "step": 15727 + }, + { + "epoch": 0.46, + "grad_norm": 1.4491302280311624, + "learning_rate": 5.946815269511015e-06, + "loss": 0.3002, + "step": 15728 + }, + { + "epoch": 0.46, + "grad_norm": 1.2764748075286014, + "learning_rate": 5.946354056037008e-06, + "loss": 0.3066, + "step": 15729 + }, + { + "epoch": 0.46, + "grad_norm": 1.3562249029359472, + "learning_rate": 5.945892834211415e-06, + "loss": 0.309, + "step": 15730 + }, + { + "epoch": 0.46, + "grad_norm": 1.5537706736409116, + "learning_rate": 5.9454316040383085e-06, + "loss": 0.3163, + "step": 15731 + }, + { + "epoch": 0.46, + "grad_norm": 1.2800191282861564, + "learning_rate": 5.944970365521757e-06, + "loss": 0.292, + "step": 15732 + }, + { + "epoch": 0.46, + "grad_norm": 2.857336085124343, + "learning_rate": 5.944509118665831e-06, + "loss": 0.3322, + "step": 15733 + }, + { + "epoch": 0.46, + "grad_norm": 1.4242909596971536, + "learning_rate": 5.9440478634746026e-06, + "loss": 0.3046, + "step": 15734 + }, + { + "epoch": 0.46, + "grad_norm": 1.4611870845026038, + "learning_rate": 5.943586599952142e-06, + "loss": 0.3108, + "step": 15735 + }, + { + "epoch": 0.46, + "grad_norm": 1.4040541791802406, + "learning_rate": 5.943125328102517e-06, + "loss": 0.2911, + "step": 15736 + }, + { + "epoch": 0.46, + "grad_norm": 1.5317931045962399, + "learning_rate": 5.942664047929801e-06, + "loss": 0.3199, + "step": 15737 + }, + { + "epoch": 0.46, + "grad_norm": 1.4951657513995915, + "learning_rate": 5.942202759438065e-06, + "loss": 0.3315, + "step": 15738 + }, + { + "epoch": 0.46, + "grad_norm": 1.4293140086724059, + "learning_rate": 5.941741462631379e-06, + "loss": 0.3166, + "step": 15739 + }, + { + "epoch": 0.46, + "grad_norm": 1.7042679742946871, + "learning_rate": 5.941280157513815e-06, + "loss": 0.3537, + "step": 15740 + }, + { + "epoch": 0.46, + "grad_norm": 1.698599822181269, + "learning_rate": 5.940818844089443e-06, + "loss": 0.3008, + "step": 15741 + }, + { + "epoch": 0.46, + "grad_norm": 1.412235850518803, + "learning_rate": 5.940357522362333e-06, + "loss": 0.3179, + "step": 15742 + }, + { + "epoch": 0.46, + "grad_norm": 1.6773647831352878, + "learning_rate": 5.939896192336559e-06, + "loss": 0.3286, + "step": 15743 + }, + { + "epoch": 0.46, + "grad_norm": 1.3174933090522396, + "learning_rate": 5.939434854016189e-06, + "loss": 0.3137, + "step": 15744 + }, + { + "epoch": 0.46, + "grad_norm": 1.9829459914585512, + "learning_rate": 5.938973507405297e-06, + "loss": 0.3176, + "step": 15745 + }, + { + "epoch": 0.46, + "grad_norm": 1.3451929661568507, + "learning_rate": 5.938512152507952e-06, + "loss": 0.3083, + "step": 15746 + }, + { + "epoch": 0.46, + "grad_norm": 1.431019635577031, + "learning_rate": 5.938050789328229e-06, + "loss": 0.312, + "step": 15747 + }, + { + "epoch": 0.46, + "grad_norm": 1.2528073806734925, + "learning_rate": 5.937589417870196e-06, + "loss": 0.3098, + "step": 15748 + }, + { + "epoch": 0.46, + "grad_norm": 1.2643679681933786, + "learning_rate": 5.937128038137927e-06, + "loss": 0.3087, + "step": 15749 + }, + { + "epoch": 0.46, + "grad_norm": 1.5951030031342834, + "learning_rate": 5.936666650135492e-06, + "loss": 0.3365, + "step": 15750 + }, + { + "epoch": 0.46, + "grad_norm": 1.2798014080061886, + "learning_rate": 5.936205253866962e-06, + "loss": 0.3142, + "step": 15751 + }, + { + "epoch": 0.46, + "grad_norm": 1.5736935510579397, + "learning_rate": 5.935743849336412e-06, + "loss": 0.3228, + "step": 15752 + }, + { + "epoch": 0.46, + "grad_norm": 2.242373126072264, + "learning_rate": 5.935282436547913e-06, + "loss": 0.3177, + "step": 15753 + }, + { + "epoch": 0.46, + "grad_norm": 1.388100266961843, + "learning_rate": 5.9348210155055355e-06, + "loss": 0.2971, + "step": 15754 + }, + { + "epoch": 0.46, + "grad_norm": 1.3812392495306407, + "learning_rate": 5.9343595862133515e-06, + "loss": 0.3309, + "step": 15755 + }, + { + "epoch": 0.46, + "grad_norm": 1.5889344319195997, + "learning_rate": 5.933898148675433e-06, + "loss": 0.3119, + "step": 15756 + }, + { + "epoch": 0.46, + "grad_norm": 1.4887589488797655, + "learning_rate": 5.9334367028958536e-06, + "loss": 0.2844, + "step": 15757 + }, + { + "epoch": 0.46, + "grad_norm": 4.343275601731509, + "learning_rate": 5.932975248878686e-06, + "loss": 0.317, + "step": 15758 + }, + { + "epoch": 0.46, + "grad_norm": 1.8571358040408679, + "learning_rate": 5.9325137866280014e-06, + "loss": 0.348, + "step": 15759 + }, + { + "epoch": 0.46, + "grad_norm": 3.032343188503232, + "learning_rate": 5.9320523161478705e-06, + "loss": 0.3025, + "step": 15760 + }, + { + "epoch": 0.46, + "grad_norm": 1.75917087968463, + "learning_rate": 5.9315908374423695e-06, + "loss": 0.3186, + "step": 15761 + }, + { + "epoch": 0.46, + "grad_norm": 1.2630455618168215, + "learning_rate": 5.931129350515568e-06, + "loss": 0.2966, + "step": 15762 + }, + { + "epoch": 0.46, + "grad_norm": 1.3162102634457018, + "learning_rate": 5.930667855371539e-06, + "loss": 0.3015, + "step": 15763 + }, + { + "epoch": 0.46, + "grad_norm": 1.2346976013796636, + "learning_rate": 5.930206352014357e-06, + "loss": 0.3095, + "step": 15764 + }, + { + "epoch": 0.46, + "grad_norm": 1.2016120366390621, + "learning_rate": 5.9297448404480925e-06, + "loss": 0.3107, + "step": 15765 + }, + { + "epoch": 0.46, + "grad_norm": 1.2653064855461091, + "learning_rate": 5.929283320676821e-06, + "loss": 0.3198, + "step": 15766 + }, + { + "epoch": 0.46, + "grad_norm": 1.355489593832525, + "learning_rate": 5.928821792704613e-06, + "loss": 0.3399, + "step": 15767 + }, + { + "epoch": 0.46, + "grad_norm": 1.3026769300253966, + "learning_rate": 5.928360256535544e-06, + "loss": 0.3157, + "step": 15768 + }, + { + "epoch": 0.46, + "grad_norm": 1.3996581915371427, + "learning_rate": 5.927898712173683e-06, + "loss": 0.306, + "step": 15769 + }, + { + "epoch": 0.46, + "grad_norm": 1.366449089590701, + "learning_rate": 5.927437159623107e-06, + "loss": 0.3059, + "step": 15770 + }, + { + "epoch": 0.46, + "grad_norm": 2.1415076090732654, + "learning_rate": 5.926975598887888e-06, + "loss": 0.2887, + "step": 15771 + }, + { + "epoch": 0.46, + "grad_norm": 0.9431836575181352, + "learning_rate": 5.926514029972098e-06, + "loss": 0.6159, + "step": 15772 + }, + { + "epoch": 0.46, + "grad_norm": 1.2677891594133537, + "learning_rate": 5.926052452879811e-06, + "loss": 0.2966, + "step": 15773 + }, + { + "epoch": 0.46, + "grad_norm": 2.4373104932317164, + "learning_rate": 5.925590867615102e-06, + "loss": 0.3037, + "step": 15774 + }, + { + "epoch": 0.46, + "grad_norm": 1.3299497746077518, + "learning_rate": 5.925129274182042e-06, + "loss": 0.3221, + "step": 15775 + }, + { + "epoch": 0.46, + "grad_norm": 0.9648143484553594, + "learning_rate": 5.924667672584708e-06, + "loss": 0.6614, + "step": 15776 + }, + { + "epoch": 0.46, + "grad_norm": 1.4120308388016756, + "learning_rate": 5.92420606282717e-06, + "loss": 0.3158, + "step": 15777 + }, + { + "epoch": 0.46, + "grad_norm": 1.5787821077572026, + "learning_rate": 5.923744444913504e-06, + "loss": 0.3598, + "step": 15778 + }, + { + "epoch": 0.46, + "grad_norm": 2.2168644902618935, + "learning_rate": 5.923282818847782e-06, + "loss": 0.3154, + "step": 15779 + }, + { + "epoch": 0.46, + "grad_norm": 1.740291636292331, + "learning_rate": 5.92282118463408e-06, + "loss": 0.3521, + "step": 15780 + }, + { + "epoch": 0.46, + "grad_norm": 1.346548240218013, + "learning_rate": 5.92235954227647e-06, + "loss": 0.3211, + "step": 15781 + }, + { + "epoch": 0.46, + "grad_norm": 1.9277767766587277, + "learning_rate": 5.921897891779028e-06, + "loss": 0.3377, + "step": 15782 + }, + { + "epoch": 0.46, + "grad_norm": 1.2903182448957107, + "learning_rate": 5.921436233145826e-06, + "loss": 0.3261, + "step": 15783 + }, + { + "epoch": 0.46, + "grad_norm": 1.4352158129701846, + "learning_rate": 5.9209745663809395e-06, + "loss": 0.2969, + "step": 15784 + }, + { + "epoch": 0.46, + "grad_norm": 1.2890175851268715, + "learning_rate": 5.920512891488441e-06, + "loss": 0.3192, + "step": 15785 + }, + { + "epoch": 0.46, + "grad_norm": 1.5926978598834487, + "learning_rate": 5.920051208472407e-06, + "loss": 0.3055, + "step": 15786 + }, + { + "epoch": 0.46, + "grad_norm": 1.208113101542787, + "learning_rate": 5.919589517336911e-06, + "loss": 0.2985, + "step": 15787 + }, + { + "epoch": 0.46, + "grad_norm": 1.6329270486100114, + "learning_rate": 5.919127818086027e-06, + "loss": 0.3442, + "step": 15788 + }, + { + "epoch": 0.46, + "grad_norm": 1.2136474406349589, + "learning_rate": 5.9186661107238285e-06, + "loss": 0.3057, + "step": 15789 + }, + { + "epoch": 0.46, + "grad_norm": 1.3266780211811995, + "learning_rate": 5.918204395254395e-06, + "loss": 0.33, + "step": 15790 + }, + { + "epoch": 0.46, + "grad_norm": 1.3395358351763935, + "learning_rate": 5.917742671681794e-06, + "loss": 0.3148, + "step": 15791 + }, + { + "epoch": 0.46, + "grad_norm": 1.227902698524698, + "learning_rate": 5.917280940010105e-06, + "loss": 0.3226, + "step": 15792 + }, + { + "epoch": 0.46, + "grad_norm": 1.2615727989153909, + "learning_rate": 5.916819200243401e-06, + "loss": 0.3001, + "step": 15793 + }, + { + "epoch": 0.46, + "grad_norm": 1.3182535431822386, + "learning_rate": 5.916357452385758e-06, + "loss": 0.3277, + "step": 15794 + }, + { + "epoch": 0.46, + "grad_norm": 1.3055181066158235, + "learning_rate": 5.915895696441251e-06, + "loss": 0.309, + "step": 15795 + }, + { + "epoch": 0.46, + "grad_norm": 1.2412925833150246, + "learning_rate": 5.9154339324139535e-06, + "loss": 0.3169, + "step": 15796 + }, + { + "epoch": 0.46, + "grad_norm": 1.3814345720634325, + "learning_rate": 5.914972160307941e-06, + "loss": 0.3152, + "step": 15797 + }, + { + "epoch": 0.46, + "grad_norm": 1.3469853935332579, + "learning_rate": 5.914510380127289e-06, + "loss": 0.3276, + "step": 15798 + }, + { + "epoch": 0.46, + "grad_norm": 1.1850329156923778, + "learning_rate": 5.914048591876074e-06, + "loss": 0.3154, + "step": 15799 + }, + { + "epoch": 0.46, + "grad_norm": 1.338033022724928, + "learning_rate": 5.913586795558369e-06, + "loss": 0.3255, + "step": 15800 + }, + { + "epoch": 0.46, + "grad_norm": 1.3853323792086285, + "learning_rate": 5.91312499117825e-06, + "loss": 0.3113, + "step": 15801 + }, + { + "epoch": 0.46, + "grad_norm": 1.2853924234408456, + "learning_rate": 5.912663178739794e-06, + "loss": 0.3295, + "step": 15802 + }, + { + "epoch": 0.46, + "grad_norm": 1.3562499947030617, + "learning_rate": 5.912201358247075e-06, + "loss": 0.3628, + "step": 15803 + }, + { + "epoch": 0.46, + "grad_norm": 2.0241947764261417, + "learning_rate": 5.911739529704168e-06, + "loss": 0.313, + "step": 15804 + }, + { + "epoch": 0.46, + "grad_norm": 1.3066548367108213, + "learning_rate": 5.911277693115151e-06, + "loss": 0.333, + "step": 15805 + }, + { + "epoch": 0.46, + "grad_norm": 1.979684249048461, + "learning_rate": 5.910815848484096e-06, + "loss": 0.317, + "step": 15806 + }, + { + "epoch": 0.46, + "grad_norm": 1.5411100473678163, + "learning_rate": 5.910353995815082e-06, + "loss": 0.301, + "step": 15807 + }, + { + "epoch": 0.46, + "grad_norm": 1.2298751866548316, + "learning_rate": 5.909892135112186e-06, + "loss": 0.315, + "step": 15808 + }, + { + "epoch": 0.46, + "grad_norm": 1.4461661316079304, + "learning_rate": 5.90943026637948e-06, + "loss": 0.3068, + "step": 15809 + }, + { + "epoch": 0.46, + "grad_norm": 1.278581893411612, + "learning_rate": 5.908968389621043e-06, + "loss": 0.3163, + "step": 15810 + }, + { + "epoch": 0.46, + "grad_norm": 1.326273451077146, + "learning_rate": 5.908506504840949e-06, + "loss": 0.3475, + "step": 15811 + }, + { + "epoch": 0.46, + "grad_norm": 1.2760577684015937, + "learning_rate": 5.908044612043274e-06, + "loss": 0.3154, + "step": 15812 + }, + { + "epoch": 0.46, + "grad_norm": 1.4896770520586655, + "learning_rate": 5.907582711232096e-06, + "loss": 0.3261, + "step": 15813 + }, + { + "epoch": 0.46, + "grad_norm": 1.232327335144656, + "learning_rate": 5.907120802411492e-06, + "loss": 0.3168, + "step": 15814 + }, + { + "epoch": 0.46, + "grad_norm": 2.9346016848923764, + "learning_rate": 5.906658885585535e-06, + "loss": 0.3345, + "step": 15815 + }, + { + "epoch": 0.46, + "grad_norm": 1.308415628630088, + "learning_rate": 5.906196960758302e-06, + "loss": 0.2866, + "step": 15816 + }, + { + "epoch": 0.46, + "grad_norm": 1.5308637033291657, + "learning_rate": 5.9057350279338735e-06, + "loss": 0.3222, + "step": 15817 + }, + { + "epoch": 0.46, + "grad_norm": 1.4482206552812795, + "learning_rate": 5.9052730871163225e-06, + "loss": 0.3259, + "step": 15818 + }, + { + "epoch": 0.46, + "grad_norm": 0.9960340629139962, + "learning_rate": 5.904811138309726e-06, + "loss": 0.5975, + "step": 15819 + }, + { + "epoch": 0.46, + "grad_norm": 1.575675527963002, + "learning_rate": 5.9043491815181605e-06, + "loss": 0.3095, + "step": 15820 + }, + { + "epoch": 0.46, + "grad_norm": 1.7923581238196529, + "learning_rate": 5.9038872167457055e-06, + "loss": 0.3166, + "step": 15821 + }, + { + "epoch": 0.46, + "grad_norm": 1.2569843536012528, + "learning_rate": 5.903425243996435e-06, + "loss": 0.3079, + "step": 15822 + }, + { + "epoch": 0.46, + "grad_norm": 1.8335811845908097, + "learning_rate": 5.902963263274426e-06, + "loss": 0.3196, + "step": 15823 + }, + { + "epoch": 0.46, + "grad_norm": 1.5975938371538634, + "learning_rate": 5.902501274583757e-06, + "loss": 0.3116, + "step": 15824 + }, + { + "epoch": 0.46, + "grad_norm": 1.545898179418107, + "learning_rate": 5.9020392779285044e-06, + "loss": 0.3154, + "step": 15825 + }, + { + "epoch": 0.46, + "grad_norm": 1.6462495383714186, + "learning_rate": 5.901577273312743e-06, + "loss": 0.3179, + "step": 15826 + }, + { + "epoch": 0.46, + "grad_norm": 1.2414974586324592, + "learning_rate": 5.901115260740555e-06, + "loss": 0.2941, + "step": 15827 + }, + { + "epoch": 0.46, + "grad_norm": 1.139236263852912, + "learning_rate": 5.900653240216014e-06, + "loss": 0.3095, + "step": 15828 + }, + { + "epoch": 0.46, + "grad_norm": 1.2884050894222465, + "learning_rate": 5.900191211743197e-06, + "loss": 0.3455, + "step": 15829 + }, + { + "epoch": 0.46, + "grad_norm": 2.155308296568667, + "learning_rate": 5.899729175326183e-06, + "loss": 0.3284, + "step": 15830 + }, + { + "epoch": 0.46, + "grad_norm": 1.264951620977733, + "learning_rate": 5.899267130969049e-06, + "loss": 0.365, + "step": 15831 + }, + { + "epoch": 0.46, + "grad_norm": 1.7107457210432047, + "learning_rate": 5.898805078675873e-06, + "loss": 0.2758, + "step": 15832 + }, + { + "epoch": 0.46, + "grad_norm": 1.4740860732955698, + "learning_rate": 5.8983430184507315e-06, + "loss": 0.3191, + "step": 15833 + }, + { + "epoch": 0.46, + "grad_norm": 1.7774267150551366, + "learning_rate": 5.897880950297704e-06, + "loss": 0.3015, + "step": 15834 + }, + { + "epoch": 0.46, + "grad_norm": 1.3220577661539679, + "learning_rate": 5.897418874220867e-06, + "loss": 0.3145, + "step": 15835 + }, + { + "epoch": 0.46, + "grad_norm": 1.5677007045749158, + "learning_rate": 5.896956790224298e-06, + "loss": 0.322, + "step": 15836 + }, + { + "epoch": 0.46, + "grad_norm": 1.2805663453183789, + "learning_rate": 5.896494698312075e-06, + "loss": 0.31, + "step": 15837 + }, + { + "epoch": 0.46, + "grad_norm": 1.4038987148427222, + "learning_rate": 5.896032598488277e-06, + "loss": 0.3458, + "step": 15838 + }, + { + "epoch": 0.46, + "grad_norm": 1.5808834109495629, + "learning_rate": 5.895570490756983e-06, + "loss": 0.3272, + "step": 15839 + }, + { + "epoch": 0.46, + "grad_norm": 1.3355507514462446, + "learning_rate": 5.8951083751222685e-06, + "loss": 0.3394, + "step": 15840 + }, + { + "epoch": 0.46, + "grad_norm": 1.2628334114170883, + "learning_rate": 5.894646251588211e-06, + "loss": 0.3278, + "step": 15841 + }, + { + "epoch": 0.46, + "grad_norm": 1.1894356655557894, + "learning_rate": 5.894184120158891e-06, + "loss": 0.3179, + "step": 15842 + }, + { + "epoch": 0.46, + "grad_norm": 1.5729005573177905, + "learning_rate": 5.893721980838386e-06, + "loss": 0.306, + "step": 15843 + }, + { + "epoch": 0.46, + "grad_norm": 1.7658777187591834, + "learning_rate": 5.8932598336307755e-06, + "loss": 0.3142, + "step": 15844 + }, + { + "epoch": 0.46, + "grad_norm": 1.7047111061084164, + "learning_rate": 5.892797678540138e-06, + "loss": 0.3453, + "step": 15845 + }, + { + "epoch": 0.46, + "grad_norm": 1.3495599511849952, + "learning_rate": 5.892335515570548e-06, + "loss": 0.3186, + "step": 15846 + }, + { + "epoch": 0.46, + "grad_norm": 1.6112877742778442, + "learning_rate": 5.891873344726089e-06, + "loss": 0.3546, + "step": 15847 + }, + { + "epoch": 0.46, + "grad_norm": 1.6053829277822034, + "learning_rate": 5.891411166010838e-06, + "loss": 0.3395, + "step": 15848 + }, + { + "epoch": 0.46, + "grad_norm": 1.3626643447013103, + "learning_rate": 5.8909489794288745e-06, + "loss": 0.3153, + "step": 15849 + }, + { + "epoch": 0.46, + "grad_norm": 1.4054795772876734, + "learning_rate": 5.890486784984275e-06, + "loss": 0.3281, + "step": 15850 + }, + { + "epoch": 0.46, + "grad_norm": 1.2490011535886139, + "learning_rate": 5.8900245826811195e-06, + "loss": 0.3034, + "step": 15851 + }, + { + "epoch": 0.46, + "grad_norm": 1.4442830696429385, + "learning_rate": 5.889562372523487e-06, + "loss": 0.3246, + "step": 15852 + }, + { + "epoch": 0.46, + "grad_norm": 1.3719737274780808, + "learning_rate": 5.889100154515458e-06, + "loss": 0.357, + "step": 15853 + }, + { + "epoch": 0.46, + "grad_norm": 1.2840676359469283, + "learning_rate": 5.88863792866111e-06, + "loss": 0.3263, + "step": 15854 + }, + { + "epoch": 0.46, + "grad_norm": 1.3160477479225878, + "learning_rate": 5.888175694964522e-06, + "loss": 0.3054, + "step": 15855 + }, + { + "epoch": 0.46, + "grad_norm": 1.3305052553789385, + "learning_rate": 5.887713453429774e-06, + "loss": 0.315, + "step": 15856 + }, + { + "epoch": 0.46, + "grad_norm": 1.2712886821012466, + "learning_rate": 5.887251204060944e-06, + "loss": 0.2987, + "step": 15857 + }, + { + "epoch": 0.46, + "grad_norm": 2.400465626378702, + "learning_rate": 5.886788946862114e-06, + "loss": 0.3301, + "step": 15858 + }, + { + "epoch": 0.46, + "grad_norm": 1.3193254773164105, + "learning_rate": 5.88632668183736e-06, + "loss": 0.3099, + "step": 15859 + }, + { + "epoch": 0.46, + "grad_norm": 0.9383721982863239, + "learning_rate": 5.885864408990764e-06, + "loss": 0.5983, + "step": 15860 + }, + { + "epoch": 0.46, + "grad_norm": 1.2278735183035943, + "learning_rate": 5.885402128326404e-06, + "loss": 0.3146, + "step": 15861 + }, + { + "epoch": 0.46, + "grad_norm": 1.3697611911597836, + "learning_rate": 5.884939839848361e-06, + "loss": 0.315, + "step": 15862 + }, + { + "epoch": 0.46, + "grad_norm": 1.213577693188218, + "learning_rate": 5.884477543560715e-06, + "loss": 0.3252, + "step": 15863 + }, + { + "epoch": 0.46, + "grad_norm": 1.3115840111342232, + "learning_rate": 5.884015239467543e-06, + "loss": 0.3148, + "step": 15864 + }, + { + "epoch": 0.46, + "grad_norm": 1.3579321264171254, + "learning_rate": 5.883552927572928e-06, + "loss": 0.2932, + "step": 15865 + }, + { + "epoch": 0.46, + "grad_norm": 1.2747184763985102, + "learning_rate": 5.883090607880948e-06, + "loss": 0.3268, + "step": 15866 + }, + { + "epoch": 0.46, + "grad_norm": 1.394012022116541, + "learning_rate": 5.882628280395683e-06, + "loss": 0.315, + "step": 15867 + }, + { + "epoch": 0.46, + "grad_norm": 1.2179967113090975, + "learning_rate": 5.882165945121216e-06, + "loss": 0.303, + "step": 15868 + }, + { + "epoch": 0.46, + "grad_norm": 1.571327553454334, + "learning_rate": 5.8817036020616234e-06, + "loss": 0.2955, + "step": 15869 + }, + { + "epoch": 0.46, + "grad_norm": 1.3894995059305097, + "learning_rate": 5.881241251220986e-06, + "loss": 0.3178, + "step": 15870 + }, + { + "epoch": 0.46, + "grad_norm": 1.8880981992302621, + "learning_rate": 5.8807788926033845e-06, + "loss": 0.2935, + "step": 15871 + }, + { + "epoch": 0.46, + "grad_norm": 1.5044262405769122, + "learning_rate": 5.880316526212901e-06, + "loss": 0.3014, + "step": 15872 + }, + { + "epoch": 0.46, + "grad_norm": 1.4825965168638826, + "learning_rate": 5.879854152053615e-06, + "loss": 0.365, + "step": 15873 + }, + { + "epoch": 0.46, + "grad_norm": 1.3632261659185203, + "learning_rate": 5.879391770129604e-06, + "loss": 0.3271, + "step": 15874 + }, + { + "epoch": 0.46, + "grad_norm": 1.2064334858441164, + "learning_rate": 5.878929380444952e-06, + "loss": 0.2801, + "step": 15875 + }, + { + "epoch": 0.46, + "grad_norm": 1.305915585015521, + "learning_rate": 5.878466983003739e-06, + "loss": 0.3243, + "step": 15876 + }, + { + "epoch": 0.46, + "grad_norm": 0.9363335337683787, + "learning_rate": 5.878004577810045e-06, + "loss": 0.6357, + "step": 15877 + }, + { + "epoch": 0.46, + "grad_norm": 1.341738196419447, + "learning_rate": 5.87754216486795e-06, + "loss": 0.3105, + "step": 15878 + }, + { + "epoch": 0.46, + "grad_norm": 1.5083852102902635, + "learning_rate": 5.8770797441815355e-06, + "loss": 0.3148, + "step": 15879 + }, + { + "epoch": 0.46, + "grad_norm": 1.2944247271335898, + "learning_rate": 5.8766173157548825e-06, + "loss": 0.3059, + "step": 15880 + }, + { + "epoch": 0.46, + "grad_norm": 0.9048406692049228, + "learning_rate": 5.876154879592073e-06, + "loss": 0.6457, + "step": 15881 + }, + { + "epoch": 0.46, + "grad_norm": 1.7508372243529244, + "learning_rate": 5.875692435697188e-06, + "loss": 0.3172, + "step": 15882 + }, + { + "epoch": 0.46, + "grad_norm": 1.2395382024102775, + "learning_rate": 5.8752299840743056e-06, + "loss": 0.3012, + "step": 15883 + }, + { + "epoch": 0.46, + "grad_norm": 1.7247868969739384, + "learning_rate": 5.874767524727509e-06, + "loss": 0.3119, + "step": 15884 + }, + { + "epoch": 0.46, + "grad_norm": 1.3984997848932905, + "learning_rate": 5.8743050576608795e-06, + "loss": 0.3128, + "step": 15885 + }, + { + "epoch": 0.46, + "grad_norm": 1.3328983792661513, + "learning_rate": 5.8738425828784995e-06, + "loss": 0.3234, + "step": 15886 + }, + { + "epoch": 0.46, + "grad_norm": 1.3800046071992595, + "learning_rate": 5.873380100384447e-06, + "loss": 0.3349, + "step": 15887 + }, + { + "epoch": 0.46, + "grad_norm": 1.5701004062399453, + "learning_rate": 5.872917610182805e-06, + "loss": 0.3123, + "step": 15888 + }, + { + "epoch": 0.46, + "grad_norm": 1.0155662034630544, + "learning_rate": 5.872455112277657e-06, + "loss": 0.6159, + "step": 15889 + }, + { + "epoch": 0.46, + "grad_norm": 1.3817559935992212, + "learning_rate": 5.871992606673084e-06, + "loss": 0.2852, + "step": 15890 + }, + { + "epoch": 0.46, + "grad_norm": 1.434070036614671, + "learning_rate": 5.871530093373165e-06, + "loss": 0.3214, + "step": 15891 + }, + { + "epoch": 0.46, + "grad_norm": 1.4179756894937832, + "learning_rate": 5.871067572381983e-06, + "loss": 0.3157, + "step": 15892 + }, + { + "epoch": 0.46, + "grad_norm": 1.2908124675436432, + "learning_rate": 5.87060504370362e-06, + "loss": 0.3295, + "step": 15893 + }, + { + "epoch": 0.46, + "grad_norm": 1.5594587866316991, + "learning_rate": 5.870142507342159e-06, + "loss": 0.3062, + "step": 15894 + }, + { + "epoch": 0.46, + "grad_norm": 1.4057501273383275, + "learning_rate": 5.869679963301681e-06, + "loss": 0.3214, + "step": 15895 + }, + { + "epoch": 0.46, + "grad_norm": 1.4513872418538096, + "learning_rate": 5.8692174115862665e-06, + "loss": 0.332, + "step": 15896 + }, + { + "epoch": 0.46, + "grad_norm": 1.3489414988096737, + "learning_rate": 5.868754852199998e-06, + "loss": 0.2973, + "step": 15897 + }, + { + "epoch": 0.46, + "grad_norm": 1.5323134806762186, + "learning_rate": 5.8682922851469594e-06, + "loss": 0.3072, + "step": 15898 + }, + { + "epoch": 0.46, + "grad_norm": 3.2316268245159065, + "learning_rate": 5.867829710431231e-06, + "loss": 0.3217, + "step": 15899 + }, + { + "epoch": 0.46, + "grad_norm": 1.2461935562170119, + "learning_rate": 5.867367128056897e-06, + "loss": 0.3343, + "step": 15900 + }, + { + "epoch": 0.46, + "grad_norm": 1.615239237813311, + "learning_rate": 5.8669045380280375e-06, + "loss": 0.3698, + "step": 15901 + }, + { + "epoch": 0.46, + "grad_norm": 1.4631391060665682, + "learning_rate": 5.866441940348736e-06, + "loss": 0.3349, + "step": 15902 + }, + { + "epoch": 0.46, + "grad_norm": 1.268595002767929, + "learning_rate": 5.865979335023074e-06, + "loss": 0.3296, + "step": 15903 + }, + { + "epoch": 0.46, + "grad_norm": 1.3173140827502798, + "learning_rate": 5.865516722055137e-06, + "loss": 0.317, + "step": 15904 + }, + { + "epoch": 0.46, + "grad_norm": 1.2498125251037036, + "learning_rate": 5.865054101449004e-06, + "loss": 0.3028, + "step": 15905 + }, + { + "epoch": 0.46, + "grad_norm": 1.4417126428737082, + "learning_rate": 5.8645914732087585e-06, + "loss": 0.3227, + "step": 15906 + }, + { + "epoch": 0.46, + "grad_norm": 1.1653520844733034, + "learning_rate": 5.864128837338483e-06, + "loss": 0.3082, + "step": 15907 + }, + { + "epoch": 0.46, + "grad_norm": 2.269374428048611, + "learning_rate": 5.863666193842263e-06, + "loss": 0.3053, + "step": 15908 + }, + { + "epoch": 0.46, + "grad_norm": 1.2202130888012712, + "learning_rate": 5.86320354272418e-06, + "loss": 0.3136, + "step": 15909 + }, + { + "epoch": 0.46, + "grad_norm": 1.4289871343659137, + "learning_rate": 5.862740883988314e-06, + "loss": 0.2937, + "step": 15910 + }, + { + "epoch": 0.46, + "grad_norm": 1.1933295081426483, + "learning_rate": 5.862278217638749e-06, + "loss": 0.3136, + "step": 15911 + }, + { + "epoch": 0.46, + "grad_norm": 1.320603670410757, + "learning_rate": 5.861815543679571e-06, + "loss": 0.3292, + "step": 15912 + }, + { + "epoch": 0.46, + "grad_norm": 1.463354897703673, + "learning_rate": 5.861352862114862e-06, + "loss": 0.3422, + "step": 15913 + }, + { + "epoch": 0.46, + "grad_norm": 1.3373433108282748, + "learning_rate": 5.860890172948704e-06, + "loss": 0.3461, + "step": 15914 + }, + { + "epoch": 0.46, + "grad_norm": 1.533344902085351, + "learning_rate": 5.860427476185178e-06, + "loss": 0.3411, + "step": 15915 + }, + { + "epoch": 0.46, + "grad_norm": 1.2998023347448253, + "learning_rate": 5.859964771828373e-06, + "loss": 0.2944, + "step": 15916 + }, + { + "epoch": 0.46, + "grad_norm": 1.3414318275329946, + "learning_rate": 5.859502059882367e-06, + "loss": 0.3192, + "step": 15917 + }, + { + "epoch": 0.46, + "grad_norm": 5.427396450657292, + "learning_rate": 5.8590393403512494e-06, + "loss": 0.3198, + "step": 15918 + }, + { + "epoch": 0.46, + "grad_norm": 1.524409848097006, + "learning_rate": 5.858576613239096e-06, + "loss": 0.3614, + "step": 15919 + }, + { + "epoch": 0.46, + "grad_norm": 1.244110510405949, + "learning_rate": 5.858113878549997e-06, + "loss": 0.3162, + "step": 15920 + }, + { + "epoch": 0.46, + "grad_norm": 2.1522152772467393, + "learning_rate": 5.857651136288033e-06, + "loss": 0.3374, + "step": 15921 + }, + { + "epoch": 0.46, + "grad_norm": 1.2882861823746279, + "learning_rate": 5.857188386457289e-06, + "loss": 0.2974, + "step": 15922 + }, + { + "epoch": 0.46, + "grad_norm": 1.450307746566392, + "learning_rate": 5.856725629061847e-06, + "loss": 0.3233, + "step": 15923 + }, + { + "epoch": 0.46, + "grad_norm": 1.3780612314159557, + "learning_rate": 5.856262864105792e-06, + "loss": 0.3127, + "step": 15924 + }, + { + "epoch": 0.46, + "grad_norm": 1.435722044200647, + "learning_rate": 5.855800091593206e-06, + "loss": 0.3148, + "step": 15925 + }, + { + "epoch": 0.46, + "grad_norm": 1.4110699737826855, + "learning_rate": 5.8553373115281776e-06, + "loss": 0.3085, + "step": 15926 + }, + { + "epoch": 0.46, + "grad_norm": 1.2874347201023582, + "learning_rate": 5.854874523914787e-06, + "loss": 0.3146, + "step": 15927 + }, + { + "epoch": 0.46, + "grad_norm": 1.901481691951081, + "learning_rate": 5.854411728757119e-06, + "loss": 0.3266, + "step": 15928 + }, + { + "epoch": 0.46, + "grad_norm": 2.917485223219983, + "learning_rate": 5.853948926059257e-06, + "loss": 0.3352, + "step": 15929 + }, + { + "epoch": 0.46, + "grad_norm": 1.368776855572258, + "learning_rate": 5.853486115825287e-06, + "loss": 0.3203, + "step": 15930 + }, + { + "epoch": 0.46, + "grad_norm": 1.3930738053361023, + "learning_rate": 5.853023298059294e-06, + "loss": 0.2998, + "step": 15931 + }, + { + "epoch": 0.46, + "grad_norm": 1.3272614782048968, + "learning_rate": 5.852560472765358e-06, + "loss": 0.3244, + "step": 15932 + }, + { + "epoch": 0.46, + "grad_norm": 1.598321232005411, + "learning_rate": 5.852097639947568e-06, + "loss": 0.3175, + "step": 15933 + }, + { + "epoch": 0.46, + "grad_norm": 1.255514326854933, + "learning_rate": 5.851634799610007e-06, + "loss": 0.2983, + "step": 15934 + }, + { + "epoch": 0.46, + "grad_norm": 1.7342917283715569, + "learning_rate": 5.851171951756759e-06, + "loss": 0.343, + "step": 15935 + }, + { + "epoch": 0.46, + "grad_norm": 1.5749645150330316, + "learning_rate": 5.85070909639191e-06, + "loss": 0.3041, + "step": 15936 + }, + { + "epoch": 0.46, + "grad_norm": 1.3517506568783162, + "learning_rate": 5.850246233519544e-06, + "loss": 0.3139, + "step": 15937 + }, + { + "epoch": 0.46, + "grad_norm": 1.280007495425998, + "learning_rate": 5.849783363143745e-06, + "loss": 0.316, + "step": 15938 + }, + { + "epoch": 0.46, + "grad_norm": 1.2618140639233144, + "learning_rate": 5.849320485268597e-06, + "loss": 0.2909, + "step": 15939 + }, + { + "epoch": 0.46, + "grad_norm": 1.7426508819278654, + "learning_rate": 5.848857599898188e-06, + "loss": 0.2936, + "step": 15940 + }, + { + "epoch": 0.46, + "grad_norm": 1.3583402505759175, + "learning_rate": 5.848394707036601e-06, + "loss": 0.3385, + "step": 15941 + }, + { + "epoch": 0.46, + "grad_norm": 1.4520269655193232, + "learning_rate": 5.847931806687921e-06, + "loss": 0.35, + "step": 15942 + }, + { + "epoch": 0.46, + "grad_norm": 1.3009160186200137, + "learning_rate": 5.847468898856233e-06, + "loss": 0.3155, + "step": 15943 + }, + { + "epoch": 0.46, + "grad_norm": 1.4181988088677562, + "learning_rate": 5.8470059835456225e-06, + "loss": 0.3096, + "step": 15944 + }, + { + "epoch": 0.46, + "grad_norm": 1.4248467820991144, + "learning_rate": 5.846543060760176e-06, + "loss": 0.3222, + "step": 15945 + }, + { + "epoch": 0.46, + "grad_norm": 0.9663999031001258, + "learning_rate": 5.846080130503977e-06, + "loss": 0.5897, + "step": 15946 + }, + { + "epoch": 0.46, + "grad_norm": 1.2532571065007594, + "learning_rate": 5.845617192781111e-06, + "loss": 0.3291, + "step": 15947 + }, + { + "epoch": 0.46, + "grad_norm": 1.4118396597073124, + "learning_rate": 5.845154247595663e-06, + "loss": 0.3196, + "step": 15948 + }, + { + "epoch": 0.46, + "grad_norm": 1.2672775653465678, + "learning_rate": 5.844691294951722e-06, + "loss": 0.2991, + "step": 15949 + }, + { + "epoch": 0.46, + "grad_norm": 1.2675937166515512, + "learning_rate": 5.844228334853369e-06, + "loss": 0.3188, + "step": 15950 + }, + { + "epoch": 0.46, + "grad_norm": 1.3005592259732845, + "learning_rate": 5.843765367304693e-06, + "loss": 0.3036, + "step": 15951 + }, + { + "epoch": 0.46, + "grad_norm": 1.380119615163321, + "learning_rate": 5.8433023923097775e-06, + "loss": 0.3235, + "step": 15952 + }, + { + "epoch": 0.46, + "grad_norm": 1.7080370800104818, + "learning_rate": 5.8428394098727085e-06, + "loss": 0.2993, + "step": 15953 + }, + { + "epoch": 0.46, + "grad_norm": 0.9505049925619453, + "learning_rate": 5.842376419997574e-06, + "loss": 0.6316, + "step": 15954 + }, + { + "epoch": 0.46, + "grad_norm": 1.8024377622149448, + "learning_rate": 5.841913422688456e-06, + "loss": 0.3166, + "step": 15955 + }, + { + "epoch": 0.46, + "grad_norm": 1.705912120818066, + "learning_rate": 5.841450417949443e-06, + "loss": 0.3617, + "step": 15956 + }, + { + "epoch": 0.46, + "grad_norm": 1.5046603943032644, + "learning_rate": 5.8409874057846215e-06, + "loss": 0.3308, + "step": 15957 + }, + { + "epoch": 0.46, + "grad_norm": 1.3921661759716955, + "learning_rate": 5.840524386198077e-06, + "loss": 0.32, + "step": 15958 + }, + { + "epoch": 0.46, + "grad_norm": 1.6295962050952302, + "learning_rate": 5.840061359193896e-06, + "loss": 0.3487, + "step": 15959 + }, + { + "epoch": 0.46, + "grad_norm": 1.3318800363810805, + "learning_rate": 5.839598324776164e-06, + "loss": 0.326, + "step": 15960 + }, + { + "epoch": 0.46, + "grad_norm": 1.541248927282467, + "learning_rate": 5.839135282948965e-06, + "loss": 0.3165, + "step": 15961 + }, + { + "epoch": 0.46, + "grad_norm": 1.2158356082933428, + "learning_rate": 5.83867223371639e-06, + "loss": 0.3069, + "step": 15962 + }, + { + "epoch": 0.46, + "grad_norm": 1.2516409486426885, + "learning_rate": 5.838209177082525e-06, + "loss": 0.3197, + "step": 15963 + }, + { + "epoch": 0.46, + "grad_norm": 1.3857607262706797, + "learning_rate": 5.837746113051451e-06, + "loss": 0.3259, + "step": 15964 + }, + { + "epoch": 0.46, + "grad_norm": 1.1961048439491806, + "learning_rate": 5.8372830416272616e-06, + "loss": 0.285, + "step": 15965 + }, + { + "epoch": 0.46, + "grad_norm": 1.493299304865073, + "learning_rate": 5.836819962814038e-06, + "loss": 0.3145, + "step": 15966 + }, + { + "epoch": 0.46, + "grad_norm": 1.2142470645481718, + "learning_rate": 5.836356876615869e-06, + "loss": 0.2946, + "step": 15967 + }, + { + "epoch": 0.46, + "grad_norm": 1.415405745083342, + "learning_rate": 5.835893783036842e-06, + "loss": 0.3259, + "step": 15968 + }, + { + "epoch": 0.46, + "grad_norm": 0.9236804210606051, + "learning_rate": 5.835430682081043e-06, + "loss": 0.6131, + "step": 15969 + }, + { + "epoch": 0.46, + "grad_norm": 1.4267578592552015, + "learning_rate": 5.834967573752558e-06, + "loss": 0.336, + "step": 15970 + }, + { + "epoch": 0.46, + "grad_norm": 1.2293059853433121, + "learning_rate": 5.834504458055475e-06, + "loss": 0.3168, + "step": 15971 + }, + { + "epoch": 0.46, + "grad_norm": 1.2145760713457725, + "learning_rate": 5.834041334993883e-06, + "loss": 0.3149, + "step": 15972 + }, + { + "epoch": 0.46, + "grad_norm": 1.2416689783203634, + "learning_rate": 5.833578204571865e-06, + "loss": 0.3294, + "step": 15973 + }, + { + "epoch": 0.46, + "grad_norm": 1.3211187486903826, + "learning_rate": 5.83311506679351e-06, + "loss": 0.3099, + "step": 15974 + }, + { + "epoch": 0.46, + "grad_norm": 1.2490698646103446, + "learning_rate": 5.832651921662906e-06, + "loss": 0.3202, + "step": 15975 + }, + { + "epoch": 0.46, + "grad_norm": 1.297654539037566, + "learning_rate": 5.832188769184139e-06, + "loss": 0.3082, + "step": 15976 + }, + { + "epoch": 0.46, + "grad_norm": 1.4831190844131579, + "learning_rate": 5.831725609361298e-06, + "loss": 0.3325, + "step": 15977 + }, + { + "epoch": 0.46, + "grad_norm": 1.9741320877284745, + "learning_rate": 5.831262442198469e-06, + "loss": 0.3096, + "step": 15978 + }, + { + "epoch": 0.46, + "grad_norm": 1.3145556809334724, + "learning_rate": 5.830799267699739e-06, + "loss": 0.2898, + "step": 15979 + }, + { + "epoch": 0.46, + "grad_norm": 1.7451570499523263, + "learning_rate": 5.8303360858691975e-06, + "loss": 0.3222, + "step": 15980 + }, + { + "epoch": 0.46, + "grad_norm": 1.6037869068962967, + "learning_rate": 5.829872896710929e-06, + "loss": 0.3455, + "step": 15981 + }, + { + "epoch": 0.46, + "grad_norm": 1.3931001726227317, + "learning_rate": 5.829409700229024e-06, + "loss": 0.3314, + "step": 15982 + }, + { + "epoch": 0.46, + "grad_norm": 1.4435907333367202, + "learning_rate": 5.828946496427569e-06, + "loss": 0.3606, + "step": 15983 + }, + { + "epoch": 0.46, + "grad_norm": 1.728464789404172, + "learning_rate": 5.828483285310652e-06, + "loss": 0.2996, + "step": 15984 + }, + { + "epoch": 0.46, + "grad_norm": 0.9208547055309673, + "learning_rate": 5.828020066882361e-06, + "loss": 0.5824, + "step": 15985 + }, + { + "epoch": 0.46, + "grad_norm": 1.3796554773740255, + "learning_rate": 5.827556841146784e-06, + "loss": 0.3275, + "step": 15986 + }, + { + "epoch": 0.46, + "grad_norm": 1.3456580323931329, + "learning_rate": 5.827093608108008e-06, + "loss": 0.3257, + "step": 15987 + }, + { + "epoch": 0.46, + "grad_norm": 0.9823153117572029, + "learning_rate": 5.826630367770121e-06, + "loss": 0.6164, + "step": 15988 + }, + { + "epoch": 0.46, + "grad_norm": 1.909301035523192, + "learning_rate": 5.826167120137213e-06, + "loss": 0.3281, + "step": 15989 + }, + { + "epoch": 0.46, + "grad_norm": 1.3205192088641586, + "learning_rate": 5.825703865213371e-06, + "loss": 0.3193, + "step": 15990 + }, + { + "epoch": 0.46, + "grad_norm": 1.2990084177121766, + "learning_rate": 5.825240603002683e-06, + "loss": 0.3348, + "step": 15991 + }, + { + "epoch": 0.46, + "grad_norm": 1.3036331245375934, + "learning_rate": 5.8247773335092374e-06, + "loss": 0.3187, + "step": 15992 + }, + { + "epoch": 0.46, + "grad_norm": 1.5386429050384391, + "learning_rate": 5.824314056737124e-06, + "loss": 0.3114, + "step": 15993 + }, + { + "epoch": 0.46, + "grad_norm": 2.3583293039314355, + "learning_rate": 5.823850772690428e-06, + "loss": 0.3149, + "step": 15994 + }, + { + "epoch": 0.46, + "grad_norm": 1.2613352421919015, + "learning_rate": 5.823387481373241e-06, + "loss": 0.3328, + "step": 15995 + }, + { + "epoch": 0.46, + "grad_norm": 1.2659183354527332, + "learning_rate": 5.82292418278965e-06, + "loss": 0.3313, + "step": 15996 + }, + { + "epoch": 0.46, + "grad_norm": 1.320727696627639, + "learning_rate": 5.822460876943744e-06, + "loss": 0.3287, + "step": 15997 + }, + { + "epoch": 0.46, + "grad_norm": 1.2429210842758407, + "learning_rate": 5.821997563839611e-06, + "loss": 0.3187, + "step": 15998 + }, + { + "epoch": 0.46, + "grad_norm": 1.279982836248164, + "learning_rate": 5.821534243481341e-06, + "loss": 0.3146, + "step": 15999 + }, + { + "epoch": 0.46, + "grad_norm": 1.4712329218239337, + "learning_rate": 5.821070915873023e-06, + "loss": 0.3061, + "step": 16000 + }, + { + "epoch": 0.46, + "grad_norm": 1.2480165399729755, + "learning_rate": 5.820607581018743e-06, + "loss": 0.3067, + "step": 16001 + }, + { + "epoch": 0.46, + "grad_norm": 1.3352464629158438, + "learning_rate": 5.820144238922593e-06, + "loss": 0.2978, + "step": 16002 + }, + { + "epoch": 0.46, + "grad_norm": 1.2944586410519803, + "learning_rate": 5.819680889588662e-06, + "loss": 0.3264, + "step": 16003 + }, + { + "epoch": 0.46, + "grad_norm": 1.5384410848427825, + "learning_rate": 5.819217533021036e-06, + "loss": 0.2948, + "step": 16004 + }, + { + "epoch": 0.46, + "grad_norm": 1.3181503672540236, + "learning_rate": 5.818754169223808e-06, + "loss": 0.3311, + "step": 16005 + }, + { + "epoch": 0.46, + "grad_norm": 1.335689333930093, + "learning_rate": 5.818290798201064e-06, + "loss": 0.3303, + "step": 16006 + }, + { + "epoch": 0.46, + "grad_norm": 1.1847206556160075, + "learning_rate": 5.8178274199568966e-06, + "loss": 0.2869, + "step": 16007 + }, + { + "epoch": 0.46, + "grad_norm": 1.1688333136666473, + "learning_rate": 5.817364034495392e-06, + "loss": 0.3028, + "step": 16008 + }, + { + "epoch": 0.46, + "grad_norm": 1.299013644534175, + "learning_rate": 5.8169006418206405e-06, + "loss": 0.3132, + "step": 16009 + }, + { + "epoch": 0.46, + "grad_norm": 1.375456746243459, + "learning_rate": 5.816437241936732e-06, + "loss": 0.3452, + "step": 16010 + }, + { + "epoch": 0.46, + "grad_norm": 1.3104364389006125, + "learning_rate": 5.815973834847756e-06, + "loss": 0.301, + "step": 16011 + }, + { + "epoch": 0.46, + "grad_norm": 1.2794526321170427, + "learning_rate": 5.815510420557801e-06, + "loss": 0.3157, + "step": 16012 + }, + { + "epoch": 0.46, + "grad_norm": 1.2800011909640987, + "learning_rate": 5.815046999070957e-06, + "loss": 0.3304, + "step": 16013 + }, + { + "epoch": 0.46, + "grad_norm": 1.2704525546567158, + "learning_rate": 5.814583570391317e-06, + "loss": 0.3222, + "step": 16014 + }, + { + "epoch": 0.46, + "grad_norm": 1.2580789615205734, + "learning_rate": 5.8141201345229644e-06, + "loss": 0.3106, + "step": 16015 + }, + { + "epoch": 0.46, + "grad_norm": 1.245829628898441, + "learning_rate": 5.8136566914699945e-06, + "loss": 0.3095, + "step": 16016 + }, + { + "epoch": 0.46, + "grad_norm": 1.2625234452794514, + "learning_rate": 5.813193241236496e-06, + "loss": 0.3343, + "step": 16017 + }, + { + "epoch": 0.46, + "grad_norm": 1.2542402318078651, + "learning_rate": 5.812729783826557e-06, + "loss": 0.3116, + "step": 16018 + }, + { + "epoch": 0.46, + "grad_norm": 1.1924059819302937, + "learning_rate": 5.812266319244269e-06, + "loss": 0.3073, + "step": 16019 + }, + { + "epoch": 0.46, + "grad_norm": 1.3201459041772479, + "learning_rate": 5.811802847493722e-06, + "loss": 0.32, + "step": 16020 + }, + { + "epoch": 0.46, + "grad_norm": 1.468206282876256, + "learning_rate": 5.811339368579006e-06, + "loss": 0.3142, + "step": 16021 + }, + { + "epoch": 0.46, + "grad_norm": 2.235447236256222, + "learning_rate": 5.81087588250421e-06, + "loss": 0.3301, + "step": 16022 + }, + { + "epoch": 0.46, + "grad_norm": 1.2224569725967802, + "learning_rate": 5.810412389273427e-06, + "loss": 0.3057, + "step": 16023 + }, + { + "epoch": 0.46, + "grad_norm": 1.8333918102584836, + "learning_rate": 5.8099488888907445e-06, + "loss": 0.3342, + "step": 16024 + }, + { + "epoch": 0.46, + "grad_norm": 1.8645473189496389, + "learning_rate": 5.809485381360253e-06, + "loss": 0.3175, + "step": 16025 + }, + { + "epoch": 0.46, + "grad_norm": 1.2636513603069002, + "learning_rate": 5.809021866686044e-06, + "loss": 0.3042, + "step": 16026 + }, + { + "epoch": 0.46, + "grad_norm": 1.290797464005378, + "learning_rate": 5.80855834487221e-06, + "loss": 0.3054, + "step": 16027 + }, + { + "epoch": 0.46, + "grad_norm": 1.5173486023387333, + "learning_rate": 5.808094815922839e-06, + "loss": 0.3131, + "step": 16028 + }, + { + "epoch": 0.46, + "grad_norm": 1.2756563729470325, + "learning_rate": 5.807631279842022e-06, + "loss": 0.3183, + "step": 16029 + }, + { + "epoch": 0.46, + "grad_norm": 1.3609743379648918, + "learning_rate": 5.80716773663385e-06, + "loss": 0.3214, + "step": 16030 + }, + { + "epoch": 0.46, + "grad_norm": 1.2197010801897643, + "learning_rate": 5.806704186302413e-06, + "loss": 0.3036, + "step": 16031 + }, + { + "epoch": 0.47, + "grad_norm": 1.2504778874536202, + "learning_rate": 5.806240628851805e-06, + "loss": 0.3204, + "step": 16032 + }, + { + "epoch": 0.47, + "grad_norm": 1.3406031981304518, + "learning_rate": 5.805777064286112e-06, + "loss": 0.3262, + "step": 16033 + }, + { + "epoch": 0.47, + "grad_norm": 1.5082772943221772, + "learning_rate": 5.805313492609429e-06, + "loss": 0.3345, + "step": 16034 + }, + { + "epoch": 0.47, + "grad_norm": 1.5277668871100611, + "learning_rate": 5.804849913825845e-06, + "loss": 0.3347, + "step": 16035 + }, + { + "epoch": 0.47, + "grad_norm": 1.3602167547235555, + "learning_rate": 5.804386327939452e-06, + "loss": 0.3222, + "step": 16036 + }, + { + "epoch": 0.47, + "grad_norm": 1.2601693353559984, + "learning_rate": 5.803922734954341e-06, + "loss": 0.3524, + "step": 16037 + }, + { + "epoch": 0.47, + "grad_norm": 1.1922450535560212, + "learning_rate": 5.803459134874602e-06, + "loss": 0.2876, + "step": 16038 + }, + { + "epoch": 0.47, + "grad_norm": 1.3366511721590173, + "learning_rate": 5.802995527704327e-06, + "loss": 0.2978, + "step": 16039 + }, + { + "epoch": 0.47, + "grad_norm": 1.4520274489594474, + "learning_rate": 5.802531913447608e-06, + "loss": 0.3596, + "step": 16040 + }, + { + "epoch": 0.47, + "grad_norm": 1.2219576941292802, + "learning_rate": 5.802068292108537e-06, + "loss": 0.3166, + "step": 16041 + }, + { + "epoch": 0.47, + "grad_norm": 1.3279741453147529, + "learning_rate": 5.801604663691203e-06, + "loss": 0.3095, + "step": 16042 + }, + { + "epoch": 0.47, + "grad_norm": 1.2463697547391597, + "learning_rate": 5.801141028199699e-06, + "loss": 0.3036, + "step": 16043 + }, + { + "epoch": 0.47, + "grad_norm": 2.637847907982155, + "learning_rate": 5.800677385638116e-06, + "loss": 0.3712, + "step": 16044 + }, + { + "epoch": 0.47, + "grad_norm": 1.3570281737382586, + "learning_rate": 5.800213736010548e-06, + "loss": 0.3385, + "step": 16045 + }, + { + "epoch": 0.47, + "grad_norm": 1.5073041880821707, + "learning_rate": 5.799750079321084e-06, + "loss": 0.3415, + "step": 16046 + }, + { + "epoch": 0.47, + "grad_norm": 1.3394884854676452, + "learning_rate": 5.799286415573816e-06, + "loss": 0.3271, + "step": 16047 + }, + { + "epoch": 0.47, + "grad_norm": 0.941139370524451, + "learning_rate": 5.798822744772838e-06, + "loss": 0.5691, + "step": 16048 + }, + { + "epoch": 0.47, + "grad_norm": 1.3553740325082408, + "learning_rate": 5.79835906692224e-06, + "loss": 0.3042, + "step": 16049 + }, + { + "epoch": 0.47, + "grad_norm": 1.322771559133041, + "learning_rate": 5.797895382026113e-06, + "loss": 0.3139, + "step": 16050 + }, + { + "epoch": 0.47, + "grad_norm": 1.2419215791769833, + "learning_rate": 5.797431690088553e-06, + "loss": 0.3041, + "step": 16051 + }, + { + "epoch": 0.47, + "grad_norm": 1.3917858561141174, + "learning_rate": 5.796967991113647e-06, + "loss": 0.3305, + "step": 16052 + }, + { + "epoch": 0.47, + "grad_norm": 1.4000200318232652, + "learning_rate": 5.79650428510549e-06, + "loss": 0.3306, + "step": 16053 + }, + { + "epoch": 0.47, + "grad_norm": 1.260662259403248, + "learning_rate": 5.796040572068175e-06, + "loss": 0.346, + "step": 16054 + }, + { + "epoch": 0.47, + "grad_norm": 1.3874355407334316, + "learning_rate": 5.795576852005793e-06, + "loss": 0.2999, + "step": 16055 + }, + { + "epoch": 0.47, + "grad_norm": 1.218096152614586, + "learning_rate": 5.795113124922435e-06, + "loss": 0.3015, + "step": 16056 + }, + { + "epoch": 0.47, + "grad_norm": 1.2362626599026727, + "learning_rate": 5.794649390822196e-06, + "loss": 0.3123, + "step": 16057 + }, + { + "epoch": 0.47, + "grad_norm": 1.5452768623491173, + "learning_rate": 5.794185649709167e-06, + "loss": 0.3426, + "step": 16058 + }, + { + "epoch": 0.47, + "grad_norm": 1.3025381222806434, + "learning_rate": 5.793721901587441e-06, + "loss": 0.3271, + "step": 16059 + }, + { + "epoch": 0.47, + "grad_norm": 1.7698807658149283, + "learning_rate": 5.79325814646111e-06, + "loss": 0.337, + "step": 16060 + }, + { + "epoch": 0.47, + "grad_norm": 1.35065589938081, + "learning_rate": 5.792794384334267e-06, + "loss": 0.3211, + "step": 16061 + }, + { + "epoch": 0.47, + "grad_norm": 1.1962020606745425, + "learning_rate": 5.792330615211005e-06, + "loss": 0.2909, + "step": 16062 + }, + { + "epoch": 0.47, + "grad_norm": 1.23769728310122, + "learning_rate": 5.791866839095418e-06, + "loss": 0.3106, + "step": 16063 + }, + { + "epoch": 0.47, + "grad_norm": 1.4343094543860422, + "learning_rate": 5.791403055991597e-06, + "loss": 0.3038, + "step": 16064 + }, + { + "epoch": 0.47, + "grad_norm": 1.5375083157872231, + "learning_rate": 5.790939265903634e-06, + "loss": 0.3385, + "step": 16065 + }, + { + "epoch": 0.47, + "grad_norm": 1.266654460320475, + "learning_rate": 5.790475468835623e-06, + "loss": 0.3186, + "step": 16066 + }, + { + "epoch": 0.47, + "grad_norm": 1.5139002189168005, + "learning_rate": 5.790011664791657e-06, + "loss": 0.325, + "step": 16067 + }, + { + "epoch": 0.47, + "grad_norm": 1.3072425976068676, + "learning_rate": 5.78954785377583e-06, + "loss": 0.3222, + "step": 16068 + }, + { + "epoch": 0.47, + "grad_norm": 1.4451945177140302, + "learning_rate": 5.789084035792235e-06, + "loss": 0.3263, + "step": 16069 + }, + { + "epoch": 0.47, + "grad_norm": 1.3327205610319341, + "learning_rate": 5.788620210844964e-06, + "loss": 0.3287, + "step": 16070 + }, + { + "epoch": 0.47, + "grad_norm": 1.366215306115359, + "learning_rate": 5.78815637893811e-06, + "loss": 0.3162, + "step": 16071 + }, + { + "epoch": 0.47, + "grad_norm": 1.3542656021088826, + "learning_rate": 5.787692540075769e-06, + "loss": 0.3088, + "step": 16072 + }, + { + "epoch": 0.47, + "grad_norm": 1.2772046600129925, + "learning_rate": 5.7872286942620326e-06, + "loss": 0.3321, + "step": 16073 + }, + { + "epoch": 0.47, + "grad_norm": 1.3983816406613288, + "learning_rate": 5.786764841500992e-06, + "loss": 0.3681, + "step": 16074 + }, + { + "epoch": 0.47, + "grad_norm": 1.2393619288028428, + "learning_rate": 5.786300981796744e-06, + "loss": 0.299, + "step": 16075 + }, + { + "epoch": 0.47, + "grad_norm": 1.2896127420995527, + "learning_rate": 5.785837115153382e-06, + "loss": 0.3147, + "step": 16076 + }, + { + "epoch": 0.47, + "grad_norm": 1.3767862166202085, + "learning_rate": 5.7853732415749985e-06, + "loss": 0.3108, + "step": 16077 + }, + { + "epoch": 0.47, + "grad_norm": 1.183076509914222, + "learning_rate": 5.784909361065688e-06, + "loss": 0.3031, + "step": 16078 + }, + { + "epoch": 0.47, + "grad_norm": 1.3002914168049553, + "learning_rate": 5.784445473629543e-06, + "loss": 0.287, + "step": 16079 + }, + { + "epoch": 0.47, + "grad_norm": 1.4097838736138602, + "learning_rate": 5.783981579270658e-06, + "loss": 0.3194, + "step": 16080 + }, + { + "epoch": 0.47, + "grad_norm": 1.3990907670717376, + "learning_rate": 5.783517677993126e-06, + "loss": 0.3211, + "step": 16081 + }, + { + "epoch": 0.47, + "grad_norm": 1.0138568426089283, + "learning_rate": 5.7830537698010425e-06, + "loss": 0.5775, + "step": 16082 + }, + { + "epoch": 0.47, + "grad_norm": 1.3762719503225953, + "learning_rate": 5.782589854698502e-06, + "loss": 0.3316, + "step": 16083 + }, + { + "epoch": 0.47, + "grad_norm": 1.2237079590243949, + "learning_rate": 5.7821259326895955e-06, + "loss": 0.307, + "step": 16084 + }, + { + "epoch": 0.47, + "grad_norm": 1.303151630904664, + "learning_rate": 5.781662003778419e-06, + "loss": 0.3388, + "step": 16085 + }, + { + "epoch": 0.47, + "grad_norm": 1.549296296597717, + "learning_rate": 5.781198067969067e-06, + "loss": 0.313, + "step": 16086 + }, + { + "epoch": 0.47, + "grad_norm": 1.4857689582078677, + "learning_rate": 5.780734125265633e-06, + "loss": 0.2988, + "step": 16087 + }, + { + "epoch": 0.47, + "grad_norm": 1.2893407641500703, + "learning_rate": 5.780270175672213e-06, + "loss": 0.2958, + "step": 16088 + }, + { + "epoch": 0.47, + "grad_norm": 1.2628742350540618, + "learning_rate": 5.7798062191929e-06, + "loss": 0.3115, + "step": 16089 + }, + { + "epoch": 0.47, + "grad_norm": 1.678681060022026, + "learning_rate": 5.779342255831787e-06, + "loss": 0.3216, + "step": 16090 + }, + { + "epoch": 0.47, + "grad_norm": 1.3142051517885118, + "learning_rate": 5.778878285592971e-06, + "loss": 0.3096, + "step": 16091 + }, + { + "epoch": 0.47, + "grad_norm": 1.2508471520640705, + "learning_rate": 5.778414308480547e-06, + "loss": 0.3498, + "step": 16092 + }, + { + "epoch": 0.47, + "grad_norm": 1.6437210663916098, + "learning_rate": 5.777950324498605e-06, + "loss": 0.3268, + "step": 16093 + }, + { + "epoch": 0.47, + "grad_norm": 1.3205382099018117, + "learning_rate": 5.777486333651244e-06, + "loss": 0.37, + "step": 16094 + }, + { + "epoch": 0.47, + "grad_norm": 1.4047311306255155, + "learning_rate": 5.777022335942557e-06, + "loss": 0.329, + "step": 16095 + }, + { + "epoch": 0.47, + "grad_norm": 1.2853023326591704, + "learning_rate": 5.7765583313766404e-06, + "loss": 0.3403, + "step": 16096 + }, + { + "epoch": 0.47, + "grad_norm": 1.4279824377000563, + "learning_rate": 5.776094319957588e-06, + "loss": 0.3103, + "step": 16097 + }, + { + "epoch": 0.47, + "grad_norm": 1.3683548144905078, + "learning_rate": 5.775630301689493e-06, + "loss": 0.3536, + "step": 16098 + }, + { + "epoch": 0.47, + "grad_norm": 1.245179697794467, + "learning_rate": 5.775166276576453e-06, + "loss": 0.3161, + "step": 16099 + }, + { + "epoch": 0.47, + "grad_norm": 0.9643235053074993, + "learning_rate": 5.774702244622563e-06, + "loss": 0.5716, + "step": 16100 + }, + { + "epoch": 0.47, + "grad_norm": 1.2786398783835333, + "learning_rate": 5.774238205831916e-06, + "loss": 0.3064, + "step": 16101 + }, + { + "epoch": 0.47, + "grad_norm": 1.5013489643313889, + "learning_rate": 5.773774160208607e-06, + "loss": 0.3097, + "step": 16102 + }, + { + "epoch": 0.47, + "grad_norm": 1.3850079095339531, + "learning_rate": 5.773310107756735e-06, + "loss": 0.3503, + "step": 16103 + }, + { + "epoch": 0.47, + "grad_norm": 1.5331333659810182, + "learning_rate": 5.772846048480391e-06, + "loss": 0.3197, + "step": 16104 + }, + { + "epoch": 0.47, + "grad_norm": 1.391311392424034, + "learning_rate": 5.772381982383674e-06, + "loss": 0.3077, + "step": 16105 + }, + { + "epoch": 0.47, + "grad_norm": 3.299693945288217, + "learning_rate": 5.771917909470675e-06, + "loss": 0.2944, + "step": 16106 + }, + { + "epoch": 0.47, + "grad_norm": 1.576771770483922, + "learning_rate": 5.771453829745492e-06, + "loss": 0.3574, + "step": 16107 + }, + { + "epoch": 0.47, + "grad_norm": 1.512548224168419, + "learning_rate": 5.770989743212222e-06, + "loss": 0.307, + "step": 16108 + }, + { + "epoch": 0.47, + "grad_norm": 1.350896475059535, + "learning_rate": 5.770525649874957e-06, + "loss": 0.3229, + "step": 16109 + }, + { + "epoch": 0.47, + "grad_norm": 1.3243587757434774, + "learning_rate": 5.770061549737797e-06, + "loss": 0.3113, + "step": 16110 + }, + { + "epoch": 0.47, + "grad_norm": 1.3814847031922826, + "learning_rate": 5.769597442804834e-06, + "loss": 0.3002, + "step": 16111 + }, + { + "epoch": 0.47, + "grad_norm": 1.4680111749473865, + "learning_rate": 5.769133329080165e-06, + "loss": 0.33, + "step": 16112 + }, + { + "epoch": 0.47, + "grad_norm": 1.7719432079116446, + "learning_rate": 5.768669208567885e-06, + "loss": 0.3087, + "step": 16113 + }, + { + "epoch": 0.47, + "grad_norm": 1.4531053911683829, + "learning_rate": 5.7682050812720935e-06, + "loss": 0.3341, + "step": 16114 + }, + { + "epoch": 0.47, + "grad_norm": 1.6643239567563057, + "learning_rate": 5.767740947196881e-06, + "loss": 0.307, + "step": 16115 + }, + { + "epoch": 0.47, + "grad_norm": 1.368571908039474, + "learning_rate": 5.767276806346347e-06, + "loss": 0.3158, + "step": 16116 + }, + { + "epoch": 0.47, + "grad_norm": 1.3974420096884508, + "learning_rate": 5.766812658724586e-06, + "loss": 0.3267, + "step": 16117 + }, + { + "epoch": 0.47, + "grad_norm": 1.387976983272064, + "learning_rate": 5.766348504335696e-06, + "loss": 0.3239, + "step": 16118 + }, + { + "epoch": 0.47, + "grad_norm": 1.513447107779021, + "learning_rate": 5.765884343183772e-06, + "loss": 0.3334, + "step": 16119 + }, + { + "epoch": 0.47, + "grad_norm": 1.7935762134385298, + "learning_rate": 5.76542017527291e-06, + "loss": 0.305, + "step": 16120 + }, + { + "epoch": 0.47, + "grad_norm": 2.087745283790489, + "learning_rate": 5.764956000607204e-06, + "loss": 0.3016, + "step": 16121 + }, + { + "epoch": 0.47, + "grad_norm": 1.6099816745311657, + "learning_rate": 5.764491819190757e-06, + "loss": 0.3173, + "step": 16122 + }, + { + "epoch": 0.47, + "grad_norm": 1.3515929173909536, + "learning_rate": 5.764027631027659e-06, + "loss": 0.3348, + "step": 16123 + }, + { + "epoch": 0.47, + "grad_norm": 1.2456664069095857, + "learning_rate": 5.7635634361220084e-06, + "loss": 0.3152, + "step": 16124 + }, + { + "epoch": 0.47, + "grad_norm": 1.371968951154205, + "learning_rate": 5.763099234477901e-06, + "loss": 0.2969, + "step": 16125 + }, + { + "epoch": 0.47, + "grad_norm": 1.2519810170376748, + "learning_rate": 5.762635026099436e-06, + "loss": 0.3218, + "step": 16126 + }, + { + "epoch": 0.47, + "grad_norm": 1.7848398671820291, + "learning_rate": 5.762170810990708e-06, + "loss": 0.3375, + "step": 16127 + }, + { + "epoch": 0.47, + "grad_norm": 1.2893426427901724, + "learning_rate": 5.761706589155815e-06, + "loss": 0.3251, + "step": 16128 + }, + { + "epoch": 0.47, + "grad_norm": 1.1864498501778795, + "learning_rate": 5.761242360598851e-06, + "loss": 0.293, + "step": 16129 + }, + { + "epoch": 0.47, + "grad_norm": 1.865286997592844, + "learning_rate": 5.760778125323916e-06, + "loss": 0.3563, + "step": 16130 + }, + { + "epoch": 0.47, + "grad_norm": 2.661761520792849, + "learning_rate": 5.760313883335105e-06, + "loss": 0.3061, + "step": 16131 + }, + { + "epoch": 0.47, + "grad_norm": 1.3943079858394187, + "learning_rate": 5.759849634636517e-06, + "loss": 0.329, + "step": 16132 + }, + { + "epoch": 0.47, + "grad_norm": 1.3443166817133274, + "learning_rate": 5.759385379232246e-06, + "loss": 0.2967, + "step": 16133 + }, + { + "epoch": 0.47, + "grad_norm": 1.3983639290577137, + "learning_rate": 5.758921117126393e-06, + "loss": 0.3324, + "step": 16134 + }, + { + "epoch": 0.47, + "grad_norm": 1.2623382611020393, + "learning_rate": 5.758456848323049e-06, + "loss": 0.307, + "step": 16135 + }, + { + "epoch": 0.47, + "grad_norm": 1.5855813986344665, + "learning_rate": 5.757992572826317e-06, + "loss": 0.3205, + "step": 16136 + }, + { + "epoch": 0.47, + "grad_norm": 1.2831616790142364, + "learning_rate": 5.757528290640293e-06, + "loss": 0.2975, + "step": 16137 + }, + { + "epoch": 0.47, + "grad_norm": 1.5518830993971144, + "learning_rate": 5.7570640017690725e-06, + "loss": 0.3407, + "step": 16138 + }, + { + "epoch": 0.47, + "grad_norm": 0.981967872194619, + "learning_rate": 5.756599706216754e-06, + "loss": 0.5998, + "step": 16139 + }, + { + "epoch": 0.47, + "grad_norm": 1.7384554904803362, + "learning_rate": 5.7561354039874325e-06, + "loss": 0.3251, + "step": 16140 + }, + { + "epoch": 0.47, + "grad_norm": 1.464769526707571, + "learning_rate": 5.755671095085211e-06, + "loss": 0.3169, + "step": 16141 + }, + { + "epoch": 0.47, + "grad_norm": 1.2713785662170343, + "learning_rate": 5.755206779514182e-06, + "loss": 0.3296, + "step": 16142 + }, + { + "epoch": 0.47, + "grad_norm": 1.2113352644744078, + "learning_rate": 5.754742457278444e-06, + "loss": 0.3464, + "step": 16143 + }, + { + "epoch": 0.47, + "grad_norm": 1.276635880870721, + "learning_rate": 5.754278128382096e-06, + "loss": 0.3315, + "step": 16144 + }, + { + "epoch": 0.47, + "grad_norm": 1.3997204326993036, + "learning_rate": 5.753813792829236e-06, + "loss": 0.3095, + "step": 16145 + }, + { + "epoch": 0.47, + "grad_norm": 1.236057153248897, + "learning_rate": 5.753349450623961e-06, + "loss": 0.3382, + "step": 16146 + }, + { + "epoch": 0.47, + "grad_norm": 1.370529605992555, + "learning_rate": 5.752885101770368e-06, + "loss": 0.3092, + "step": 16147 + }, + { + "epoch": 0.47, + "grad_norm": 1.3456144359012252, + "learning_rate": 5.752420746272556e-06, + "loss": 0.3235, + "step": 16148 + }, + { + "epoch": 0.47, + "grad_norm": 1.332609575921687, + "learning_rate": 5.751956384134623e-06, + "loss": 0.3098, + "step": 16149 + }, + { + "epoch": 0.47, + "grad_norm": 1.2746949001373502, + "learning_rate": 5.751492015360666e-06, + "loss": 0.2945, + "step": 16150 + }, + { + "epoch": 0.47, + "grad_norm": 1.3979848495923917, + "learning_rate": 5.751027639954784e-06, + "loss": 0.3293, + "step": 16151 + }, + { + "epoch": 0.47, + "grad_norm": 1.319864433076752, + "learning_rate": 5.7505632579210734e-06, + "loss": 0.3024, + "step": 16152 + }, + { + "epoch": 0.47, + "grad_norm": 1.1886543403988588, + "learning_rate": 5.750098869263635e-06, + "loss": 0.2954, + "step": 16153 + }, + { + "epoch": 0.47, + "grad_norm": 1.5944965353990965, + "learning_rate": 5.749634473986565e-06, + "loss": 0.3206, + "step": 16154 + }, + { + "epoch": 0.47, + "grad_norm": 1.2817620712490732, + "learning_rate": 5.749170072093964e-06, + "loss": 0.3136, + "step": 16155 + }, + { + "epoch": 0.47, + "grad_norm": 1.4116419096393351, + "learning_rate": 5.7487056635899275e-06, + "loss": 0.3416, + "step": 16156 + }, + { + "epoch": 0.47, + "grad_norm": 1.5829713845617108, + "learning_rate": 5.748241248478556e-06, + "loss": 0.3733, + "step": 16157 + }, + { + "epoch": 0.47, + "grad_norm": 1.285713247453509, + "learning_rate": 5.747776826763946e-06, + "loss": 0.2979, + "step": 16158 + }, + { + "epoch": 0.47, + "grad_norm": 1.2652066078629993, + "learning_rate": 5.7473123984501975e-06, + "loss": 0.2982, + "step": 16159 + }, + { + "epoch": 0.47, + "grad_norm": 1.3180938650285166, + "learning_rate": 5.7468479635414084e-06, + "loss": 0.3192, + "step": 16160 + }, + { + "epoch": 0.47, + "grad_norm": 1.3830223727124675, + "learning_rate": 5.746383522041679e-06, + "loss": 0.333, + "step": 16161 + }, + { + "epoch": 0.47, + "grad_norm": 2.473942110681668, + "learning_rate": 5.745919073955107e-06, + "loss": 0.3338, + "step": 16162 + }, + { + "epoch": 0.47, + "grad_norm": 1.2692684904406202, + "learning_rate": 5.74545461928579e-06, + "loss": 0.3125, + "step": 16163 + }, + { + "epoch": 0.47, + "grad_norm": 1.3890360537672217, + "learning_rate": 5.744990158037828e-06, + "loss": 0.3162, + "step": 16164 + }, + { + "epoch": 0.47, + "grad_norm": 1.3088757459269609, + "learning_rate": 5.744525690215319e-06, + "loss": 0.3218, + "step": 16165 + }, + { + "epoch": 0.47, + "grad_norm": 1.3559798941698227, + "learning_rate": 5.744061215822363e-06, + "loss": 0.3228, + "step": 16166 + }, + { + "epoch": 0.47, + "grad_norm": 1.363148783222587, + "learning_rate": 5.7435967348630586e-06, + "loss": 0.3313, + "step": 16167 + }, + { + "epoch": 0.47, + "grad_norm": 1.5775854477718398, + "learning_rate": 5.743132247341504e-06, + "loss": 0.3147, + "step": 16168 + }, + { + "epoch": 0.47, + "grad_norm": 1.549347937907501, + "learning_rate": 5.7426677532618e-06, + "loss": 0.3265, + "step": 16169 + }, + { + "epoch": 0.47, + "grad_norm": 1.1879917860592988, + "learning_rate": 5.742203252628045e-06, + "loss": 0.2994, + "step": 16170 + }, + { + "epoch": 0.47, + "grad_norm": 0.9541369087274105, + "learning_rate": 5.741738745444336e-06, + "loss": 0.6149, + "step": 16171 + }, + { + "epoch": 0.47, + "grad_norm": 1.3143998029225052, + "learning_rate": 5.741274231714776e-06, + "loss": 0.3236, + "step": 16172 + }, + { + "epoch": 0.47, + "grad_norm": 1.397709458960476, + "learning_rate": 5.740809711443464e-06, + "loss": 0.3014, + "step": 16173 + }, + { + "epoch": 0.47, + "grad_norm": 1.705814017524696, + "learning_rate": 5.740345184634496e-06, + "loss": 0.326, + "step": 16174 + }, + { + "epoch": 0.47, + "grad_norm": 0.9625060656070789, + "learning_rate": 5.739880651291973e-06, + "loss": 0.6366, + "step": 16175 + }, + { + "epoch": 0.47, + "grad_norm": 1.3535588166484531, + "learning_rate": 5.739416111419999e-06, + "loss": 0.3589, + "step": 16176 + }, + { + "epoch": 0.47, + "grad_norm": 0.9521864777700666, + "learning_rate": 5.738951565022666e-06, + "loss": 0.5682, + "step": 16177 + }, + { + "epoch": 0.47, + "grad_norm": 1.1888341287204292, + "learning_rate": 5.738487012104079e-06, + "loss": 0.3189, + "step": 16178 + }, + { + "epoch": 0.47, + "grad_norm": 1.308103823675324, + "learning_rate": 5.738022452668335e-06, + "loss": 0.3324, + "step": 16179 + }, + { + "epoch": 0.47, + "grad_norm": 1.2669455162833012, + "learning_rate": 5.737557886719534e-06, + "loss": 0.295, + "step": 16180 + }, + { + "epoch": 0.47, + "grad_norm": 1.4539136335705727, + "learning_rate": 5.737093314261777e-06, + "loss": 0.3352, + "step": 16181 + }, + { + "epoch": 0.47, + "grad_norm": 1.3548092229125803, + "learning_rate": 5.7366287352991636e-06, + "loss": 0.3267, + "step": 16182 + }, + { + "epoch": 0.47, + "grad_norm": 1.4332070842320455, + "learning_rate": 5.736164149835793e-06, + "loss": 0.3026, + "step": 16183 + }, + { + "epoch": 0.47, + "grad_norm": 1.345530275929396, + "learning_rate": 5.735699557875766e-06, + "loss": 0.3136, + "step": 16184 + }, + { + "epoch": 0.47, + "grad_norm": 1.3368340836663168, + "learning_rate": 5.735234959423182e-06, + "loss": 0.293, + "step": 16185 + }, + { + "epoch": 0.47, + "grad_norm": 1.3081849561871377, + "learning_rate": 5.734770354482141e-06, + "loss": 0.3096, + "step": 16186 + }, + { + "epoch": 0.47, + "grad_norm": 1.3729052494739735, + "learning_rate": 5.734305743056745e-06, + "loss": 0.3008, + "step": 16187 + }, + { + "epoch": 0.47, + "grad_norm": 3.0209625343257223, + "learning_rate": 5.733841125151092e-06, + "loss": 0.3469, + "step": 16188 + }, + { + "epoch": 0.47, + "grad_norm": 1.9762320961104989, + "learning_rate": 5.733376500769281e-06, + "loss": 0.3218, + "step": 16189 + }, + { + "epoch": 0.47, + "grad_norm": 1.5767344402028642, + "learning_rate": 5.732911869915418e-06, + "loss": 0.3199, + "step": 16190 + }, + { + "epoch": 0.47, + "grad_norm": 2.5831683326533526, + "learning_rate": 5.732447232593596e-06, + "loss": 0.3109, + "step": 16191 + }, + { + "epoch": 0.47, + "grad_norm": 2.430570943527845, + "learning_rate": 5.7319825888079215e-06, + "loss": 0.31, + "step": 16192 + }, + { + "epoch": 0.47, + "grad_norm": 1.2877179906708207, + "learning_rate": 5.731517938562492e-06, + "loss": 0.3138, + "step": 16193 + }, + { + "epoch": 0.47, + "grad_norm": 1.9076755306389672, + "learning_rate": 5.731053281861407e-06, + "loss": 0.3145, + "step": 16194 + }, + { + "epoch": 0.47, + "grad_norm": 1.2495982035078361, + "learning_rate": 5.73058861870877e-06, + "loss": 0.3194, + "step": 16195 + }, + { + "epoch": 0.47, + "grad_norm": 1.3267888701699482, + "learning_rate": 5.73012394910868e-06, + "loss": 0.306, + "step": 16196 + }, + { + "epoch": 0.47, + "grad_norm": 1.339888002768868, + "learning_rate": 5.729659273065239e-06, + "loss": 0.3297, + "step": 16197 + }, + { + "epoch": 0.47, + "grad_norm": 1.2209330914645595, + "learning_rate": 5.729194590582545e-06, + "loss": 0.319, + "step": 16198 + }, + { + "epoch": 0.47, + "grad_norm": 1.3443671321868254, + "learning_rate": 5.728729901664701e-06, + "loss": 0.3191, + "step": 16199 + }, + { + "epoch": 0.47, + "grad_norm": 1.2593474991521578, + "learning_rate": 5.728265206315809e-06, + "loss": 0.3151, + "step": 16200 + }, + { + "epoch": 0.47, + "grad_norm": 1.3507345111875217, + "learning_rate": 5.727800504539968e-06, + "loss": 0.3354, + "step": 16201 + }, + { + "epoch": 0.47, + "grad_norm": 1.4110946825491442, + "learning_rate": 5.727335796341279e-06, + "loss": 0.3117, + "step": 16202 + }, + { + "epoch": 0.47, + "grad_norm": 1.4406349940850918, + "learning_rate": 5.726871081723843e-06, + "loss": 0.317, + "step": 16203 + }, + { + "epoch": 0.47, + "grad_norm": 1.2304586712700973, + "learning_rate": 5.726406360691763e-06, + "loss": 0.3038, + "step": 16204 + }, + { + "epoch": 0.47, + "grad_norm": 1.3606598154923049, + "learning_rate": 5.725941633249138e-06, + "loss": 0.3306, + "step": 16205 + }, + { + "epoch": 0.47, + "grad_norm": 1.2766129671131756, + "learning_rate": 5.725476899400071e-06, + "loss": 0.2954, + "step": 16206 + }, + { + "epoch": 0.47, + "grad_norm": 1.299716281232766, + "learning_rate": 5.725012159148661e-06, + "loss": 0.3316, + "step": 16207 + }, + { + "epoch": 0.47, + "grad_norm": 2.0366151544344726, + "learning_rate": 5.7245474124990104e-06, + "loss": 0.3095, + "step": 16208 + }, + { + "epoch": 0.47, + "grad_norm": 1.2939905292445675, + "learning_rate": 5.724082659455222e-06, + "loss": 0.3122, + "step": 16209 + }, + { + "epoch": 0.47, + "grad_norm": 0.9824723502414516, + "learning_rate": 5.723617900021397e-06, + "loss": 0.5762, + "step": 16210 + }, + { + "epoch": 0.47, + "grad_norm": 1.4928075373788108, + "learning_rate": 5.723153134201633e-06, + "loss": 0.3288, + "step": 16211 + }, + { + "epoch": 0.47, + "grad_norm": 1.3430812652677975, + "learning_rate": 5.722688362000036e-06, + "loss": 0.3189, + "step": 16212 + }, + { + "epoch": 0.47, + "grad_norm": 1.339884656820528, + "learning_rate": 5.722223583420707e-06, + "loss": 0.3271, + "step": 16213 + }, + { + "epoch": 0.47, + "grad_norm": 1.3375566849887706, + "learning_rate": 5.721758798467747e-06, + "loss": 0.3266, + "step": 16214 + }, + { + "epoch": 0.47, + "grad_norm": 1.2563532472085084, + "learning_rate": 5.721294007145256e-06, + "loss": 0.3133, + "step": 16215 + }, + { + "epoch": 0.47, + "grad_norm": 1.454443638089983, + "learning_rate": 5.720829209457338e-06, + "loss": 0.304, + "step": 16216 + }, + { + "epoch": 0.47, + "grad_norm": 1.4550181400513749, + "learning_rate": 5.720364405408094e-06, + "loss": 0.3126, + "step": 16217 + }, + { + "epoch": 0.47, + "grad_norm": 1.626122114106351, + "learning_rate": 5.7198995950016276e-06, + "loss": 0.2987, + "step": 16218 + }, + { + "epoch": 0.47, + "grad_norm": 1.4665059275251944, + "learning_rate": 5.7194347782420375e-06, + "loss": 0.3339, + "step": 16219 + }, + { + "epoch": 0.47, + "grad_norm": 1.3970955130015785, + "learning_rate": 5.71896995513343e-06, + "loss": 0.3157, + "step": 16220 + }, + { + "epoch": 0.47, + "grad_norm": 1.218670571760767, + "learning_rate": 5.718505125679902e-06, + "loss": 0.325, + "step": 16221 + }, + { + "epoch": 0.47, + "grad_norm": 1.4819198040659576, + "learning_rate": 5.7180402898855595e-06, + "loss": 0.3479, + "step": 16222 + }, + { + "epoch": 0.47, + "grad_norm": 1.174351716167983, + "learning_rate": 5.717575447754502e-06, + "loss": 0.314, + "step": 16223 + }, + { + "epoch": 0.47, + "grad_norm": 2.113348750173902, + "learning_rate": 5.717110599290836e-06, + "loss": 0.3437, + "step": 16224 + }, + { + "epoch": 0.47, + "grad_norm": 1.2272886983461506, + "learning_rate": 5.716645744498659e-06, + "loss": 0.2913, + "step": 16225 + }, + { + "epoch": 0.47, + "grad_norm": 1.2341120162822967, + "learning_rate": 5.716180883382075e-06, + "loss": 0.3017, + "step": 16226 + }, + { + "epoch": 0.47, + "grad_norm": 1.230617429026254, + "learning_rate": 5.7157160159451875e-06, + "loss": 0.3345, + "step": 16227 + }, + { + "epoch": 0.47, + "grad_norm": 1.2366825572410938, + "learning_rate": 5.7152511421921e-06, + "loss": 0.3169, + "step": 16228 + }, + { + "epoch": 0.47, + "grad_norm": 1.205781372041461, + "learning_rate": 5.71478626212691e-06, + "loss": 0.3129, + "step": 16229 + }, + { + "epoch": 0.47, + "grad_norm": 1.2069577500671764, + "learning_rate": 5.714321375753725e-06, + "loss": 0.3165, + "step": 16230 + }, + { + "epoch": 0.47, + "grad_norm": 1.4069516340332493, + "learning_rate": 5.713856483076645e-06, + "loss": 0.3258, + "step": 16231 + }, + { + "epoch": 0.47, + "grad_norm": 1.1995934180515326, + "learning_rate": 5.713391584099776e-06, + "loss": 0.2904, + "step": 16232 + }, + { + "epoch": 0.47, + "grad_norm": 1.2215196971441378, + "learning_rate": 5.712926678827218e-06, + "loss": 0.3265, + "step": 16233 + }, + { + "epoch": 0.47, + "grad_norm": 1.293589868783808, + "learning_rate": 5.712461767263072e-06, + "loss": 0.313, + "step": 16234 + }, + { + "epoch": 0.47, + "grad_norm": 1.2249828818252992, + "learning_rate": 5.711996849411443e-06, + "loss": 0.3053, + "step": 16235 + }, + { + "epoch": 0.47, + "grad_norm": 1.2917281646645016, + "learning_rate": 5.711531925276434e-06, + "loss": 0.3024, + "step": 16236 + }, + { + "epoch": 0.47, + "grad_norm": 1.6435245059319503, + "learning_rate": 5.711066994862149e-06, + "loss": 0.3278, + "step": 16237 + }, + { + "epoch": 0.47, + "grad_norm": 1.4220177207923868, + "learning_rate": 5.710602058172691e-06, + "loss": 0.3187, + "step": 16238 + }, + { + "epoch": 0.47, + "grad_norm": 1.9581117718994911, + "learning_rate": 5.7101371152121596e-06, + "loss": 0.3209, + "step": 16239 + }, + { + "epoch": 0.47, + "grad_norm": 1.3682639619514778, + "learning_rate": 5.709672165984661e-06, + "loss": 0.3337, + "step": 16240 + }, + { + "epoch": 0.47, + "grad_norm": 1.2979085145731555, + "learning_rate": 5.709207210494297e-06, + "loss": 0.3329, + "step": 16241 + }, + { + "epoch": 0.47, + "grad_norm": 1.2563646588046609, + "learning_rate": 5.708742248745174e-06, + "loss": 0.3395, + "step": 16242 + }, + { + "epoch": 0.47, + "grad_norm": 1.323576019727917, + "learning_rate": 5.708277280741391e-06, + "loss": 0.3367, + "step": 16243 + }, + { + "epoch": 0.47, + "grad_norm": 1.5037611029001583, + "learning_rate": 5.707812306487053e-06, + "loss": 0.3292, + "step": 16244 + }, + { + "epoch": 0.47, + "grad_norm": 1.3701031233122571, + "learning_rate": 5.707347325986263e-06, + "loss": 0.3393, + "step": 16245 + }, + { + "epoch": 0.47, + "grad_norm": 1.3173968997025303, + "learning_rate": 5.706882339243127e-06, + "loss": 0.3004, + "step": 16246 + }, + { + "epoch": 0.47, + "grad_norm": 1.526341539400554, + "learning_rate": 5.706417346261745e-06, + "loss": 0.2952, + "step": 16247 + }, + { + "epoch": 0.47, + "grad_norm": 1.2202228355646865, + "learning_rate": 5.705952347046223e-06, + "loss": 0.3132, + "step": 16248 + }, + { + "epoch": 0.47, + "grad_norm": 1.1157660640924068, + "learning_rate": 5.705487341600663e-06, + "loss": 0.2815, + "step": 16249 + }, + { + "epoch": 0.47, + "grad_norm": 1.286772225275932, + "learning_rate": 5.705022329929169e-06, + "loss": 0.3268, + "step": 16250 + }, + { + "epoch": 0.47, + "grad_norm": 1.3857628064792398, + "learning_rate": 5.704557312035847e-06, + "loss": 0.3073, + "step": 16251 + }, + { + "epoch": 0.47, + "grad_norm": 1.4221608407709998, + "learning_rate": 5.704092287924797e-06, + "loss": 0.3105, + "step": 16252 + }, + { + "epoch": 0.47, + "grad_norm": 1.6775036326247683, + "learning_rate": 5.703627257600125e-06, + "loss": 0.3206, + "step": 16253 + }, + { + "epoch": 0.47, + "grad_norm": 1.4222516738137645, + "learning_rate": 5.703162221065935e-06, + "loss": 0.319, + "step": 16254 + }, + { + "epoch": 0.47, + "grad_norm": 1.7499931572507734, + "learning_rate": 5.70269717832633e-06, + "loss": 0.303, + "step": 16255 + }, + { + "epoch": 0.47, + "grad_norm": 1.3624334269993668, + "learning_rate": 5.7022321293854154e-06, + "loss": 0.3425, + "step": 16256 + }, + { + "epoch": 0.47, + "grad_norm": 1.4067525118250988, + "learning_rate": 5.701767074247294e-06, + "loss": 0.3214, + "step": 16257 + }, + { + "epoch": 0.47, + "grad_norm": 1.3285990243124675, + "learning_rate": 5.70130201291607e-06, + "loss": 0.3077, + "step": 16258 + }, + { + "epoch": 0.47, + "grad_norm": 1.307937437032008, + "learning_rate": 5.700836945395849e-06, + "loss": 0.3068, + "step": 16259 + }, + { + "epoch": 0.47, + "grad_norm": 1.3287650016503763, + "learning_rate": 5.700371871690734e-06, + "loss": 0.3225, + "step": 16260 + }, + { + "epoch": 0.47, + "grad_norm": 1.2931190147847287, + "learning_rate": 5.69990679180483e-06, + "loss": 0.3229, + "step": 16261 + }, + { + "epoch": 0.47, + "grad_norm": 0.8857469979324287, + "learning_rate": 5.6994417057422406e-06, + "loss": 0.5826, + "step": 16262 + }, + { + "epoch": 0.47, + "grad_norm": 2.556785457581296, + "learning_rate": 5.698976613507069e-06, + "loss": 0.3064, + "step": 16263 + }, + { + "epoch": 0.47, + "grad_norm": 1.3318690373943916, + "learning_rate": 5.698511515103421e-06, + "loss": 0.3193, + "step": 16264 + }, + { + "epoch": 0.47, + "grad_norm": 1.4428944722849524, + "learning_rate": 5.698046410535403e-06, + "loss": 0.3151, + "step": 16265 + }, + { + "epoch": 0.47, + "grad_norm": 1.328889292876362, + "learning_rate": 5.697581299807117e-06, + "loss": 0.317, + "step": 16266 + }, + { + "epoch": 0.47, + "grad_norm": 1.3901504569700511, + "learning_rate": 5.697116182922668e-06, + "loss": 0.3231, + "step": 16267 + }, + { + "epoch": 0.47, + "grad_norm": 0.9751097073185774, + "learning_rate": 5.696651059886159e-06, + "loss": 0.6127, + "step": 16268 + }, + { + "epoch": 0.47, + "grad_norm": 2.2598686312422047, + "learning_rate": 5.6961859307017e-06, + "loss": 0.3242, + "step": 16269 + }, + { + "epoch": 0.47, + "grad_norm": 1.458545252774375, + "learning_rate": 5.6957207953733904e-06, + "loss": 0.3078, + "step": 16270 + }, + { + "epoch": 0.47, + "grad_norm": 1.2971467546661541, + "learning_rate": 5.695255653905337e-06, + "loss": 0.3126, + "step": 16271 + }, + { + "epoch": 0.47, + "grad_norm": 1.291027607433865, + "learning_rate": 5.694790506301645e-06, + "loss": 0.3101, + "step": 16272 + }, + { + "epoch": 0.47, + "grad_norm": 1.2062941483982599, + "learning_rate": 5.69432535256642e-06, + "loss": 0.2968, + "step": 16273 + }, + { + "epoch": 0.47, + "grad_norm": 1.174720296210534, + "learning_rate": 5.693860192703768e-06, + "loss": 0.2895, + "step": 16274 + }, + { + "epoch": 0.47, + "grad_norm": 1.3940368048204164, + "learning_rate": 5.693395026717788e-06, + "loss": 0.312, + "step": 16275 + }, + { + "epoch": 0.47, + "grad_norm": 1.2585650209455401, + "learning_rate": 5.692929854612591e-06, + "loss": 0.3315, + "step": 16276 + }, + { + "epoch": 0.47, + "grad_norm": 1.4953930097991595, + "learning_rate": 5.69246467639228e-06, + "loss": 0.3324, + "step": 16277 + }, + { + "epoch": 0.47, + "grad_norm": 1.5557130113705058, + "learning_rate": 5.69199949206096e-06, + "loss": 0.3027, + "step": 16278 + }, + { + "epoch": 0.47, + "grad_norm": 1.3166906491077792, + "learning_rate": 5.691534301622739e-06, + "loss": 0.3287, + "step": 16279 + }, + { + "epoch": 0.47, + "grad_norm": 1.2933253034725165, + "learning_rate": 5.691069105081718e-06, + "loss": 0.3142, + "step": 16280 + }, + { + "epoch": 0.47, + "grad_norm": 1.2049332919378646, + "learning_rate": 5.690603902442005e-06, + "loss": 0.3026, + "step": 16281 + }, + { + "epoch": 0.47, + "grad_norm": 1.306424364573959, + "learning_rate": 5.690138693707704e-06, + "loss": 0.3335, + "step": 16282 + }, + { + "epoch": 0.47, + "grad_norm": 1.3384946449185235, + "learning_rate": 5.689673478882923e-06, + "loss": 0.3109, + "step": 16283 + }, + { + "epoch": 0.47, + "grad_norm": 1.411212566193023, + "learning_rate": 5.689208257971766e-06, + "loss": 0.303, + "step": 16284 + }, + { + "epoch": 0.47, + "grad_norm": 1.4050982223866064, + "learning_rate": 5.6887430309783365e-06, + "loss": 0.3393, + "step": 16285 + }, + { + "epoch": 0.47, + "grad_norm": 3.7733844591500976, + "learning_rate": 5.688277797906744e-06, + "loss": 0.3045, + "step": 16286 + }, + { + "epoch": 0.47, + "grad_norm": 1.303000819782969, + "learning_rate": 5.687812558761092e-06, + "loss": 0.3008, + "step": 16287 + }, + { + "epoch": 0.47, + "grad_norm": 1.2566238094195339, + "learning_rate": 5.687347313545487e-06, + "loss": 0.3232, + "step": 16288 + }, + { + "epoch": 0.47, + "grad_norm": 1.2028029183070472, + "learning_rate": 5.686882062264033e-06, + "loss": 0.2866, + "step": 16289 + }, + { + "epoch": 0.47, + "grad_norm": 1.2833434767921308, + "learning_rate": 5.686416804920838e-06, + "loss": 0.3022, + "step": 16290 + }, + { + "epoch": 0.47, + "grad_norm": 1.422533513892258, + "learning_rate": 5.685951541520006e-06, + "loss": 0.289, + "step": 16291 + }, + { + "epoch": 0.47, + "grad_norm": 1.2691891429193214, + "learning_rate": 5.685486272065645e-06, + "loss": 0.309, + "step": 16292 + }, + { + "epoch": 0.47, + "grad_norm": 1.4178833589011979, + "learning_rate": 5.685020996561861e-06, + "loss": 0.3215, + "step": 16293 + }, + { + "epoch": 0.47, + "grad_norm": 1.4873345935055853, + "learning_rate": 5.684555715012757e-06, + "loss": 0.296, + "step": 16294 + }, + { + "epoch": 0.47, + "grad_norm": 1.5525310784700639, + "learning_rate": 5.684090427422442e-06, + "loss": 0.3312, + "step": 16295 + }, + { + "epoch": 0.47, + "grad_norm": 1.5231246394936275, + "learning_rate": 5.683625133795021e-06, + "loss": 0.3219, + "step": 16296 + }, + { + "epoch": 0.47, + "grad_norm": 1.2945272209443308, + "learning_rate": 5.683159834134602e-06, + "loss": 0.3342, + "step": 16297 + }, + { + "epoch": 0.47, + "grad_norm": 1.379412579563387, + "learning_rate": 5.68269452844529e-06, + "loss": 0.3279, + "step": 16298 + }, + { + "epoch": 0.47, + "grad_norm": 1.3221342144832142, + "learning_rate": 5.682229216731189e-06, + "loss": 0.295, + "step": 16299 + }, + { + "epoch": 0.47, + "grad_norm": 1.3027559366805175, + "learning_rate": 5.6817638989964086e-06, + "loss": 0.3138, + "step": 16300 + }, + { + "epoch": 0.47, + "grad_norm": 1.3549530894502673, + "learning_rate": 5.681298575245056e-06, + "loss": 0.3464, + "step": 16301 + }, + { + "epoch": 0.47, + "grad_norm": 1.5859280632805939, + "learning_rate": 5.680833245481234e-06, + "loss": 0.3256, + "step": 16302 + }, + { + "epoch": 0.47, + "grad_norm": 1.4614755984585093, + "learning_rate": 5.6803679097090515e-06, + "loss": 0.3373, + "step": 16303 + }, + { + "epoch": 0.47, + "grad_norm": 1.3405359164427135, + "learning_rate": 5.679902567932615e-06, + "loss": 0.3222, + "step": 16304 + }, + { + "epoch": 0.47, + "grad_norm": 1.4208108056470972, + "learning_rate": 5.6794372201560295e-06, + "loss": 0.3179, + "step": 16305 + }, + { + "epoch": 0.47, + "grad_norm": 1.3397714879416411, + "learning_rate": 5.678971866383405e-06, + "loss": 0.3134, + "step": 16306 + }, + { + "epoch": 0.47, + "grad_norm": 1.3103760089300482, + "learning_rate": 5.678506506618845e-06, + "loss": 0.3118, + "step": 16307 + }, + { + "epoch": 0.47, + "grad_norm": 2.0090932184208476, + "learning_rate": 5.678041140866457e-06, + "loss": 0.3158, + "step": 16308 + }, + { + "epoch": 0.47, + "grad_norm": 2.6588741039686945, + "learning_rate": 5.677575769130349e-06, + "loss": 0.3231, + "step": 16309 + }, + { + "epoch": 0.47, + "grad_norm": 1.41810603703426, + "learning_rate": 5.677110391414629e-06, + "loss": 0.3474, + "step": 16310 + }, + { + "epoch": 0.47, + "grad_norm": 1.3874237199800632, + "learning_rate": 5.6766450077234e-06, + "loss": 0.3178, + "step": 16311 + }, + { + "epoch": 0.47, + "grad_norm": 1.2479807545823136, + "learning_rate": 5.676179618060772e-06, + "loss": 0.3137, + "step": 16312 + }, + { + "epoch": 0.47, + "grad_norm": 1.2607458877286917, + "learning_rate": 5.67571422243085e-06, + "loss": 0.2934, + "step": 16313 + }, + { + "epoch": 0.47, + "grad_norm": 1.4189196231320225, + "learning_rate": 5.675248820837744e-06, + "loss": 0.3173, + "step": 16314 + }, + { + "epoch": 0.47, + "grad_norm": 1.4342326139429304, + "learning_rate": 5.67478341328556e-06, + "loss": 0.3355, + "step": 16315 + }, + { + "epoch": 0.47, + "grad_norm": 1.2759252621654935, + "learning_rate": 5.674317999778406e-06, + "loss": 0.3101, + "step": 16316 + }, + { + "epoch": 0.47, + "grad_norm": 1.6249175124084103, + "learning_rate": 5.6738525803203855e-06, + "loss": 0.3132, + "step": 16317 + }, + { + "epoch": 0.47, + "grad_norm": 1.4021941968098597, + "learning_rate": 5.673387154915611e-06, + "loss": 0.3237, + "step": 16318 + }, + { + "epoch": 0.47, + "grad_norm": 1.476636251954147, + "learning_rate": 5.672921723568184e-06, + "loss": 0.3175, + "step": 16319 + }, + { + "epoch": 0.47, + "grad_norm": 0.9322794446051151, + "learning_rate": 5.672456286282218e-06, + "loss": 0.5927, + "step": 16320 + }, + { + "epoch": 0.47, + "grad_norm": 1.3617591719793707, + "learning_rate": 5.671990843061817e-06, + "loss": 0.3298, + "step": 16321 + }, + { + "epoch": 0.47, + "grad_norm": 1.2419696280363621, + "learning_rate": 5.671525393911091e-06, + "loss": 0.3222, + "step": 16322 + }, + { + "epoch": 0.47, + "grad_norm": 1.4131427893117456, + "learning_rate": 5.671059938834144e-06, + "loss": 0.3306, + "step": 16323 + }, + { + "epoch": 0.47, + "grad_norm": 0.9123853223883505, + "learning_rate": 5.670594477835086e-06, + "loss": 0.5258, + "step": 16324 + }, + { + "epoch": 0.47, + "grad_norm": 1.4035685402744913, + "learning_rate": 5.670129010918024e-06, + "loss": 0.3117, + "step": 16325 + }, + { + "epoch": 0.47, + "grad_norm": 1.2763832381052969, + "learning_rate": 5.669663538087067e-06, + "loss": 0.2991, + "step": 16326 + }, + { + "epoch": 0.47, + "grad_norm": 1.298561543868207, + "learning_rate": 5.66919805934632e-06, + "loss": 0.3181, + "step": 16327 + }, + { + "epoch": 0.47, + "grad_norm": 1.395752987100804, + "learning_rate": 5.668732574699896e-06, + "loss": 0.3018, + "step": 16328 + }, + { + "epoch": 0.47, + "grad_norm": 1.2734748227075645, + "learning_rate": 5.668267084151896e-06, + "loss": 0.3112, + "step": 16329 + }, + { + "epoch": 0.47, + "grad_norm": 1.3009248598015044, + "learning_rate": 5.667801587706434e-06, + "loss": 0.3108, + "step": 16330 + }, + { + "epoch": 0.47, + "grad_norm": 1.769354801078482, + "learning_rate": 5.667336085367614e-06, + "loss": 0.3106, + "step": 16331 + }, + { + "epoch": 0.47, + "grad_norm": 1.3064341142233094, + "learning_rate": 5.666870577139547e-06, + "loss": 0.3207, + "step": 16332 + }, + { + "epoch": 0.47, + "grad_norm": 1.3824383543514567, + "learning_rate": 5.666405063026339e-06, + "loss": 0.3161, + "step": 16333 + }, + { + "epoch": 0.47, + "grad_norm": 1.3236113213284595, + "learning_rate": 5.665939543032099e-06, + "loss": 0.3149, + "step": 16334 + }, + { + "epoch": 0.47, + "grad_norm": 1.3077084217167858, + "learning_rate": 5.665474017160936e-06, + "loss": 0.323, + "step": 16335 + }, + { + "epoch": 0.47, + "grad_norm": 1.5579692828416165, + "learning_rate": 5.6650084854169555e-06, + "loss": 0.3031, + "step": 16336 + }, + { + "epoch": 0.47, + "grad_norm": 1.2259398577146459, + "learning_rate": 5.664542947804269e-06, + "loss": 0.3085, + "step": 16337 + }, + { + "epoch": 0.47, + "grad_norm": 1.221250331618326, + "learning_rate": 5.664077404326984e-06, + "loss": 0.3117, + "step": 16338 + }, + { + "epoch": 0.47, + "grad_norm": 1.2582360230234495, + "learning_rate": 5.663611854989207e-06, + "loss": 0.306, + "step": 16339 + }, + { + "epoch": 0.47, + "grad_norm": 1.197644584222646, + "learning_rate": 5.663146299795049e-06, + "loss": 0.3041, + "step": 16340 + }, + { + "epoch": 0.47, + "grad_norm": 1.5692576090327721, + "learning_rate": 5.662680738748617e-06, + "loss": 0.312, + "step": 16341 + }, + { + "epoch": 0.47, + "grad_norm": 1.8039994614452617, + "learning_rate": 5.662215171854022e-06, + "loss": 0.3141, + "step": 16342 + }, + { + "epoch": 0.47, + "grad_norm": 1.2390762791563494, + "learning_rate": 5.66174959911537e-06, + "loss": 0.2951, + "step": 16343 + }, + { + "epoch": 0.47, + "grad_norm": 1.2905485691510534, + "learning_rate": 5.66128402053677e-06, + "loss": 0.3043, + "step": 16344 + }, + { + "epoch": 0.47, + "grad_norm": 1.324126413162699, + "learning_rate": 5.660818436122331e-06, + "loss": 0.3244, + "step": 16345 + }, + { + "epoch": 0.47, + "grad_norm": 1.2997262238961933, + "learning_rate": 5.6603528458761616e-06, + "loss": 0.3091, + "step": 16346 + }, + { + "epoch": 0.47, + "grad_norm": 1.4001008313344334, + "learning_rate": 5.659887249802372e-06, + "loss": 0.2997, + "step": 16347 + }, + { + "epoch": 0.47, + "grad_norm": 1.2676275929582743, + "learning_rate": 5.659421647905069e-06, + "loss": 0.3087, + "step": 16348 + }, + { + "epoch": 0.47, + "grad_norm": 1.4040899877532977, + "learning_rate": 5.658956040188363e-06, + "loss": 0.3273, + "step": 16349 + }, + { + "epoch": 0.47, + "grad_norm": 1.4986697467867212, + "learning_rate": 5.658490426656362e-06, + "loss": 0.301, + "step": 16350 + }, + { + "epoch": 0.47, + "grad_norm": 1.212933681627217, + "learning_rate": 5.6580248073131775e-06, + "loss": 0.291, + "step": 16351 + }, + { + "epoch": 0.47, + "grad_norm": 1.5735563539723132, + "learning_rate": 5.657559182162916e-06, + "loss": 0.3066, + "step": 16352 + }, + { + "epoch": 0.47, + "grad_norm": 1.4233637577421452, + "learning_rate": 5.657093551209687e-06, + "loss": 0.3298, + "step": 16353 + }, + { + "epoch": 0.47, + "grad_norm": 1.222311084697717, + "learning_rate": 5.6566279144576e-06, + "loss": 0.2996, + "step": 16354 + }, + { + "epoch": 0.47, + "grad_norm": 1.4696627379479765, + "learning_rate": 5.656162271910765e-06, + "loss": 0.3284, + "step": 16355 + }, + { + "epoch": 0.47, + "grad_norm": 1.558420858516451, + "learning_rate": 5.655696623573291e-06, + "loss": 0.3179, + "step": 16356 + }, + { + "epoch": 0.47, + "grad_norm": 1.6806932696070984, + "learning_rate": 5.655230969449285e-06, + "loss": 0.3272, + "step": 16357 + }, + { + "epoch": 0.47, + "grad_norm": 1.2507130499480617, + "learning_rate": 5.654765309542861e-06, + "loss": 0.3168, + "step": 16358 + }, + { + "epoch": 0.47, + "grad_norm": 1.5400444451953308, + "learning_rate": 5.6542996438581256e-06, + "loss": 0.3119, + "step": 16359 + }, + { + "epoch": 0.47, + "grad_norm": 1.454026787312775, + "learning_rate": 5.653833972399186e-06, + "loss": 0.2952, + "step": 16360 + }, + { + "epoch": 0.47, + "grad_norm": 1.3713046189225337, + "learning_rate": 5.653368295170158e-06, + "loss": 0.3014, + "step": 16361 + }, + { + "epoch": 0.47, + "grad_norm": 1.0058272525114127, + "learning_rate": 5.6529026121751455e-06, + "loss": 0.6668, + "step": 16362 + }, + { + "epoch": 0.47, + "grad_norm": 1.1749396630817452, + "learning_rate": 5.6524369234182594e-06, + "loss": 0.3077, + "step": 16363 + }, + { + "epoch": 0.47, + "grad_norm": 1.316438170066995, + "learning_rate": 5.651971228903611e-06, + "loss": 0.3269, + "step": 16364 + }, + { + "epoch": 0.47, + "grad_norm": 1.5756991753253038, + "learning_rate": 5.65150552863531e-06, + "loss": 0.3293, + "step": 16365 + }, + { + "epoch": 0.47, + "grad_norm": 1.3361989878978637, + "learning_rate": 5.651039822617465e-06, + "loss": 0.3002, + "step": 16366 + }, + { + "epoch": 0.47, + "grad_norm": 0.8838216442833946, + "learning_rate": 5.650574110854185e-06, + "loss": 0.5776, + "step": 16367 + }, + { + "epoch": 0.47, + "grad_norm": 1.8028747865337673, + "learning_rate": 5.650108393349583e-06, + "loss": 0.2988, + "step": 16368 + }, + { + "epoch": 0.47, + "grad_norm": 1.3681240716763918, + "learning_rate": 5.6496426701077655e-06, + "loss": 0.2819, + "step": 16369 + }, + { + "epoch": 0.47, + "grad_norm": 1.315293921729204, + "learning_rate": 5.649176941132847e-06, + "loss": 0.313, + "step": 16370 + }, + { + "epoch": 0.47, + "grad_norm": 1.5383469057171109, + "learning_rate": 5.6487112064289316e-06, + "loss": 0.3247, + "step": 16371 + }, + { + "epoch": 0.47, + "grad_norm": 1.340223456295633, + "learning_rate": 5.648245466000136e-06, + "loss": 0.2952, + "step": 16372 + }, + { + "epoch": 0.47, + "grad_norm": 1.4710510489302773, + "learning_rate": 5.647779719850564e-06, + "loss": 0.3106, + "step": 16373 + }, + { + "epoch": 0.47, + "grad_norm": 1.233931389641082, + "learning_rate": 5.647313967984331e-06, + "loss": 0.3089, + "step": 16374 + }, + { + "epoch": 0.47, + "grad_norm": 1.3057457522834068, + "learning_rate": 5.646848210405544e-06, + "loss": 0.3103, + "step": 16375 + }, + { + "epoch": 0.47, + "grad_norm": 1.4098971266409395, + "learning_rate": 5.646382447118315e-06, + "loss": 0.3169, + "step": 16376 + }, + { + "epoch": 0.48, + "grad_norm": 1.3391022445664167, + "learning_rate": 5.645916678126752e-06, + "loss": 0.3415, + "step": 16377 + }, + { + "epoch": 0.48, + "grad_norm": 1.2748284988995415, + "learning_rate": 5.645450903434968e-06, + "loss": 0.3141, + "step": 16378 + }, + { + "epoch": 0.48, + "grad_norm": 1.3856817118692948, + "learning_rate": 5.6449851230470735e-06, + "loss": 0.3448, + "step": 16379 + }, + { + "epoch": 0.48, + "grad_norm": 1.7454938756870273, + "learning_rate": 5.644519336967177e-06, + "loss": 0.309, + "step": 16380 + }, + { + "epoch": 0.48, + "grad_norm": 1.3133267865294376, + "learning_rate": 5.64405354519939e-06, + "loss": 0.3092, + "step": 16381 + }, + { + "epoch": 0.48, + "grad_norm": 1.8321209664333005, + "learning_rate": 5.643587747747825e-06, + "loss": 0.3188, + "step": 16382 + }, + { + "epoch": 0.48, + "grad_norm": 1.2818087237762308, + "learning_rate": 5.643121944616591e-06, + "loss": 0.2942, + "step": 16383 + }, + { + "epoch": 0.48, + "grad_norm": 1.732981472610919, + "learning_rate": 5.6426561358097965e-06, + "loss": 0.3211, + "step": 16384 + }, + { + "epoch": 0.48, + "grad_norm": 1.2317994563854973, + "learning_rate": 5.642190321331556e-06, + "loss": 0.2984, + "step": 16385 + }, + { + "epoch": 0.48, + "grad_norm": 1.2694812239991953, + "learning_rate": 5.6417245011859775e-06, + "loss": 0.3242, + "step": 16386 + }, + { + "epoch": 0.48, + "grad_norm": 1.747769367887689, + "learning_rate": 5.641258675377175e-06, + "loss": 0.2905, + "step": 16387 + }, + { + "epoch": 0.48, + "grad_norm": 1.325696128345541, + "learning_rate": 5.640792843909257e-06, + "loss": 0.3152, + "step": 16388 + }, + { + "epoch": 0.48, + "grad_norm": 1.040399121916134, + "learning_rate": 5.6403270067863345e-06, + "loss": 0.62, + "step": 16389 + }, + { + "epoch": 0.48, + "grad_norm": 1.2475231485065497, + "learning_rate": 5.63986116401252e-06, + "loss": 0.3078, + "step": 16390 + }, + { + "epoch": 0.48, + "grad_norm": 1.293237411245624, + "learning_rate": 5.6393953155919225e-06, + "loss": 0.3077, + "step": 16391 + }, + { + "epoch": 0.48, + "grad_norm": 1.4043244707385896, + "learning_rate": 5.638929461528655e-06, + "loss": 0.3024, + "step": 16392 + }, + { + "epoch": 0.48, + "grad_norm": 1.4289368494017007, + "learning_rate": 5.638463601826828e-06, + "loss": 0.3204, + "step": 16393 + }, + { + "epoch": 0.48, + "grad_norm": 1.3308498059367615, + "learning_rate": 5.637997736490551e-06, + "loss": 0.3223, + "step": 16394 + }, + { + "epoch": 0.48, + "grad_norm": 1.6257129574034306, + "learning_rate": 5.637531865523939e-06, + "loss": 0.3135, + "step": 16395 + }, + { + "epoch": 0.48, + "grad_norm": 1.470888119022003, + "learning_rate": 5.6370659889311e-06, + "loss": 0.2965, + "step": 16396 + }, + { + "epoch": 0.48, + "grad_norm": 1.4996758964667063, + "learning_rate": 5.636600106716147e-06, + "loss": 0.304, + "step": 16397 + }, + { + "epoch": 0.48, + "grad_norm": 1.687720286217182, + "learning_rate": 5.6361342188831915e-06, + "loss": 0.3243, + "step": 16398 + }, + { + "epoch": 0.48, + "grad_norm": 1.4516797467606368, + "learning_rate": 5.635668325436343e-06, + "loss": 0.2971, + "step": 16399 + }, + { + "epoch": 0.48, + "grad_norm": 1.4941157414649864, + "learning_rate": 5.6352024263797155e-06, + "loss": 0.3441, + "step": 16400 + }, + { + "epoch": 0.48, + "grad_norm": 2.0876731426736974, + "learning_rate": 5.63473652171742e-06, + "loss": 0.3011, + "step": 16401 + }, + { + "epoch": 0.48, + "grad_norm": 1.2158704490860424, + "learning_rate": 5.634270611453567e-06, + "loss": 0.3006, + "step": 16402 + }, + { + "epoch": 0.48, + "grad_norm": 1.2687129086378226, + "learning_rate": 5.633804695592269e-06, + "loss": 0.3229, + "step": 16403 + }, + { + "epoch": 0.48, + "grad_norm": 1.719355715364042, + "learning_rate": 5.633338774137638e-06, + "loss": 0.3169, + "step": 16404 + }, + { + "epoch": 0.48, + "grad_norm": 1.4086968019046158, + "learning_rate": 5.6328728470937845e-06, + "loss": 0.3216, + "step": 16405 + }, + { + "epoch": 0.48, + "grad_norm": 1.2921850676121376, + "learning_rate": 5.632406914464821e-06, + "loss": 0.3236, + "step": 16406 + }, + { + "epoch": 0.48, + "grad_norm": 1.3911631639930584, + "learning_rate": 5.631940976254859e-06, + "loss": 0.3094, + "step": 16407 + }, + { + "epoch": 0.48, + "grad_norm": 1.337813794789593, + "learning_rate": 5.631475032468012e-06, + "loss": 0.3446, + "step": 16408 + }, + { + "epoch": 0.48, + "grad_norm": 1.28068185397869, + "learning_rate": 5.6310090831083905e-06, + "loss": 0.3083, + "step": 16409 + }, + { + "epoch": 0.48, + "grad_norm": 1.566333135507003, + "learning_rate": 5.630543128180106e-06, + "loss": 0.3513, + "step": 16410 + }, + { + "epoch": 0.48, + "grad_norm": 2.0565609791760604, + "learning_rate": 5.630077167687274e-06, + "loss": 0.3103, + "step": 16411 + }, + { + "epoch": 0.48, + "grad_norm": 1.3657535020537088, + "learning_rate": 5.629611201634001e-06, + "loss": 0.3302, + "step": 16412 + }, + { + "epoch": 0.48, + "grad_norm": 1.266514786112487, + "learning_rate": 5.629145230024403e-06, + "loss": 0.3175, + "step": 16413 + }, + { + "epoch": 0.48, + "grad_norm": 1.2774054919041768, + "learning_rate": 5.628679252862592e-06, + "loss": 0.3015, + "step": 16414 + }, + { + "epoch": 0.48, + "grad_norm": 1.3049194347059392, + "learning_rate": 5.62821327015268e-06, + "loss": 0.3137, + "step": 16415 + }, + { + "epoch": 0.48, + "grad_norm": 1.388506960264279, + "learning_rate": 5.627747281898778e-06, + "loss": 0.3117, + "step": 16416 + }, + { + "epoch": 0.48, + "grad_norm": 1.4372835259348755, + "learning_rate": 5.627281288105001e-06, + "loss": 0.3148, + "step": 16417 + }, + { + "epoch": 0.48, + "grad_norm": 1.2618935729683627, + "learning_rate": 5.626815288775458e-06, + "loss": 0.305, + "step": 16418 + }, + { + "epoch": 0.48, + "grad_norm": 2.125775141414936, + "learning_rate": 5.626349283914263e-06, + "loss": 0.3228, + "step": 16419 + }, + { + "epoch": 0.48, + "grad_norm": 1.249861591902306, + "learning_rate": 5.62588327352553e-06, + "loss": 0.2881, + "step": 16420 + }, + { + "epoch": 0.48, + "grad_norm": 1.6462304057291088, + "learning_rate": 5.6254172576133685e-06, + "loss": 0.3558, + "step": 16421 + }, + { + "epoch": 0.48, + "grad_norm": 1.233200673906026, + "learning_rate": 5.624951236181893e-06, + "loss": 0.3048, + "step": 16422 + }, + { + "epoch": 0.48, + "grad_norm": 1.4229259581630942, + "learning_rate": 5.624485209235215e-06, + "loss": 0.3507, + "step": 16423 + }, + { + "epoch": 0.48, + "grad_norm": 1.5036369167048953, + "learning_rate": 5.62401917677745e-06, + "loss": 0.3214, + "step": 16424 + }, + { + "epoch": 0.48, + "grad_norm": 1.2470697522192307, + "learning_rate": 5.623553138812709e-06, + "loss": 0.3174, + "step": 16425 + }, + { + "epoch": 0.48, + "grad_norm": 1.2837713093219374, + "learning_rate": 5.623087095345103e-06, + "loss": 0.3155, + "step": 16426 + }, + { + "epoch": 0.48, + "grad_norm": 1.2693167146405957, + "learning_rate": 5.622621046378748e-06, + "loss": 0.3243, + "step": 16427 + }, + { + "epoch": 0.48, + "grad_norm": 1.4941018103316814, + "learning_rate": 5.622154991917754e-06, + "loss": 0.307, + "step": 16428 + }, + { + "epoch": 0.48, + "grad_norm": 1.3612605179844803, + "learning_rate": 5.621688931966237e-06, + "loss": 0.3358, + "step": 16429 + }, + { + "epoch": 0.48, + "grad_norm": 1.427769046956453, + "learning_rate": 5.621222866528308e-06, + "loss": 0.3247, + "step": 16430 + }, + { + "epoch": 0.48, + "grad_norm": 1.3861942858194505, + "learning_rate": 5.6207567956080796e-06, + "loss": 0.3194, + "step": 16431 + }, + { + "epoch": 0.48, + "grad_norm": 1.2864231122607488, + "learning_rate": 5.620290719209665e-06, + "loss": 0.3174, + "step": 16432 + }, + { + "epoch": 0.48, + "grad_norm": 1.3041169167284807, + "learning_rate": 5.619824637337178e-06, + "loss": 0.3214, + "step": 16433 + }, + { + "epoch": 0.48, + "grad_norm": 1.9263695611696545, + "learning_rate": 5.619358549994734e-06, + "loss": 0.3112, + "step": 16434 + }, + { + "epoch": 0.48, + "grad_norm": 1.333395007264516, + "learning_rate": 5.618892457186441e-06, + "loss": 0.3223, + "step": 16435 + }, + { + "epoch": 0.48, + "grad_norm": 1.2484909311839876, + "learning_rate": 5.618426358916416e-06, + "loss": 0.3202, + "step": 16436 + }, + { + "epoch": 0.48, + "grad_norm": 1.5129921767462648, + "learning_rate": 5.617960255188772e-06, + "loss": 0.2956, + "step": 16437 + }, + { + "epoch": 0.48, + "grad_norm": 1.4499898590879048, + "learning_rate": 5.617494146007623e-06, + "loss": 0.3027, + "step": 16438 + }, + { + "epoch": 0.48, + "grad_norm": 1.2634523936359345, + "learning_rate": 5.61702803137708e-06, + "loss": 0.3213, + "step": 16439 + }, + { + "epoch": 0.48, + "grad_norm": 1.2229735102011754, + "learning_rate": 5.616561911301257e-06, + "loss": 0.3103, + "step": 16440 + }, + { + "epoch": 0.48, + "grad_norm": 1.5168192242035061, + "learning_rate": 5.616095785784269e-06, + "loss": 0.3254, + "step": 16441 + }, + { + "epoch": 0.48, + "grad_norm": 1.7171219027111468, + "learning_rate": 5.615629654830228e-06, + "loss": 0.3234, + "step": 16442 + }, + { + "epoch": 0.48, + "grad_norm": 1.4380291627529713, + "learning_rate": 5.6151635184432515e-06, + "loss": 0.3121, + "step": 16443 + }, + { + "epoch": 0.48, + "grad_norm": 1.45694403659869, + "learning_rate": 5.6146973766274486e-06, + "loss": 0.3135, + "step": 16444 + }, + { + "epoch": 0.48, + "grad_norm": 1.278167681738596, + "learning_rate": 5.614231229386933e-06, + "loss": 0.3018, + "step": 16445 + }, + { + "epoch": 0.48, + "grad_norm": 1.579781576793798, + "learning_rate": 5.613765076725821e-06, + "loss": 0.337, + "step": 16446 + }, + { + "epoch": 0.48, + "grad_norm": 1.522475975110191, + "learning_rate": 5.613298918648225e-06, + "loss": 0.3114, + "step": 16447 + }, + { + "epoch": 0.48, + "grad_norm": 1.5264236676589003, + "learning_rate": 5.612832755158261e-06, + "loss": 0.3251, + "step": 16448 + }, + { + "epoch": 0.48, + "grad_norm": 1.402057382571314, + "learning_rate": 5.612366586260039e-06, + "loss": 0.322, + "step": 16449 + }, + { + "epoch": 0.48, + "grad_norm": 2.5465831727990373, + "learning_rate": 5.611900411957676e-06, + "loss": 0.3231, + "step": 16450 + }, + { + "epoch": 0.48, + "grad_norm": 1.390811681356537, + "learning_rate": 5.611434232255284e-06, + "loss": 0.3021, + "step": 16451 + }, + { + "epoch": 0.48, + "grad_norm": 1.26667675219658, + "learning_rate": 5.610968047156981e-06, + "loss": 0.3173, + "step": 16452 + }, + { + "epoch": 0.48, + "grad_norm": 1.868380098315362, + "learning_rate": 5.610501856666875e-06, + "loss": 0.3075, + "step": 16453 + }, + { + "epoch": 0.48, + "grad_norm": 1.5956955392214753, + "learning_rate": 5.610035660789084e-06, + "loss": 0.322, + "step": 16454 + }, + { + "epoch": 0.48, + "grad_norm": 1.2412436367166328, + "learning_rate": 5.609569459527722e-06, + "loss": 0.2946, + "step": 16455 + }, + { + "epoch": 0.48, + "grad_norm": 1.2985184708199136, + "learning_rate": 5.609103252886904e-06, + "loss": 0.3239, + "step": 16456 + }, + { + "epoch": 0.48, + "grad_norm": 1.5426116076938343, + "learning_rate": 5.608637040870743e-06, + "loss": 0.3083, + "step": 16457 + }, + { + "epoch": 0.48, + "grad_norm": 1.2742210780042322, + "learning_rate": 5.608170823483352e-06, + "loss": 0.3085, + "step": 16458 + }, + { + "epoch": 0.48, + "grad_norm": 1.4253114454033824, + "learning_rate": 5.607704600728846e-06, + "loss": 0.314, + "step": 16459 + }, + { + "epoch": 0.48, + "grad_norm": 1.3904482229003756, + "learning_rate": 5.607238372611341e-06, + "loss": 0.3008, + "step": 16460 + }, + { + "epoch": 0.48, + "grad_norm": 1.2277111464884911, + "learning_rate": 5.606772139134952e-06, + "loss": 0.3025, + "step": 16461 + }, + { + "epoch": 0.48, + "grad_norm": 1.287209313791566, + "learning_rate": 5.60630590030379e-06, + "loss": 0.3082, + "step": 16462 + }, + { + "epoch": 0.48, + "grad_norm": 1.3340979813178804, + "learning_rate": 5.605839656121973e-06, + "loss": 0.3291, + "step": 16463 + }, + { + "epoch": 0.48, + "grad_norm": 1.3928057907320817, + "learning_rate": 5.6053734065936134e-06, + "loss": 0.3271, + "step": 16464 + }, + { + "epoch": 0.48, + "grad_norm": 1.319563587035885, + "learning_rate": 5.604907151722827e-06, + "loss": 0.3059, + "step": 16465 + }, + { + "epoch": 0.48, + "grad_norm": 1.2553725669077593, + "learning_rate": 5.604440891513729e-06, + "loss": 0.31, + "step": 16466 + }, + { + "epoch": 0.48, + "grad_norm": 1.8741364845625488, + "learning_rate": 5.603974625970433e-06, + "loss": 0.296, + "step": 16467 + }, + { + "epoch": 0.48, + "grad_norm": 1.3703643223934396, + "learning_rate": 5.603508355097054e-06, + "loss": 0.3494, + "step": 16468 + }, + { + "epoch": 0.48, + "grad_norm": 1.2285306246990289, + "learning_rate": 5.603042078897707e-06, + "loss": 0.3102, + "step": 16469 + }, + { + "epoch": 0.48, + "grad_norm": 1.182191989982131, + "learning_rate": 5.602575797376508e-06, + "loss": 0.3076, + "step": 16470 + }, + { + "epoch": 0.48, + "grad_norm": 1.3600595780727425, + "learning_rate": 5.602109510537571e-06, + "loss": 0.3077, + "step": 16471 + }, + { + "epoch": 0.48, + "grad_norm": 1.6770947556840137, + "learning_rate": 5.60164321838501e-06, + "loss": 0.3252, + "step": 16472 + }, + { + "epoch": 0.48, + "grad_norm": 1.0280852411920351, + "learning_rate": 5.601176920922941e-06, + "loss": 0.6294, + "step": 16473 + }, + { + "epoch": 0.48, + "grad_norm": 1.2468032265019118, + "learning_rate": 5.600710618155479e-06, + "loss": 0.3068, + "step": 16474 + }, + { + "epoch": 0.48, + "grad_norm": 1.465007614743703, + "learning_rate": 5.60024431008674e-06, + "loss": 0.3018, + "step": 16475 + }, + { + "epoch": 0.48, + "grad_norm": 2.344822422083272, + "learning_rate": 5.5997779967208364e-06, + "loss": 0.2904, + "step": 16476 + }, + { + "epoch": 0.48, + "grad_norm": 1.3919761926725116, + "learning_rate": 5.599311678061886e-06, + "loss": 0.3025, + "step": 16477 + }, + { + "epoch": 0.48, + "grad_norm": 1.466546295633369, + "learning_rate": 5.598845354114004e-06, + "loss": 0.32, + "step": 16478 + }, + { + "epoch": 0.48, + "grad_norm": 1.2685597675958047, + "learning_rate": 5.598379024881305e-06, + "loss": 0.324, + "step": 16479 + }, + { + "epoch": 0.48, + "grad_norm": 0.9180774100701287, + "learning_rate": 5.597912690367905e-06, + "loss": 0.5926, + "step": 16480 + }, + { + "epoch": 0.48, + "grad_norm": 1.49561446962458, + "learning_rate": 5.597446350577918e-06, + "loss": 0.3305, + "step": 16481 + }, + { + "epoch": 0.48, + "grad_norm": 1.970252298898328, + "learning_rate": 5.59698000551546e-06, + "loss": 0.3075, + "step": 16482 + }, + { + "epoch": 0.48, + "grad_norm": 0.9747218493214794, + "learning_rate": 5.596513655184647e-06, + "loss": 0.6541, + "step": 16483 + }, + { + "epoch": 0.48, + "grad_norm": 1.4469382130416921, + "learning_rate": 5.596047299589595e-06, + "loss": 0.3148, + "step": 16484 + }, + { + "epoch": 0.48, + "grad_norm": 1.4951508823084732, + "learning_rate": 5.5955809387344194e-06, + "loss": 0.3036, + "step": 16485 + }, + { + "epoch": 0.48, + "grad_norm": 1.1932928186024558, + "learning_rate": 5.595114572623235e-06, + "loss": 0.3025, + "step": 16486 + }, + { + "epoch": 0.48, + "grad_norm": 1.4220291634937283, + "learning_rate": 5.594648201260158e-06, + "loss": 0.3022, + "step": 16487 + }, + { + "epoch": 0.48, + "grad_norm": 1.4050899175239944, + "learning_rate": 5.594181824649303e-06, + "loss": 0.3057, + "step": 16488 + }, + { + "epoch": 0.48, + "grad_norm": 1.5164353724895732, + "learning_rate": 5.593715442794788e-06, + "loss": 0.3336, + "step": 16489 + }, + { + "epoch": 0.48, + "grad_norm": 1.2489985606802338, + "learning_rate": 5.5932490557007265e-06, + "loss": 0.3031, + "step": 16490 + }, + { + "epoch": 0.48, + "grad_norm": 1.2515982387917035, + "learning_rate": 5.592782663371237e-06, + "loss": 0.3094, + "step": 16491 + }, + { + "epoch": 0.48, + "grad_norm": 2.006801959638243, + "learning_rate": 5.5923162658104325e-06, + "loss": 0.2929, + "step": 16492 + }, + { + "epoch": 0.48, + "grad_norm": 1.7035242823086285, + "learning_rate": 5.591849863022432e-06, + "loss": 0.3003, + "step": 16493 + }, + { + "epoch": 0.48, + "grad_norm": 1.3425216663820174, + "learning_rate": 5.591383455011348e-06, + "loss": 0.3155, + "step": 16494 + }, + { + "epoch": 0.48, + "grad_norm": 1.3968205517652295, + "learning_rate": 5.5909170417812985e-06, + "loss": 0.3021, + "step": 16495 + }, + { + "epoch": 0.48, + "grad_norm": 1.6513725014091492, + "learning_rate": 5.590450623336399e-06, + "loss": 0.328, + "step": 16496 + }, + { + "epoch": 0.48, + "grad_norm": 1.3711064436157543, + "learning_rate": 5.5899841996807695e-06, + "loss": 0.3268, + "step": 16497 + }, + { + "epoch": 0.48, + "grad_norm": 1.3277909895575117, + "learning_rate": 5.58951777081852e-06, + "loss": 0.3264, + "step": 16498 + }, + { + "epoch": 0.48, + "grad_norm": 1.6086130855709446, + "learning_rate": 5.589051336753771e-06, + "loss": 0.3194, + "step": 16499 + }, + { + "epoch": 0.48, + "grad_norm": 1.5220361911609561, + "learning_rate": 5.588584897490638e-06, + "loss": 0.3012, + "step": 16500 + }, + { + "epoch": 0.48, + "grad_norm": 1.2885194909511843, + "learning_rate": 5.588118453033234e-06, + "loss": 0.3021, + "step": 16501 + }, + { + "epoch": 0.48, + "grad_norm": 1.3431481002881847, + "learning_rate": 5.587652003385681e-06, + "loss": 0.3371, + "step": 16502 + }, + { + "epoch": 0.48, + "grad_norm": 1.2566059421371079, + "learning_rate": 5.587185548552092e-06, + "loss": 0.3243, + "step": 16503 + }, + { + "epoch": 0.48, + "grad_norm": 1.3232198385590832, + "learning_rate": 5.586719088536583e-06, + "loss": 0.2996, + "step": 16504 + }, + { + "epoch": 0.48, + "grad_norm": 1.3355660059960883, + "learning_rate": 5.586252623343271e-06, + "loss": 0.3152, + "step": 16505 + }, + { + "epoch": 0.48, + "grad_norm": 1.2975689209309487, + "learning_rate": 5.585786152976274e-06, + "loss": 0.3245, + "step": 16506 + }, + { + "epoch": 0.48, + "grad_norm": 1.346146673345647, + "learning_rate": 5.585319677439709e-06, + "loss": 0.3227, + "step": 16507 + }, + { + "epoch": 0.48, + "grad_norm": 1.3498924175384341, + "learning_rate": 5.584853196737689e-06, + "loss": 0.2859, + "step": 16508 + }, + { + "epoch": 0.48, + "grad_norm": 1.254584478377825, + "learning_rate": 5.584386710874334e-06, + "loss": 0.3149, + "step": 16509 + }, + { + "epoch": 0.48, + "grad_norm": 1.402439468025692, + "learning_rate": 5.58392021985376e-06, + "loss": 0.3093, + "step": 16510 + }, + { + "epoch": 0.48, + "grad_norm": 1.4119907081551193, + "learning_rate": 5.5834537236800835e-06, + "loss": 0.3096, + "step": 16511 + }, + { + "epoch": 0.48, + "grad_norm": 1.2653679540251483, + "learning_rate": 5.582987222357421e-06, + "loss": 0.3226, + "step": 16512 + }, + { + "epoch": 0.48, + "grad_norm": 1.2236246320530397, + "learning_rate": 5.582520715889892e-06, + "loss": 0.3098, + "step": 16513 + }, + { + "epoch": 0.48, + "grad_norm": 1.290784687977533, + "learning_rate": 5.582054204281609e-06, + "loss": 0.312, + "step": 16514 + }, + { + "epoch": 0.48, + "grad_norm": 1.6656061409972194, + "learning_rate": 5.581587687536691e-06, + "loss": 0.3092, + "step": 16515 + }, + { + "epoch": 0.48, + "grad_norm": 1.2642632433101912, + "learning_rate": 5.581121165659257e-06, + "loss": 0.3081, + "step": 16516 + }, + { + "epoch": 0.48, + "grad_norm": 1.2167618417409016, + "learning_rate": 5.58065463865342e-06, + "loss": 0.3091, + "step": 16517 + }, + { + "epoch": 0.48, + "grad_norm": 1.2778046674461074, + "learning_rate": 5.580188106523301e-06, + "loss": 0.3137, + "step": 16518 + }, + { + "epoch": 0.48, + "grad_norm": 1.4241844140589721, + "learning_rate": 5.579721569273015e-06, + "loss": 0.2894, + "step": 16519 + }, + { + "epoch": 0.48, + "grad_norm": 1.262731701657369, + "learning_rate": 5.5792550269066795e-06, + "loss": 0.3063, + "step": 16520 + }, + { + "epoch": 0.48, + "grad_norm": 1.2585040331227997, + "learning_rate": 5.5787884794284116e-06, + "loss": 0.2961, + "step": 16521 + }, + { + "epoch": 0.48, + "grad_norm": 1.5430706601435364, + "learning_rate": 5.578321926842329e-06, + "loss": 0.3282, + "step": 16522 + }, + { + "epoch": 0.48, + "grad_norm": 1.3757521661697765, + "learning_rate": 5.57785536915255e-06, + "loss": 0.3138, + "step": 16523 + }, + { + "epoch": 0.48, + "grad_norm": 1.718475722435777, + "learning_rate": 5.5773888063631895e-06, + "loss": 0.2983, + "step": 16524 + }, + { + "epoch": 0.48, + "grad_norm": 1.2492506919612714, + "learning_rate": 5.576922238478368e-06, + "loss": 0.3049, + "step": 16525 + }, + { + "epoch": 0.48, + "grad_norm": 1.3581842885923174, + "learning_rate": 5.576455665502202e-06, + "loss": 0.3149, + "step": 16526 + }, + { + "epoch": 0.48, + "grad_norm": 1.3223734407166656, + "learning_rate": 5.575989087438807e-06, + "loss": 0.3034, + "step": 16527 + }, + { + "epoch": 0.48, + "grad_norm": 1.2911129526087037, + "learning_rate": 5.575522504292302e-06, + "loss": 0.2923, + "step": 16528 + }, + { + "epoch": 0.48, + "grad_norm": 1.3824215760432181, + "learning_rate": 5.575055916066804e-06, + "loss": 0.3238, + "step": 16529 + }, + { + "epoch": 0.48, + "grad_norm": 1.2909188422280047, + "learning_rate": 5.574589322766433e-06, + "loss": 0.2972, + "step": 16530 + }, + { + "epoch": 0.48, + "grad_norm": 1.2074479709061952, + "learning_rate": 5.574122724395302e-06, + "loss": 0.2954, + "step": 16531 + }, + { + "epoch": 0.48, + "grad_norm": 1.2620740553678562, + "learning_rate": 5.573656120957533e-06, + "loss": 0.3184, + "step": 16532 + }, + { + "epoch": 0.48, + "grad_norm": 1.3405124444880034, + "learning_rate": 5.573189512457242e-06, + "loss": 0.3062, + "step": 16533 + }, + { + "epoch": 0.48, + "grad_norm": 1.3992915088802644, + "learning_rate": 5.572722898898549e-06, + "loss": 0.3181, + "step": 16534 + }, + { + "epoch": 0.48, + "grad_norm": 2.210965633902141, + "learning_rate": 5.5722562802855685e-06, + "loss": 0.3152, + "step": 16535 + }, + { + "epoch": 0.48, + "grad_norm": 0.8964281915021449, + "learning_rate": 5.57178965662242e-06, + "loss": 0.5667, + "step": 16536 + }, + { + "epoch": 0.48, + "grad_norm": 1.186620086862245, + "learning_rate": 5.571323027913221e-06, + "loss": 0.2904, + "step": 16537 + }, + { + "epoch": 0.48, + "grad_norm": 1.2648482139125716, + "learning_rate": 5.570856394162089e-06, + "loss": 0.3109, + "step": 16538 + }, + { + "epoch": 0.48, + "grad_norm": 1.282514117454389, + "learning_rate": 5.570389755373147e-06, + "loss": 0.3571, + "step": 16539 + }, + { + "epoch": 0.48, + "grad_norm": 1.4012294650873263, + "learning_rate": 5.569923111550506e-06, + "loss": 0.3242, + "step": 16540 + }, + { + "epoch": 0.48, + "grad_norm": 1.3508694200756408, + "learning_rate": 5.5694564626982885e-06, + "loss": 0.3054, + "step": 16541 + }, + { + "epoch": 0.48, + "grad_norm": 1.4229698045023216, + "learning_rate": 5.5689898088206104e-06, + "loss": 0.3134, + "step": 16542 + }, + { + "epoch": 0.48, + "grad_norm": 1.2293783261942868, + "learning_rate": 5.568523149921591e-06, + "loss": 0.3101, + "step": 16543 + }, + { + "epoch": 0.48, + "grad_norm": 1.7411093380729097, + "learning_rate": 5.56805648600535e-06, + "loss": 0.297, + "step": 16544 + }, + { + "epoch": 0.48, + "grad_norm": 1.4145206757980697, + "learning_rate": 5.567589817076002e-06, + "loss": 0.3089, + "step": 16545 + }, + { + "epoch": 0.48, + "grad_norm": 1.442466099347978, + "learning_rate": 5.5671231431376685e-06, + "loss": 0.3009, + "step": 16546 + }, + { + "epoch": 0.48, + "grad_norm": 1.5057591061385471, + "learning_rate": 5.566656464194466e-06, + "loss": 0.3423, + "step": 16547 + }, + { + "epoch": 0.48, + "grad_norm": 2.32621126578914, + "learning_rate": 5.566189780250516e-06, + "loss": 0.3124, + "step": 16548 + }, + { + "epoch": 0.48, + "grad_norm": 1.4477887579692426, + "learning_rate": 5.5657230913099335e-06, + "loss": 0.3011, + "step": 16549 + }, + { + "epoch": 0.48, + "grad_norm": 1.2980577547061376, + "learning_rate": 5.565256397376839e-06, + "loss": 0.3092, + "step": 16550 + }, + { + "epoch": 0.48, + "grad_norm": 1.57364066852208, + "learning_rate": 5.56478969845535e-06, + "loss": 0.2856, + "step": 16551 + }, + { + "epoch": 0.48, + "grad_norm": 1.6019356721387925, + "learning_rate": 5.564322994549587e-06, + "loss": 0.3202, + "step": 16552 + }, + { + "epoch": 0.48, + "grad_norm": 1.4773048821528152, + "learning_rate": 5.5638562856636655e-06, + "loss": 0.3155, + "step": 16553 + }, + { + "epoch": 0.48, + "grad_norm": 1.4731481930869392, + "learning_rate": 5.563389571801706e-06, + "loss": 0.3182, + "step": 16554 + }, + { + "epoch": 0.48, + "grad_norm": 1.990373989192094, + "learning_rate": 5.562922852967829e-06, + "loss": 0.3067, + "step": 16555 + }, + { + "epoch": 0.48, + "grad_norm": 1.5675420387098224, + "learning_rate": 5.562456129166151e-06, + "loss": 0.3345, + "step": 16556 + }, + { + "epoch": 0.48, + "grad_norm": 1.3421107878491763, + "learning_rate": 5.5619894004007905e-06, + "loss": 0.3126, + "step": 16557 + }, + { + "epoch": 0.48, + "grad_norm": 1.7987293234117705, + "learning_rate": 5.5615226666758695e-06, + "loss": 0.3187, + "step": 16558 + }, + { + "epoch": 0.48, + "grad_norm": 2.2044603380678724, + "learning_rate": 5.5610559279955025e-06, + "loss": 0.3038, + "step": 16559 + }, + { + "epoch": 0.48, + "grad_norm": 1.4220821943895183, + "learning_rate": 5.56058918436381e-06, + "loss": 0.3386, + "step": 16560 + }, + { + "epoch": 0.48, + "grad_norm": 1.2311961930720554, + "learning_rate": 5.560122435784914e-06, + "loss": 0.304, + "step": 16561 + }, + { + "epoch": 0.48, + "grad_norm": 1.242762923971092, + "learning_rate": 5.55965568226293e-06, + "loss": 0.2978, + "step": 16562 + }, + { + "epoch": 0.48, + "grad_norm": 1.3522284540170055, + "learning_rate": 5.559188923801979e-06, + "loss": 0.3047, + "step": 16563 + }, + { + "epoch": 0.48, + "grad_norm": 1.624200337815381, + "learning_rate": 5.558722160406179e-06, + "loss": 0.3529, + "step": 16564 + }, + { + "epoch": 0.48, + "grad_norm": 1.3703492948288345, + "learning_rate": 5.55825539207965e-06, + "loss": 0.3226, + "step": 16565 + }, + { + "epoch": 0.48, + "grad_norm": 1.4042791552411626, + "learning_rate": 5.5577886188265114e-06, + "loss": 0.3291, + "step": 16566 + }, + { + "epoch": 0.48, + "grad_norm": 1.416473323420339, + "learning_rate": 5.557321840650881e-06, + "loss": 0.3066, + "step": 16567 + }, + { + "epoch": 0.48, + "grad_norm": 1.2354262561796314, + "learning_rate": 5.55685505755688e-06, + "loss": 0.3183, + "step": 16568 + }, + { + "epoch": 0.48, + "grad_norm": 1.595766388769092, + "learning_rate": 5.556388269548627e-06, + "loss": 0.3078, + "step": 16569 + }, + { + "epoch": 0.48, + "grad_norm": 1.2667337699299788, + "learning_rate": 5.555921476630242e-06, + "loss": 0.3125, + "step": 16570 + }, + { + "epoch": 0.48, + "grad_norm": 1.209663993513312, + "learning_rate": 5.555454678805843e-06, + "loss": 0.2961, + "step": 16571 + }, + { + "epoch": 0.48, + "grad_norm": 1.4750936452390617, + "learning_rate": 5.55498787607955e-06, + "loss": 0.3034, + "step": 16572 + }, + { + "epoch": 0.48, + "grad_norm": 1.5833014570799793, + "learning_rate": 5.554521068455483e-06, + "loss": 0.3081, + "step": 16573 + }, + { + "epoch": 0.48, + "grad_norm": 1.4159936208717252, + "learning_rate": 5.554054255937761e-06, + "loss": 0.3299, + "step": 16574 + }, + { + "epoch": 0.48, + "grad_norm": 2.043984775415299, + "learning_rate": 5.5535874385305035e-06, + "loss": 0.2999, + "step": 16575 + }, + { + "epoch": 0.48, + "grad_norm": 1.3735797566895251, + "learning_rate": 5.553120616237833e-06, + "loss": 0.3193, + "step": 16576 + }, + { + "epoch": 0.48, + "grad_norm": 1.2022881461576036, + "learning_rate": 5.552653789063866e-06, + "loss": 0.2901, + "step": 16577 + }, + { + "epoch": 0.48, + "grad_norm": 1.0270270454269643, + "learning_rate": 5.5521869570127216e-06, + "loss": 0.5428, + "step": 16578 + }, + { + "epoch": 0.48, + "grad_norm": 1.327568272026878, + "learning_rate": 5.551720120088522e-06, + "loss": 0.3013, + "step": 16579 + }, + { + "epoch": 0.48, + "grad_norm": 1.706524006023776, + "learning_rate": 5.551253278295387e-06, + "loss": 0.3238, + "step": 16580 + }, + { + "epoch": 0.48, + "grad_norm": 1.287568027122159, + "learning_rate": 5.550786431637436e-06, + "loss": 0.3073, + "step": 16581 + }, + { + "epoch": 0.48, + "grad_norm": 1.288148533771402, + "learning_rate": 5.550319580118787e-06, + "loss": 0.3027, + "step": 16582 + }, + { + "epoch": 0.48, + "grad_norm": 1.5129308753008126, + "learning_rate": 5.549852723743564e-06, + "loss": 0.325, + "step": 16583 + }, + { + "epoch": 0.48, + "grad_norm": 1.3049085378911294, + "learning_rate": 5.549385862515882e-06, + "loss": 0.3091, + "step": 16584 + }, + { + "epoch": 0.48, + "grad_norm": 1.355452348626271, + "learning_rate": 5.548918996439866e-06, + "loss": 0.3049, + "step": 16585 + }, + { + "epoch": 0.48, + "grad_norm": 1.530537214450207, + "learning_rate": 5.548452125519632e-06, + "loss": 0.2988, + "step": 16586 + }, + { + "epoch": 0.48, + "grad_norm": 1.4560393101325906, + "learning_rate": 5.547985249759303e-06, + "loss": 0.3014, + "step": 16587 + }, + { + "epoch": 0.48, + "grad_norm": 1.423344859307034, + "learning_rate": 5.547518369162996e-06, + "loss": 0.3167, + "step": 16588 + }, + { + "epoch": 0.48, + "grad_norm": 1.3129678311924176, + "learning_rate": 5.547051483734837e-06, + "loss": 0.3111, + "step": 16589 + }, + { + "epoch": 0.48, + "grad_norm": 1.9031803221537076, + "learning_rate": 5.546584593478939e-06, + "loss": 0.3276, + "step": 16590 + }, + { + "epoch": 0.48, + "grad_norm": 1.4709973219884034, + "learning_rate": 5.546117698399428e-06, + "loss": 0.3371, + "step": 16591 + }, + { + "epoch": 0.48, + "grad_norm": 1.220994943939208, + "learning_rate": 5.545650798500422e-06, + "loss": 0.3148, + "step": 16592 + }, + { + "epoch": 0.48, + "grad_norm": 1.7608670136862568, + "learning_rate": 5.54518389378604e-06, + "loss": 0.3015, + "step": 16593 + }, + { + "epoch": 0.48, + "grad_norm": 1.6657757600366, + "learning_rate": 5.544716984260406e-06, + "loss": 0.3001, + "step": 16594 + }, + { + "epoch": 0.48, + "grad_norm": 2.880750692595555, + "learning_rate": 5.544250069927638e-06, + "loss": 0.3303, + "step": 16595 + }, + { + "epoch": 0.48, + "grad_norm": 1.3407873545441744, + "learning_rate": 5.5437831507918575e-06, + "loss": 0.3114, + "step": 16596 + }, + { + "epoch": 0.48, + "grad_norm": 1.6009562988609256, + "learning_rate": 5.543316226857185e-06, + "loss": 0.3333, + "step": 16597 + }, + { + "epoch": 0.48, + "grad_norm": 1.5005767706483775, + "learning_rate": 5.542849298127739e-06, + "loss": 0.2897, + "step": 16598 + }, + { + "epoch": 0.48, + "grad_norm": 1.2576281772784228, + "learning_rate": 5.542382364607643e-06, + "loss": 0.312, + "step": 16599 + }, + { + "epoch": 0.48, + "grad_norm": 1.2848305574671068, + "learning_rate": 5.541915426301017e-06, + "loss": 0.2964, + "step": 16600 + }, + { + "epoch": 0.48, + "grad_norm": 1.9200253170734272, + "learning_rate": 5.541448483211981e-06, + "loss": 0.2785, + "step": 16601 + }, + { + "epoch": 0.48, + "grad_norm": 1.390645630321144, + "learning_rate": 5.540981535344655e-06, + "loss": 0.3125, + "step": 16602 + }, + { + "epoch": 0.48, + "grad_norm": 1.2900031505093212, + "learning_rate": 5.540514582703162e-06, + "loss": 0.2977, + "step": 16603 + }, + { + "epoch": 0.48, + "grad_norm": 0.9602453367906687, + "learning_rate": 5.540047625291622e-06, + "loss": 0.6218, + "step": 16604 + }, + { + "epoch": 0.48, + "grad_norm": 2.0957523649540253, + "learning_rate": 5.539580663114156e-06, + "loss": 0.3142, + "step": 16605 + }, + { + "epoch": 0.48, + "grad_norm": 1.2967159714414889, + "learning_rate": 5.539113696174884e-06, + "loss": 0.3145, + "step": 16606 + }, + { + "epoch": 0.48, + "grad_norm": 1.2878493038654117, + "learning_rate": 5.5386467244779285e-06, + "loss": 0.3117, + "step": 16607 + }, + { + "epoch": 0.48, + "grad_norm": 1.6727039837422553, + "learning_rate": 5.538179748027408e-06, + "loss": 0.3133, + "step": 16608 + }, + { + "epoch": 0.48, + "grad_norm": 1.7937828133734925, + "learning_rate": 5.5377127668274465e-06, + "loss": 0.314, + "step": 16609 + }, + { + "epoch": 0.48, + "grad_norm": 0.9735523654816151, + "learning_rate": 5.537245780882163e-06, + "loss": 0.6416, + "step": 16610 + }, + { + "epoch": 0.48, + "grad_norm": 0.9318215886897636, + "learning_rate": 5.536778790195681e-06, + "loss": 0.6104, + "step": 16611 + }, + { + "epoch": 0.48, + "grad_norm": 1.4938507908873946, + "learning_rate": 5.53631179477212e-06, + "loss": 0.3183, + "step": 16612 + }, + { + "epoch": 0.48, + "grad_norm": 1.2052866647232257, + "learning_rate": 5.535844794615602e-06, + "loss": 0.3087, + "step": 16613 + }, + { + "epoch": 0.48, + "grad_norm": 1.276620719442816, + "learning_rate": 5.535377789730245e-06, + "loss": 0.3016, + "step": 16614 + }, + { + "epoch": 0.48, + "grad_norm": 0.9327891340912484, + "learning_rate": 5.534910780120174e-06, + "loss": 0.611, + "step": 16615 + }, + { + "epoch": 0.48, + "grad_norm": 1.7568140598615867, + "learning_rate": 5.53444376578951e-06, + "loss": 0.3178, + "step": 16616 + }, + { + "epoch": 0.48, + "grad_norm": 1.3099577322566829, + "learning_rate": 5.533976746742374e-06, + "loss": 0.3178, + "step": 16617 + }, + { + "epoch": 0.48, + "grad_norm": 1.3982369736989866, + "learning_rate": 5.5335097229828885e-06, + "loss": 0.3202, + "step": 16618 + }, + { + "epoch": 0.48, + "grad_norm": 1.2611044321749618, + "learning_rate": 5.533042694515172e-06, + "loss": 0.3035, + "step": 16619 + }, + { + "epoch": 0.48, + "grad_norm": 1.328651130352779, + "learning_rate": 5.532575661343346e-06, + "loss": 0.3019, + "step": 16620 + }, + { + "epoch": 0.48, + "grad_norm": 1.248664618822627, + "learning_rate": 5.5321086234715386e-06, + "loss": 0.3016, + "step": 16621 + }, + { + "epoch": 0.48, + "grad_norm": 1.3473172526581376, + "learning_rate": 5.531641580903864e-06, + "loss": 0.3187, + "step": 16622 + }, + { + "epoch": 0.48, + "grad_norm": 1.960459131406598, + "learning_rate": 5.531174533644446e-06, + "loss": 0.3058, + "step": 16623 + }, + { + "epoch": 0.48, + "grad_norm": 1.5334478366787265, + "learning_rate": 5.530707481697407e-06, + "loss": 0.3166, + "step": 16624 + }, + { + "epoch": 0.48, + "grad_norm": 1.2453014087313004, + "learning_rate": 5.53024042506687e-06, + "loss": 0.3075, + "step": 16625 + }, + { + "epoch": 0.48, + "grad_norm": 1.2655966978227757, + "learning_rate": 5.529773363756956e-06, + "loss": 0.2786, + "step": 16626 + }, + { + "epoch": 0.48, + "grad_norm": 1.4429771246063594, + "learning_rate": 5.529306297771783e-06, + "loss": 0.3095, + "step": 16627 + }, + { + "epoch": 0.48, + "grad_norm": 1.2861621545955175, + "learning_rate": 5.528839227115479e-06, + "loss": 0.3053, + "step": 16628 + }, + { + "epoch": 0.48, + "grad_norm": 1.3046424626391744, + "learning_rate": 5.528372151792161e-06, + "loss": 0.3168, + "step": 16629 + }, + { + "epoch": 0.48, + "grad_norm": 1.368738523634015, + "learning_rate": 5.527905071805955e-06, + "loss": 0.3217, + "step": 16630 + }, + { + "epoch": 0.48, + "grad_norm": 1.2879205694316913, + "learning_rate": 5.52743798716098e-06, + "loss": 0.3366, + "step": 16631 + }, + { + "epoch": 0.48, + "grad_norm": 1.4077678217664726, + "learning_rate": 5.526970897861357e-06, + "loss": 0.3199, + "step": 16632 + }, + { + "epoch": 0.48, + "grad_norm": 1.258116939173495, + "learning_rate": 5.526503803911212e-06, + "loss": 0.3136, + "step": 16633 + }, + { + "epoch": 0.48, + "grad_norm": 1.3022470464361349, + "learning_rate": 5.526036705314663e-06, + "loss": 0.3338, + "step": 16634 + }, + { + "epoch": 0.48, + "grad_norm": 1.3895694383843382, + "learning_rate": 5.525569602075838e-06, + "loss": 0.3322, + "step": 16635 + }, + { + "epoch": 0.48, + "grad_norm": 1.3441413764714374, + "learning_rate": 5.525102494198852e-06, + "loss": 0.3181, + "step": 16636 + }, + { + "epoch": 0.48, + "grad_norm": 1.3293212527390934, + "learning_rate": 5.524635381687831e-06, + "loss": 0.3047, + "step": 16637 + }, + { + "epoch": 0.48, + "grad_norm": 1.4786847028892478, + "learning_rate": 5.524168264546898e-06, + "loss": 0.3219, + "step": 16638 + }, + { + "epoch": 0.48, + "grad_norm": 1.016707740167655, + "learning_rate": 5.523701142780176e-06, + "loss": 0.5531, + "step": 16639 + }, + { + "epoch": 0.48, + "grad_norm": 1.614135178832004, + "learning_rate": 5.523234016391783e-06, + "loss": 0.3231, + "step": 16640 + }, + { + "epoch": 0.48, + "grad_norm": 1.2667433157556938, + "learning_rate": 5.5227668853858455e-06, + "loss": 0.3123, + "step": 16641 + }, + { + "epoch": 0.48, + "grad_norm": 1.2915856740083071, + "learning_rate": 5.522299749766483e-06, + "loss": 0.3261, + "step": 16642 + }, + { + "epoch": 0.48, + "grad_norm": 1.254368121362118, + "learning_rate": 5.52183260953782e-06, + "loss": 0.315, + "step": 16643 + }, + { + "epoch": 0.48, + "grad_norm": 1.2384094301785693, + "learning_rate": 5.521365464703979e-06, + "loss": 0.3165, + "step": 16644 + }, + { + "epoch": 0.48, + "grad_norm": 1.3434962352164292, + "learning_rate": 5.520898315269081e-06, + "loss": 0.3257, + "step": 16645 + }, + { + "epoch": 0.48, + "grad_norm": 1.7734367155388029, + "learning_rate": 5.520431161237249e-06, + "loss": 0.2874, + "step": 16646 + }, + { + "epoch": 0.48, + "grad_norm": 1.3994019485968443, + "learning_rate": 5.519964002612607e-06, + "loss": 0.32, + "step": 16647 + }, + { + "epoch": 0.48, + "grad_norm": 1.2917295136005635, + "learning_rate": 5.519496839399279e-06, + "loss": 0.3107, + "step": 16648 + }, + { + "epoch": 0.48, + "grad_norm": 1.8245711098853508, + "learning_rate": 5.519029671601384e-06, + "loss": 0.3059, + "step": 16649 + }, + { + "epoch": 0.48, + "grad_norm": 1.7718031424805933, + "learning_rate": 5.5185624992230445e-06, + "loss": 0.3243, + "step": 16650 + }, + { + "epoch": 0.48, + "grad_norm": 1.3482864282316658, + "learning_rate": 5.518095322268388e-06, + "loss": 0.3009, + "step": 16651 + }, + { + "epoch": 0.48, + "grad_norm": 0.9390151329246611, + "learning_rate": 5.517628140741532e-06, + "loss": 0.6096, + "step": 16652 + }, + { + "epoch": 0.48, + "grad_norm": 1.3357456240086647, + "learning_rate": 5.5171609546466045e-06, + "loss": 0.3159, + "step": 16653 + }, + { + "epoch": 0.48, + "grad_norm": 1.3953980756147233, + "learning_rate": 5.516693763987725e-06, + "loss": 0.3265, + "step": 16654 + }, + { + "epoch": 0.48, + "grad_norm": 0.9521747586013777, + "learning_rate": 5.516226568769017e-06, + "loss": 0.5801, + "step": 16655 + }, + { + "epoch": 0.48, + "grad_norm": 1.2995513727072925, + "learning_rate": 5.515759368994603e-06, + "loss": 0.2936, + "step": 16656 + }, + { + "epoch": 0.48, + "grad_norm": 1.400248439910737, + "learning_rate": 5.5152921646686075e-06, + "loss": 0.3442, + "step": 16657 + }, + { + "epoch": 0.48, + "grad_norm": 1.4010015351117062, + "learning_rate": 5.514824955795154e-06, + "loss": 0.3093, + "step": 16658 + }, + { + "epoch": 0.48, + "grad_norm": 1.3994558225865539, + "learning_rate": 5.514357742378363e-06, + "loss": 0.3092, + "step": 16659 + }, + { + "epoch": 0.48, + "grad_norm": 1.4671971691398742, + "learning_rate": 5.513890524422359e-06, + "loss": 0.3058, + "step": 16660 + }, + { + "epoch": 0.48, + "grad_norm": 1.3208906708152741, + "learning_rate": 5.513423301931266e-06, + "loss": 0.3025, + "step": 16661 + }, + { + "epoch": 0.48, + "grad_norm": 1.424575792920389, + "learning_rate": 5.512956074909208e-06, + "loss": 0.3109, + "step": 16662 + }, + { + "epoch": 0.48, + "grad_norm": 1.4717643237672653, + "learning_rate": 5.512488843360306e-06, + "loss": 0.3282, + "step": 16663 + }, + { + "epoch": 0.48, + "grad_norm": 1.2830925604071808, + "learning_rate": 5.512021607288684e-06, + "loss": 0.2999, + "step": 16664 + }, + { + "epoch": 0.48, + "grad_norm": 1.2681234761643492, + "learning_rate": 5.511554366698465e-06, + "loss": 0.2935, + "step": 16665 + }, + { + "epoch": 0.48, + "grad_norm": 1.375686337708998, + "learning_rate": 5.511087121593776e-06, + "loss": 0.3057, + "step": 16666 + }, + { + "epoch": 0.48, + "grad_norm": 1.499868577015606, + "learning_rate": 5.510619871978734e-06, + "loss": 0.3205, + "step": 16667 + }, + { + "epoch": 0.48, + "grad_norm": 1.3319418236572094, + "learning_rate": 5.510152617857468e-06, + "loss": 0.3205, + "step": 16668 + }, + { + "epoch": 0.48, + "grad_norm": 1.2873982445409036, + "learning_rate": 5.509685359234099e-06, + "loss": 0.2977, + "step": 16669 + }, + { + "epoch": 0.48, + "grad_norm": 1.2962792296478023, + "learning_rate": 5.50921809611275e-06, + "loss": 0.3451, + "step": 16670 + }, + { + "epoch": 0.48, + "grad_norm": 1.3987310491306455, + "learning_rate": 5.508750828497547e-06, + "loss": 0.2849, + "step": 16671 + }, + { + "epoch": 0.48, + "grad_norm": 1.407910279712589, + "learning_rate": 5.508283556392612e-06, + "loss": 0.3157, + "step": 16672 + }, + { + "epoch": 0.48, + "grad_norm": 1.5942610557796566, + "learning_rate": 5.5078162798020685e-06, + "loss": 0.331, + "step": 16673 + }, + { + "epoch": 0.48, + "grad_norm": 1.547239738908648, + "learning_rate": 5.50734899873004e-06, + "loss": 0.2997, + "step": 16674 + }, + { + "epoch": 0.48, + "grad_norm": 1.4942734016144694, + "learning_rate": 5.506881713180652e-06, + "loss": 0.329, + "step": 16675 + }, + { + "epoch": 0.48, + "grad_norm": 1.3469095829970488, + "learning_rate": 5.506414423158027e-06, + "loss": 0.2895, + "step": 16676 + }, + { + "epoch": 0.48, + "grad_norm": 1.281962295562624, + "learning_rate": 5.5059471286662905e-06, + "loss": 0.3334, + "step": 16677 + }, + { + "epoch": 0.48, + "grad_norm": 2.34370158301707, + "learning_rate": 5.5054798297095625e-06, + "loss": 0.3356, + "step": 16678 + }, + { + "epoch": 0.48, + "grad_norm": 1.497251863577511, + "learning_rate": 5.50501252629197e-06, + "loss": 0.3009, + "step": 16679 + }, + { + "epoch": 0.48, + "grad_norm": 2.61243354210092, + "learning_rate": 5.504545218417639e-06, + "loss": 0.3246, + "step": 16680 + }, + { + "epoch": 0.48, + "grad_norm": 1.432470002369788, + "learning_rate": 5.504077906090688e-06, + "loss": 0.3199, + "step": 16681 + }, + { + "epoch": 0.48, + "grad_norm": 1.509397380440751, + "learning_rate": 5.503610589315245e-06, + "loss": 0.298, + "step": 16682 + }, + { + "epoch": 0.48, + "grad_norm": 1.2893581543903279, + "learning_rate": 5.503143268095432e-06, + "loss": 0.2933, + "step": 16683 + }, + { + "epoch": 0.48, + "grad_norm": 1.32811516020822, + "learning_rate": 5.502675942435375e-06, + "loss": 0.3198, + "step": 16684 + }, + { + "epoch": 0.48, + "grad_norm": 1.3569053140746619, + "learning_rate": 5.5022086123391985e-06, + "loss": 0.3023, + "step": 16685 + }, + { + "epoch": 0.48, + "grad_norm": 1.4296450218540437, + "learning_rate": 5.501741277811022e-06, + "loss": 0.32, + "step": 16686 + }, + { + "epoch": 0.48, + "grad_norm": 1.2743975154720093, + "learning_rate": 5.501273938854975e-06, + "loss": 0.3081, + "step": 16687 + }, + { + "epoch": 0.48, + "grad_norm": 1.3151727993140019, + "learning_rate": 5.50080659547518e-06, + "loss": 0.2938, + "step": 16688 + }, + { + "epoch": 0.48, + "grad_norm": 0.9476189154924374, + "learning_rate": 5.5003392476757615e-06, + "loss": 0.6022, + "step": 16689 + }, + { + "epoch": 0.48, + "grad_norm": 1.5637860105565449, + "learning_rate": 5.499871895460844e-06, + "loss": 0.3339, + "step": 16690 + }, + { + "epoch": 0.48, + "grad_norm": 1.2729386064050754, + "learning_rate": 5.499404538834551e-06, + "loss": 0.296, + "step": 16691 + }, + { + "epoch": 0.48, + "grad_norm": 2.529233954285832, + "learning_rate": 5.498937177801006e-06, + "loss": 0.3192, + "step": 16692 + }, + { + "epoch": 0.48, + "grad_norm": 1.5252831598834142, + "learning_rate": 5.498469812364335e-06, + "loss": 0.2948, + "step": 16693 + }, + { + "epoch": 0.48, + "grad_norm": 1.7979182087159806, + "learning_rate": 5.4980024425286645e-06, + "loss": 0.3029, + "step": 16694 + }, + { + "epoch": 0.48, + "grad_norm": 1.6384409620818818, + "learning_rate": 5.497535068298117e-06, + "loss": 0.3247, + "step": 16695 + }, + { + "epoch": 0.48, + "grad_norm": 1.384751775608786, + "learning_rate": 5.497067689676816e-06, + "loss": 0.3099, + "step": 16696 + }, + { + "epoch": 0.48, + "grad_norm": 1.2481512927544218, + "learning_rate": 5.496600306668887e-06, + "loss": 0.299, + "step": 16697 + }, + { + "epoch": 0.48, + "grad_norm": 1.3765011192139693, + "learning_rate": 5.496132919278454e-06, + "loss": 0.3076, + "step": 16698 + }, + { + "epoch": 0.48, + "grad_norm": 1.5188391492562903, + "learning_rate": 5.495665527509643e-06, + "loss": 0.3145, + "step": 16699 + }, + { + "epoch": 0.48, + "grad_norm": 1.3885636102406609, + "learning_rate": 5.495198131366579e-06, + "loss": 0.31, + "step": 16700 + }, + { + "epoch": 0.48, + "grad_norm": 1.7015898928108268, + "learning_rate": 5.494730730853386e-06, + "loss": 0.3088, + "step": 16701 + }, + { + "epoch": 0.48, + "grad_norm": 1.5020984723537696, + "learning_rate": 5.494263325974188e-06, + "loss": 0.3166, + "step": 16702 + }, + { + "epoch": 0.48, + "grad_norm": 1.324208775512045, + "learning_rate": 5.493795916733111e-06, + "loss": 0.315, + "step": 16703 + }, + { + "epoch": 0.48, + "grad_norm": 1.2505018040566158, + "learning_rate": 5.4933285031342794e-06, + "loss": 0.3127, + "step": 16704 + }, + { + "epoch": 0.48, + "grad_norm": 1.3128503349125373, + "learning_rate": 5.492861085181818e-06, + "loss": 0.3211, + "step": 16705 + }, + { + "epoch": 0.48, + "grad_norm": 1.1775288131745956, + "learning_rate": 5.492393662879852e-06, + "loss": 0.2887, + "step": 16706 + }, + { + "epoch": 0.48, + "grad_norm": 1.3712268867596118, + "learning_rate": 5.491926236232507e-06, + "loss": 0.3148, + "step": 16707 + }, + { + "epoch": 0.48, + "grad_norm": 1.2867850494616506, + "learning_rate": 5.491458805243908e-06, + "loss": 0.3083, + "step": 16708 + }, + { + "epoch": 0.48, + "grad_norm": 1.8188143686597866, + "learning_rate": 5.490991369918179e-06, + "loss": 0.2998, + "step": 16709 + }, + { + "epoch": 0.48, + "grad_norm": 1.2318947234276572, + "learning_rate": 5.490523930259446e-06, + "loss": 0.3401, + "step": 16710 + }, + { + "epoch": 0.48, + "grad_norm": 1.354793199823872, + "learning_rate": 5.490056486271833e-06, + "loss": 0.3249, + "step": 16711 + }, + { + "epoch": 0.48, + "grad_norm": 1.2869040534519203, + "learning_rate": 5.489589037959466e-06, + "loss": 0.3048, + "step": 16712 + }, + { + "epoch": 0.48, + "grad_norm": 1.2346841423954114, + "learning_rate": 5.489121585326472e-06, + "loss": 0.2983, + "step": 16713 + }, + { + "epoch": 0.48, + "grad_norm": 1.34555647710245, + "learning_rate": 5.488654128376974e-06, + "loss": 0.2949, + "step": 16714 + }, + { + "epoch": 0.48, + "grad_norm": 1.312357213665408, + "learning_rate": 5.488186667115098e-06, + "loss": 0.3247, + "step": 16715 + }, + { + "epoch": 0.48, + "grad_norm": 1.3081075098732868, + "learning_rate": 5.487719201544968e-06, + "loss": 0.298, + "step": 16716 + }, + { + "epoch": 0.48, + "grad_norm": 1.4034555987632598, + "learning_rate": 5.4872517316707115e-06, + "loss": 0.3354, + "step": 16717 + }, + { + "epoch": 0.48, + "grad_norm": 1.247097541881721, + "learning_rate": 5.4867842574964525e-06, + "loss": 0.3007, + "step": 16718 + }, + { + "epoch": 0.48, + "grad_norm": 1.3723327955948976, + "learning_rate": 5.486316779026318e-06, + "loss": 0.3174, + "step": 16719 + }, + { + "epoch": 0.48, + "grad_norm": 1.4197670958764965, + "learning_rate": 5.48584929626443e-06, + "loss": 0.3183, + "step": 16720 + }, + { + "epoch": 0.48, + "grad_norm": 1.389528905785488, + "learning_rate": 5.485381809214921e-06, + "loss": 0.2812, + "step": 16721 + }, + { + "epoch": 0.49, + "grad_norm": 1.4477751158747267, + "learning_rate": 5.484914317881909e-06, + "loss": 0.338, + "step": 16722 + }, + { + "epoch": 0.49, + "grad_norm": 1.2709601320402097, + "learning_rate": 5.484446822269523e-06, + "loss": 0.3295, + "step": 16723 + }, + { + "epoch": 0.49, + "grad_norm": 1.3252116415042066, + "learning_rate": 5.4839793223818896e-06, + "loss": 0.3461, + "step": 16724 + }, + { + "epoch": 0.49, + "grad_norm": 1.2575849717444076, + "learning_rate": 5.483511818223132e-06, + "loss": 0.2962, + "step": 16725 + }, + { + "epoch": 0.49, + "grad_norm": 1.311694964089219, + "learning_rate": 5.483044309797378e-06, + "loss": 0.3088, + "step": 16726 + }, + { + "epoch": 0.49, + "grad_norm": 1.4222305084973763, + "learning_rate": 5.4825767971087525e-06, + "loss": 0.2884, + "step": 16727 + }, + { + "epoch": 0.49, + "grad_norm": 1.3580536457701782, + "learning_rate": 5.482109280161382e-06, + "loss": 0.3224, + "step": 16728 + }, + { + "epoch": 0.49, + "grad_norm": 1.364902366190565, + "learning_rate": 5.48164175895939e-06, + "loss": 0.3326, + "step": 16729 + }, + { + "epoch": 0.49, + "grad_norm": 1.2863893349841564, + "learning_rate": 5.481174233506905e-06, + "loss": 0.3001, + "step": 16730 + }, + { + "epoch": 0.49, + "grad_norm": 1.19670318099635, + "learning_rate": 5.480706703808053e-06, + "loss": 0.2889, + "step": 16731 + }, + { + "epoch": 0.49, + "grad_norm": 1.449134744943733, + "learning_rate": 5.4802391698669574e-06, + "loss": 0.3108, + "step": 16732 + }, + { + "epoch": 0.49, + "grad_norm": 1.65912327494234, + "learning_rate": 5.479771631687747e-06, + "loss": 0.3563, + "step": 16733 + }, + { + "epoch": 0.49, + "grad_norm": 1.3715812476176685, + "learning_rate": 5.479304089274546e-06, + "loss": 0.3475, + "step": 16734 + }, + { + "epoch": 0.49, + "grad_norm": 1.2202168774008557, + "learning_rate": 5.478836542631482e-06, + "loss": 0.3278, + "step": 16735 + }, + { + "epoch": 0.49, + "grad_norm": 2.6545845996226136, + "learning_rate": 5.478368991762679e-06, + "loss": 0.3231, + "step": 16736 + }, + { + "epoch": 0.49, + "grad_norm": 1.333764638600872, + "learning_rate": 5.477901436672265e-06, + "loss": 0.2995, + "step": 16737 + }, + { + "epoch": 0.49, + "grad_norm": 1.3095454269342603, + "learning_rate": 5.477433877364367e-06, + "loss": 0.3039, + "step": 16738 + }, + { + "epoch": 0.49, + "grad_norm": 2.1049373034769516, + "learning_rate": 5.476966313843107e-06, + "loss": 0.3132, + "step": 16739 + }, + { + "epoch": 0.49, + "grad_norm": 1.4113726468013235, + "learning_rate": 5.476498746112616e-06, + "loss": 0.3271, + "step": 16740 + }, + { + "epoch": 0.49, + "grad_norm": 1.6975267139860224, + "learning_rate": 5.4760311741770176e-06, + "loss": 0.3111, + "step": 16741 + }, + { + "epoch": 0.49, + "grad_norm": 1.5624457498311672, + "learning_rate": 5.475563598040439e-06, + "loss": 0.3209, + "step": 16742 + }, + { + "epoch": 0.49, + "grad_norm": 1.7174534469853189, + "learning_rate": 5.4750960177070054e-06, + "loss": 0.3221, + "step": 16743 + }, + { + "epoch": 0.49, + "grad_norm": 1.6542923367758284, + "learning_rate": 5.474628433180844e-06, + "loss": 0.2947, + "step": 16744 + }, + { + "epoch": 0.49, + "grad_norm": 1.6125525108677161, + "learning_rate": 5.474160844466083e-06, + "loss": 0.3331, + "step": 16745 + }, + { + "epoch": 0.49, + "grad_norm": 1.676190126955251, + "learning_rate": 5.473693251566846e-06, + "loss": 0.2901, + "step": 16746 + }, + { + "epoch": 0.49, + "grad_norm": 1.2805869382426645, + "learning_rate": 5.473225654487262e-06, + "loss": 0.3294, + "step": 16747 + }, + { + "epoch": 0.49, + "grad_norm": 1.270270141690904, + "learning_rate": 5.472758053231455e-06, + "loss": 0.3229, + "step": 16748 + }, + { + "epoch": 0.49, + "grad_norm": 1.3063096924219229, + "learning_rate": 5.472290447803555e-06, + "loss": 0.3166, + "step": 16749 + }, + { + "epoch": 0.49, + "grad_norm": 1.3951984792673562, + "learning_rate": 5.471822838207685e-06, + "loss": 0.3249, + "step": 16750 + }, + { + "epoch": 0.49, + "grad_norm": 1.284419075082062, + "learning_rate": 5.471355224447972e-06, + "loss": 0.2988, + "step": 16751 + }, + { + "epoch": 0.49, + "grad_norm": 1.3343541773984393, + "learning_rate": 5.470887606528547e-06, + "loss": 0.3338, + "step": 16752 + }, + { + "epoch": 0.49, + "grad_norm": 1.2601920432703824, + "learning_rate": 5.470419984453531e-06, + "loss": 0.3037, + "step": 16753 + }, + { + "epoch": 0.49, + "grad_norm": 1.1939404947561723, + "learning_rate": 5.469952358227055e-06, + "loss": 0.2867, + "step": 16754 + }, + { + "epoch": 0.49, + "grad_norm": 1.2511948211399508, + "learning_rate": 5.4694847278532435e-06, + "loss": 0.3049, + "step": 16755 + }, + { + "epoch": 0.49, + "grad_norm": 1.6110421532747325, + "learning_rate": 5.469017093336224e-06, + "loss": 0.3026, + "step": 16756 + }, + { + "epoch": 0.49, + "grad_norm": 1.287044412815107, + "learning_rate": 5.468549454680123e-06, + "loss": 0.3078, + "step": 16757 + }, + { + "epoch": 0.49, + "grad_norm": 2.4084831674635785, + "learning_rate": 5.46808181188907e-06, + "loss": 0.3738, + "step": 16758 + }, + { + "epoch": 0.49, + "grad_norm": 1.3456089055123899, + "learning_rate": 5.467614164967188e-06, + "loss": 0.3496, + "step": 16759 + }, + { + "epoch": 0.49, + "grad_norm": 1.3475598773763746, + "learning_rate": 5.467146513918605e-06, + "loss": 0.3338, + "step": 16760 + }, + { + "epoch": 0.49, + "grad_norm": 1.71316598312395, + "learning_rate": 5.46667885874745e-06, + "loss": 0.3318, + "step": 16761 + }, + { + "epoch": 0.49, + "grad_norm": 1.374583365452213, + "learning_rate": 5.466211199457848e-06, + "loss": 0.3107, + "step": 16762 + }, + { + "epoch": 0.49, + "grad_norm": 1.2857628385186848, + "learning_rate": 5.465743536053929e-06, + "loss": 0.3109, + "step": 16763 + }, + { + "epoch": 0.49, + "grad_norm": 1.4127989052472638, + "learning_rate": 5.465275868539817e-06, + "loss": 0.3064, + "step": 16764 + }, + { + "epoch": 0.49, + "grad_norm": 1.4462786659220568, + "learning_rate": 5.46480819691964e-06, + "loss": 0.3015, + "step": 16765 + }, + { + "epoch": 0.49, + "grad_norm": 1.5550305135662637, + "learning_rate": 5.464340521197527e-06, + "loss": 0.3242, + "step": 16766 + }, + { + "epoch": 0.49, + "grad_norm": 1.2614211559346447, + "learning_rate": 5.463872841377601e-06, + "loss": 0.3146, + "step": 16767 + }, + { + "epoch": 0.49, + "grad_norm": 1.2532243710718312, + "learning_rate": 5.4634051574639945e-06, + "loss": 0.2914, + "step": 16768 + }, + { + "epoch": 0.49, + "grad_norm": 1.4738714197730942, + "learning_rate": 5.462937469460832e-06, + "loss": 0.3625, + "step": 16769 + }, + { + "epoch": 0.49, + "grad_norm": 1.500279519748982, + "learning_rate": 5.462469777372239e-06, + "loss": 0.3233, + "step": 16770 + }, + { + "epoch": 0.49, + "grad_norm": 1.2678697817555602, + "learning_rate": 5.462002081202347e-06, + "loss": 0.3157, + "step": 16771 + }, + { + "epoch": 0.49, + "grad_norm": 1.8877129238846508, + "learning_rate": 5.461534380955281e-06, + "loss": 0.3224, + "step": 16772 + }, + { + "epoch": 0.49, + "grad_norm": 1.1943721453826837, + "learning_rate": 5.46106667663517e-06, + "loss": 0.2933, + "step": 16773 + }, + { + "epoch": 0.49, + "grad_norm": 1.3389595318036933, + "learning_rate": 5.460598968246139e-06, + "loss": 0.3033, + "step": 16774 + }, + { + "epoch": 0.49, + "grad_norm": 1.2656577919346441, + "learning_rate": 5.460131255792317e-06, + "loss": 0.3054, + "step": 16775 + }, + { + "epoch": 0.49, + "grad_norm": 0.9668218268465559, + "learning_rate": 5.459663539277833e-06, + "loss": 0.6074, + "step": 16776 + }, + { + "epoch": 0.49, + "grad_norm": 1.4756349575793832, + "learning_rate": 5.459195818706812e-06, + "loss": 0.2916, + "step": 16777 + }, + { + "epoch": 0.49, + "grad_norm": 1.353321834978815, + "learning_rate": 5.458728094083383e-06, + "loss": 0.3049, + "step": 16778 + }, + { + "epoch": 0.49, + "grad_norm": 1.2649881876747588, + "learning_rate": 5.4582603654116724e-06, + "loss": 0.337, + "step": 16779 + }, + { + "epoch": 0.49, + "grad_norm": 1.3074814217760293, + "learning_rate": 5.457792632695812e-06, + "loss": 0.3062, + "step": 16780 + }, + { + "epoch": 0.49, + "grad_norm": 1.4557722795342753, + "learning_rate": 5.457324895939925e-06, + "loss": 0.308, + "step": 16781 + }, + { + "epoch": 0.49, + "grad_norm": 1.2714105336766464, + "learning_rate": 5.456857155148141e-06, + "loss": 0.2874, + "step": 16782 + }, + { + "epoch": 0.49, + "grad_norm": 1.299801223442209, + "learning_rate": 5.456389410324586e-06, + "loss": 0.2918, + "step": 16783 + }, + { + "epoch": 0.49, + "grad_norm": 1.2521779928265486, + "learning_rate": 5.45592166147339e-06, + "loss": 0.3217, + "step": 16784 + }, + { + "epoch": 0.49, + "grad_norm": 1.327927726578063, + "learning_rate": 5.45545390859868e-06, + "loss": 0.3286, + "step": 16785 + }, + { + "epoch": 0.49, + "grad_norm": 1.3873988685330827, + "learning_rate": 5.454986151704586e-06, + "loss": 0.3401, + "step": 16786 + }, + { + "epoch": 0.49, + "grad_norm": 1.7834989991268377, + "learning_rate": 5.454518390795233e-06, + "loss": 0.339, + "step": 16787 + }, + { + "epoch": 0.49, + "grad_norm": 1.4019988715478249, + "learning_rate": 5.4540506258747495e-06, + "loss": 0.3112, + "step": 16788 + }, + { + "epoch": 0.49, + "grad_norm": 1.388468503236083, + "learning_rate": 5.453582856947265e-06, + "loss": 0.3204, + "step": 16789 + }, + { + "epoch": 0.49, + "grad_norm": 1.2845794298699162, + "learning_rate": 5.453115084016908e-06, + "loss": 0.3294, + "step": 16790 + }, + { + "epoch": 0.49, + "grad_norm": 1.7237645158442712, + "learning_rate": 5.4526473070878025e-06, + "loss": 0.3088, + "step": 16791 + }, + { + "epoch": 0.49, + "grad_norm": 1.2322561306678368, + "learning_rate": 5.452179526164081e-06, + "loss": 0.2902, + "step": 16792 + }, + { + "epoch": 0.49, + "grad_norm": 1.317025842226302, + "learning_rate": 5.45171174124987e-06, + "loss": 0.302, + "step": 16793 + }, + { + "epoch": 0.49, + "grad_norm": 1.2874700216820105, + "learning_rate": 5.451243952349299e-06, + "loss": 0.2988, + "step": 16794 + }, + { + "epoch": 0.49, + "grad_norm": 2.0936525940899524, + "learning_rate": 5.450776159466496e-06, + "loss": 0.3093, + "step": 16795 + }, + { + "epoch": 0.49, + "grad_norm": 1.543398211457565, + "learning_rate": 5.450308362605585e-06, + "loss": 0.3013, + "step": 16796 + }, + { + "epoch": 0.49, + "grad_norm": 1.8092766266475953, + "learning_rate": 5.449840561770698e-06, + "loss": 0.3128, + "step": 16797 + }, + { + "epoch": 0.49, + "grad_norm": 1.3012328247320537, + "learning_rate": 5.4493727569659646e-06, + "loss": 0.3178, + "step": 16798 + }, + { + "epoch": 0.49, + "grad_norm": 1.2431428684927865, + "learning_rate": 5.448904948195512e-06, + "loss": 0.273, + "step": 16799 + }, + { + "epoch": 0.49, + "grad_norm": 1.3540303924226174, + "learning_rate": 5.4484371354634676e-06, + "loss": 0.3166, + "step": 16800 + }, + { + "epoch": 0.49, + "grad_norm": 1.3087324440411832, + "learning_rate": 5.44796931877396e-06, + "loss": 0.3142, + "step": 16801 + }, + { + "epoch": 0.49, + "grad_norm": 1.2936534537909417, + "learning_rate": 5.447501498131119e-06, + "loss": 0.3458, + "step": 16802 + }, + { + "epoch": 0.49, + "grad_norm": 1.2932702796905713, + "learning_rate": 5.4470336735390714e-06, + "loss": 0.3019, + "step": 16803 + }, + { + "epoch": 0.49, + "grad_norm": 1.293804669999763, + "learning_rate": 5.4465658450019475e-06, + "loss": 0.2997, + "step": 16804 + }, + { + "epoch": 0.49, + "grad_norm": 1.3898127141406553, + "learning_rate": 5.446098012523875e-06, + "loss": 0.3201, + "step": 16805 + }, + { + "epoch": 0.49, + "grad_norm": 1.2674920499952345, + "learning_rate": 5.445630176108983e-06, + "loss": 0.3151, + "step": 16806 + }, + { + "epoch": 0.49, + "grad_norm": 1.4622007319335415, + "learning_rate": 5.445162335761398e-06, + "loss": 0.3037, + "step": 16807 + }, + { + "epoch": 0.49, + "grad_norm": 2.167733692009692, + "learning_rate": 5.444694491485253e-06, + "loss": 0.306, + "step": 16808 + }, + { + "epoch": 0.49, + "grad_norm": 1.3421218306423988, + "learning_rate": 5.444226643284674e-06, + "loss": 0.3291, + "step": 16809 + }, + { + "epoch": 0.49, + "grad_norm": 1.3793076659180388, + "learning_rate": 5.443758791163788e-06, + "loss": 0.3031, + "step": 16810 + }, + { + "epoch": 0.49, + "grad_norm": 1.364434548100013, + "learning_rate": 5.443290935126726e-06, + "loss": 0.3163, + "step": 16811 + }, + { + "epoch": 0.49, + "grad_norm": 1.2493992819103232, + "learning_rate": 5.442823075177617e-06, + "loss": 0.32, + "step": 16812 + }, + { + "epoch": 0.49, + "grad_norm": 1.9711704890753226, + "learning_rate": 5.44235521132059e-06, + "loss": 0.2988, + "step": 16813 + }, + { + "epoch": 0.49, + "grad_norm": 1.2572499489689322, + "learning_rate": 5.441887343559772e-06, + "loss": 0.3306, + "step": 16814 + }, + { + "epoch": 0.49, + "grad_norm": 4.236190963625487, + "learning_rate": 5.441419471899294e-06, + "loss": 0.3378, + "step": 16815 + }, + { + "epoch": 0.49, + "grad_norm": 1.2937563712799638, + "learning_rate": 5.440951596343285e-06, + "loss": 0.3031, + "step": 16816 + }, + { + "epoch": 0.49, + "grad_norm": 1.2513833959442728, + "learning_rate": 5.4404837168958725e-06, + "loss": 0.3138, + "step": 16817 + }, + { + "epoch": 0.49, + "grad_norm": 1.4855067361888175, + "learning_rate": 5.440015833561186e-06, + "loss": 0.3227, + "step": 16818 + }, + { + "epoch": 0.49, + "grad_norm": 1.3943136719828026, + "learning_rate": 5.439547946343355e-06, + "loss": 0.311, + "step": 16819 + }, + { + "epoch": 0.49, + "grad_norm": 1.8332641514202488, + "learning_rate": 5.43908005524651e-06, + "loss": 0.3197, + "step": 16820 + }, + { + "epoch": 0.49, + "grad_norm": 1.5814249977830253, + "learning_rate": 5.438612160274777e-06, + "loss": 0.3083, + "step": 16821 + }, + { + "epoch": 0.49, + "grad_norm": 1.2499086827262058, + "learning_rate": 5.4381442614322875e-06, + "loss": 0.3188, + "step": 16822 + }, + { + "epoch": 0.49, + "grad_norm": 1.3036504867344128, + "learning_rate": 5.437676358723171e-06, + "loss": 0.3227, + "step": 16823 + }, + { + "epoch": 0.49, + "grad_norm": 1.317442823092432, + "learning_rate": 5.4372084521515545e-06, + "loss": 0.2858, + "step": 16824 + }, + { + "epoch": 0.49, + "grad_norm": 1.2840967218019224, + "learning_rate": 5.436740541721568e-06, + "loss": 0.3075, + "step": 16825 + }, + { + "epoch": 0.49, + "grad_norm": 1.402452877072155, + "learning_rate": 5.436272627437342e-06, + "loss": 0.3339, + "step": 16826 + }, + { + "epoch": 0.49, + "grad_norm": 1.476905694675715, + "learning_rate": 5.4358047093030054e-06, + "loss": 0.2886, + "step": 16827 + }, + { + "epoch": 0.49, + "grad_norm": 1.1874499524875781, + "learning_rate": 5.435336787322687e-06, + "loss": 0.298, + "step": 16828 + }, + { + "epoch": 0.49, + "grad_norm": 0.9358755607267162, + "learning_rate": 5.4348688615005176e-06, + "loss": 0.543, + "step": 16829 + }, + { + "epoch": 0.49, + "grad_norm": 1.608761868561038, + "learning_rate": 5.434400931840624e-06, + "loss": 0.2827, + "step": 16830 + }, + { + "epoch": 0.49, + "grad_norm": 1.342180878666666, + "learning_rate": 5.433932998347139e-06, + "loss": 0.2813, + "step": 16831 + }, + { + "epoch": 0.49, + "grad_norm": 3.2866768373297606, + "learning_rate": 5.43346506102419e-06, + "loss": 0.3089, + "step": 16832 + }, + { + "epoch": 0.49, + "grad_norm": 1.2066615020996934, + "learning_rate": 5.432997119875907e-06, + "loss": 0.3084, + "step": 16833 + }, + { + "epoch": 0.49, + "grad_norm": 1.4096388732184228, + "learning_rate": 5.432529174906419e-06, + "loss": 0.3077, + "step": 16834 + }, + { + "epoch": 0.49, + "grad_norm": 1.471712566556239, + "learning_rate": 5.432061226119857e-06, + "loss": 0.2993, + "step": 16835 + }, + { + "epoch": 0.49, + "grad_norm": 1.4188353099940099, + "learning_rate": 5.43159327352035e-06, + "loss": 0.3054, + "step": 16836 + }, + { + "epoch": 0.49, + "grad_norm": 1.262414834447513, + "learning_rate": 5.431125317112027e-06, + "loss": 0.3071, + "step": 16837 + }, + { + "epoch": 0.49, + "grad_norm": 1.3840584185390714, + "learning_rate": 5.430657356899018e-06, + "loss": 0.3144, + "step": 16838 + }, + { + "epoch": 0.49, + "grad_norm": 1.4942198051942661, + "learning_rate": 5.430189392885454e-06, + "loss": 0.2948, + "step": 16839 + }, + { + "epoch": 0.49, + "grad_norm": 1.3028655539893645, + "learning_rate": 5.429721425075463e-06, + "loss": 0.3151, + "step": 16840 + }, + { + "epoch": 0.49, + "grad_norm": 1.4002056628280046, + "learning_rate": 5.429253453473177e-06, + "loss": 0.3046, + "step": 16841 + }, + { + "epoch": 0.49, + "grad_norm": 1.5685192413628095, + "learning_rate": 5.428785478082723e-06, + "loss": 0.3254, + "step": 16842 + }, + { + "epoch": 0.49, + "grad_norm": 1.2422264307046442, + "learning_rate": 5.428317498908232e-06, + "loss": 0.2894, + "step": 16843 + }, + { + "epoch": 0.49, + "grad_norm": 1.381955918736353, + "learning_rate": 5.427849515953835e-06, + "loss": 0.3147, + "step": 16844 + }, + { + "epoch": 0.49, + "grad_norm": 1.7960148764052042, + "learning_rate": 5.427381529223663e-06, + "loss": 0.3073, + "step": 16845 + }, + { + "epoch": 0.49, + "grad_norm": 1.983215512790825, + "learning_rate": 5.426913538721842e-06, + "loss": 0.3206, + "step": 16846 + }, + { + "epoch": 0.49, + "grad_norm": 3.897715860788895, + "learning_rate": 5.426445544452506e-06, + "loss": 0.3383, + "step": 16847 + }, + { + "epoch": 0.49, + "grad_norm": 1.2285616149206589, + "learning_rate": 5.425977546419782e-06, + "loss": 0.318, + "step": 16848 + }, + { + "epoch": 0.49, + "grad_norm": 1.1442050673199318, + "learning_rate": 5.425509544627802e-06, + "loss": 0.3148, + "step": 16849 + }, + { + "epoch": 0.49, + "grad_norm": 1.2393911407405869, + "learning_rate": 5.4250415390806966e-06, + "loss": 0.2853, + "step": 16850 + }, + { + "epoch": 0.49, + "grad_norm": 1.407871525251202, + "learning_rate": 5.424573529782593e-06, + "loss": 0.3336, + "step": 16851 + }, + { + "epoch": 0.49, + "grad_norm": 1.4358658484443174, + "learning_rate": 5.424105516737623e-06, + "loss": 0.3296, + "step": 16852 + }, + { + "epoch": 0.49, + "grad_norm": 1.3249326123461602, + "learning_rate": 5.4236374999499185e-06, + "loss": 0.337, + "step": 16853 + }, + { + "epoch": 0.49, + "grad_norm": 1.36471453296273, + "learning_rate": 5.423169479423609e-06, + "loss": 0.3359, + "step": 16854 + }, + { + "epoch": 0.49, + "grad_norm": 1.3949384320682048, + "learning_rate": 5.422701455162822e-06, + "loss": 0.321, + "step": 16855 + }, + { + "epoch": 0.49, + "grad_norm": 5.405979679857053, + "learning_rate": 5.422233427171692e-06, + "loss": 0.3035, + "step": 16856 + }, + { + "epoch": 0.49, + "grad_norm": 1.2715308833146615, + "learning_rate": 5.4217653954543445e-06, + "loss": 0.3125, + "step": 16857 + }, + { + "epoch": 0.49, + "grad_norm": 1.36680091614864, + "learning_rate": 5.421297360014915e-06, + "loss": 0.2904, + "step": 16858 + }, + { + "epoch": 0.49, + "grad_norm": 1.2898130196940616, + "learning_rate": 5.420829320857532e-06, + "loss": 0.3235, + "step": 16859 + }, + { + "epoch": 0.49, + "grad_norm": 1.9395688080389664, + "learning_rate": 5.420361277986325e-06, + "loss": 0.3166, + "step": 16860 + }, + { + "epoch": 0.49, + "grad_norm": 1.3386583501940255, + "learning_rate": 5.419893231405424e-06, + "loss": 0.3081, + "step": 16861 + }, + { + "epoch": 0.49, + "grad_norm": 1.320113263491053, + "learning_rate": 5.4194251811189615e-06, + "loss": 0.3014, + "step": 16862 + }, + { + "epoch": 0.49, + "grad_norm": 1.3419750985553365, + "learning_rate": 5.418957127131068e-06, + "loss": 0.3202, + "step": 16863 + }, + { + "epoch": 0.49, + "grad_norm": 1.2688299963429945, + "learning_rate": 5.418489069445872e-06, + "loss": 0.3079, + "step": 16864 + }, + { + "epoch": 0.49, + "grad_norm": 1.318168204702521, + "learning_rate": 5.418021008067506e-06, + "loss": 0.3121, + "step": 16865 + }, + { + "epoch": 0.49, + "grad_norm": 1.359585487690162, + "learning_rate": 5.4175529430001e-06, + "loss": 0.3021, + "step": 16866 + }, + { + "epoch": 0.49, + "grad_norm": 1.4699948450511, + "learning_rate": 5.4170848742477836e-06, + "loss": 0.3177, + "step": 16867 + }, + { + "epoch": 0.49, + "grad_norm": 1.6476891236569657, + "learning_rate": 5.416616801814689e-06, + "loss": 0.2956, + "step": 16868 + }, + { + "epoch": 0.49, + "grad_norm": 1.479908282956276, + "learning_rate": 5.4161487257049476e-06, + "loss": 0.2945, + "step": 16869 + }, + { + "epoch": 0.49, + "grad_norm": 1.405262532766898, + "learning_rate": 5.415680645922687e-06, + "loss": 0.3398, + "step": 16870 + }, + { + "epoch": 0.49, + "grad_norm": 1.5993851127890812, + "learning_rate": 5.415212562472041e-06, + "loss": 0.2981, + "step": 16871 + }, + { + "epoch": 0.49, + "grad_norm": 1.4740587797496483, + "learning_rate": 5.414744475357141e-06, + "loss": 0.3222, + "step": 16872 + }, + { + "epoch": 0.49, + "grad_norm": 1.2735236156833551, + "learning_rate": 5.414276384582115e-06, + "loss": 0.3125, + "step": 16873 + }, + { + "epoch": 0.49, + "grad_norm": 1.2235041460967382, + "learning_rate": 5.4138082901510945e-06, + "loss": 0.3102, + "step": 16874 + }, + { + "epoch": 0.49, + "grad_norm": 1.2226553064438457, + "learning_rate": 5.413340192068212e-06, + "loss": 0.3038, + "step": 16875 + }, + { + "epoch": 0.49, + "grad_norm": 1.4079632751174507, + "learning_rate": 5.412872090337597e-06, + "loss": 0.3125, + "step": 16876 + }, + { + "epoch": 0.49, + "grad_norm": 1.5186270655348033, + "learning_rate": 5.412403984963383e-06, + "loss": 0.2992, + "step": 16877 + }, + { + "epoch": 0.49, + "grad_norm": 1.2817196658286785, + "learning_rate": 5.4119358759497e-06, + "loss": 0.3186, + "step": 16878 + }, + { + "epoch": 0.49, + "grad_norm": 1.2189665766724975, + "learning_rate": 5.411467763300676e-06, + "loss": 0.3012, + "step": 16879 + }, + { + "epoch": 0.49, + "grad_norm": 1.3178937186909556, + "learning_rate": 5.410999647020445e-06, + "loss": 0.3031, + "step": 16880 + }, + { + "epoch": 0.49, + "grad_norm": 1.815517291363908, + "learning_rate": 5.410531527113137e-06, + "loss": 0.3188, + "step": 16881 + }, + { + "epoch": 0.49, + "grad_norm": 1.3044154321869486, + "learning_rate": 5.410063403582886e-06, + "loss": 0.317, + "step": 16882 + }, + { + "epoch": 0.49, + "grad_norm": 1.2162581141032296, + "learning_rate": 5.409595276433819e-06, + "loss": 0.3249, + "step": 16883 + }, + { + "epoch": 0.49, + "grad_norm": 1.7176409887643107, + "learning_rate": 5.409127145670069e-06, + "loss": 0.3168, + "step": 16884 + }, + { + "epoch": 0.49, + "grad_norm": 1.1962368248463837, + "learning_rate": 5.408659011295767e-06, + "loss": 0.2995, + "step": 16885 + }, + { + "epoch": 0.49, + "grad_norm": 3.0579894528766727, + "learning_rate": 5.408190873315047e-06, + "loss": 0.3229, + "step": 16886 + }, + { + "epoch": 0.49, + "grad_norm": 1.884046557265522, + "learning_rate": 5.407722731732036e-06, + "loss": 0.3226, + "step": 16887 + }, + { + "epoch": 0.49, + "grad_norm": 1.5686252238628966, + "learning_rate": 5.407254586550867e-06, + "loss": 0.317, + "step": 16888 + }, + { + "epoch": 0.49, + "grad_norm": 1.3751755144811433, + "learning_rate": 5.406786437775673e-06, + "loss": 0.319, + "step": 16889 + }, + { + "epoch": 0.49, + "grad_norm": 1.316450276758488, + "learning_rate": 5.406318285410584e-06, + "loss": 0.3128, + "step": 16890 + }, + { + "epoch": 0.49, + "grad_norm": 1.272622999804322, + "learning_rate": 5.405850129459732e-06, + "loss": 0.2968, + "step": 16891 + }, + { + "epoch": 0.49, + "grad_norm": 1.588954226496658, + "learning_rate": 5.405381969927248e-06, + "loss": 0.3231, + "step": 16892 + }, + { + "epoch": 0.49, + "grad_norm": 1.4486143006969847, + "learning_rate": 5.404913806817263e-06, + "loss": 0.304, + "step": 16893 + }, + { + "epoch": 0.49, + "grad_norm": 1.335374475997522, + "learning_rate": 5.40444564013391e-06, + "loss": 0.301, + "step": 16894 + }, + { + "epoch": 0.49, + "grad_norm": 1.434868214931543, + "learning_rate": 5.403977469881319e-06, + "loss": 0.2858, + "step": 16895 + }, + { + "epoch": 0.49, + "grad_norm": 1.7616475889550023, + "learning_rate": 5.403509296063623e-06, + "loss": 0.3259, + "step": 16896 + }, + { + "epoch": 0.49, + "grad_norm": 1.483375503717525, + "learning_rate": 5.4030411186849516e-06, + "loss": 0.3176, + "step": 16897 + }, + { + "epoch": 0.49, + "grad_norm": 1.5626836105277644, + "learning_rate": 5.402572937749439e-06, + "loss": 0.3266, + "step": 16898 + }, + { + "epoch": 0.49, + "grad_norm": 1.3377440388330022, + "learning_rate": 5.402104753261215e-06, + "loss": 0.2973, + "step": 16899 + }, + { + "epoch": 0.49, + "grad_norm": 1.289260540906907, + "learning_rate": 5.4016365652244125e-06, + "loss": 0.2913, + "step": 16900 + }, + { + "epoch": 0.49, + "grad_norm": 1.3115983986087525, + "learning_rate": 5.401168373643163e-06, + "loss": 0.3247, + "step": 16901 + }, + { + "epoch": 0.49, + "grad_norm": 1.3853339002703364, + "learning_rate": 5.400700178521598e-06, + "loss": 0.3516, + "step": 16902 + }, + { + "epoch": 0.49, + "grad_norm": 1.342834126864037, + "learning_rate": 5.4002319798638496e-06, + "loss": 0.2963, + "step": 16903 + }, + { + "epoch": 0.49, + "grad_norm": 1.3367019462015726, + "learning_rate": 5.399763777674051e-06, + "loss": 0.3277, + "step": 16904 + }, + { + "epoch": 0.49, + "grad_norm": 1.5710154330639015, + "learning_rate": 5.39929557195633e-06, + "loss": 0.2955, + "step": 16905 + }, + { + "epoch": 0.49, + "grad_norm": 1.321830480772044, + "learning_rate": 5.398827362714824e-06, + "loss": 0.2953, + "step": 16906 + }, + { + "epoch": 0.49, + "grad_norm": 1.271824440501541, + "learning_rate": 5.398359149953659e-06, + "loss": 0.3147, + "step": 16907 + }, + { + "epoch": 0.49, + "grad_norm": 1.9304445558072363, + "learning_rate": 5.397890933676971e-06, + "loss": 0.3191, + "step": 16908 + }, + { + "epoch": 0.49, + "grad_norm": 1.4352684808041525, + "learning_rate": 5.397422713888892e-06, + "loss": 0.3136, + "step": 16909 + }, + { + "epoch": 0.49, + "grad_norm": 1.3432273332518765, + "learning_rate": 5.396954490593553e-06, + "loss": 0.2837, + "step": 16910 + }, + { + "epoch": 0.49, + "grad_norm": 1.2634319717602993, + "learning_rate": 5.396486263795086e-06, + "loss": 0.3076, + "step": 16911 + }, + { + "epoch": 0.49, + "grad_norm": 1.288095036126334, + "learning_rate": 5.396018033497623e-06, + "loss": 0.3299, + "step": 16912 + }, + { + "epoch": 0.49, + "grad_norm": 1.4123648980965786, + "learning_rate": 5.3955497997052966e-06, + "loss": 0.3167, + "step": 16913 + }, + { + "epoch": 0.49, + "grad_norm": 1.3881286755028126, + "learning_rate": 5.39508156242224e-06, + "loss": 0.3173, + "step": 16914 + }, + { + "epoch": 0.49, + "grad_norm": 1.3007953306590105, + "learning_rate": 5.394613321652582e-06, + "loss": 0.3135, + "step": 16915 + }, + { + "epoch": 0.49, + "grad_norm": 1.8119618573168035, + "learning_rate": 5.394145077400457e-06, + "loss": 0.2937, + "step": 16916 + }, + { + "epoch": 0.49, + "grad_norm": 1.2822463400233788, + "learning_rate": 5.393676829669999e-06, + "loss": 0.2978, + "step": 16917 + }, + { + "epoch": 0.49, + "grad_norm": 1.3392041274802249, + "learning_rate": 5.3932085784653394e-06, + "loss": 0.3161, + "step": 16918 + }, + { + "epoch": 0.49, + "grad_norm": 1.6999360119826548, + "learning_rate": 5.392740323790609e-06, + "loss": 0.3088, + "step": 16919 + }, + { + "epoch": 0.49, + "grad_norm": 1.2349150543920808, + "learning_rate": 5.392272065649941e-06, + "loss": 0.2905, + "step": 16920 + }, + { + "epoch": 0.49, + "grad_norm": 1.2720185463401303, + "learning_rate": 5.3918038040474665e-06, + "loss": 0.3077, + "step": 16921 + }, + { + "epoch": 0.49, + "grad_norm": 3.7642624824932533, + "learning_rate": 5.391335538987319e-06, + "loss": 0.279, + "step": 16922 + }, + { + "epoch": 0.49, + "grad_norm": 1.3528497320255966, + "learning_rate": 5.390867270473632e-06, + "loss": 0.3454, + "step": 16923 + }, + { + "epoch": 0.49, + "grad_norm": 1.3339988424476141, + "learning_rate": 5.3903989985105365e-06, + "loss": 0.3352, + "step": 16924 + }, + { + "epoch": 0.49, + "grad_norm": 1.3146001112667087, + "learning_rate": 5.389930723102166e-06, + "loss": 0.3376, + "step": 16925 + }, + { + "epoch": 0.49, + "grad_norm": 1.5952164511403808, + "learning_rate": 5.389462444252651e-06, + "loss": 0.3058, + "step": 16926 + }, + { + "epoch": 0.49, + "grad_norm": 1.5725767472878076, + "learning_rate": 5.388994161966128e-06, + "loss": 0.2982, + "step": 16927 + }, + { + "epoch": 0.49, + "grad_norm": 1.3379141348555343, + "learning_rate": 5.388525876246726e-06, + "loss": 0.3099, + "step": 16928 + }, + { + "epoch": 0.49, + "grad_norm": 1.2383731592860208, + "learning_rate": 5.388057587098578e-06, + "loss": 0.3066, + "step": 16929 + }, + { + "epoch": 0.49, + "grad_norm": 1.2581789850560632, + "learning_rate": 5.387589294525818e-06, + "loss": 0.3223, + "step": 16930 + }, + { + "epoch": 0.49, + "grad_norm": 1.343745126406842, + "learning_rate": 5.387120998532579e-06, + "loss": 0.3036, + "step": 16931 + }, + { + "epoch": 0.49, + "grad_norm": 1.2781417849304753, + "learning_rate": 5.386652699122993e-06, + "loss": 0.3229, + "step": 16932 + }, + { + "epoch": 0.49, + "grad_norm": 1.2973847749673575, + "learning_rate": 5.386184396301192e-06, + "loss": 0.3329, + "step": 16933 + }, + { + "epoch": 0.49, + "grad_norm": 1.3490656009750988, + "learning_rate": 5.385716090071309e-06, + "loss": 0.354, + "step": 16934 + }, + { + "epoch": 0.49, + "grad_norm": 1.4102040363769817, + "learning_rate": 5.385247780437478e-06, + "loss": 0.3147, + "step": 16935 + }, + { + "epoch": 0.49, + "grad_norm": 1.4518876427794933, + "learning_rate": 5.384779467403831e-06, + "loss": 0.3059, + "step": 16936 + }, + { + "epoch": 0.49, + "grad_norm": 1.4960494657365204, + "learning_rate": 5.3843111509745015e-06, + "loss": 0.2905, + "step": 16937 + }, + { + "epoch": 0.49, + "grad_norm": 1.4701264192824577, + "learning_rate": 5.3838428311536205e-06, + "loss": 0.2936, + "step": 16938 + }, + { + "epoch": 0.49, + "grad_norm": 1.3779466293770195, + "learning_rate": 5.383374507945323e-06, + "loss": 0.3345, + "step": 16939 + }, + { + "epoch": 0.49, + "grad_norm": 1.3439525074511338, + "learning_rate": 5.38290618135374e-06, + "loss": 0.3338, + "step": 16940 + }, + { + "epoch": 0.49, + "grad_norm": 1.2513657170666619, + "learning_rate": 5.382437851383008e-06, + "loss": 0.3141, + "step": 16941 + }, + { + "epoch": 0.49, + "grad_norm": 1.2435864555258413, + "learning_rate": 5.381969518037256e-06, + "loss": 0.3284, + "step": 16942 + }, + { + "epoch": 0.49, + "grad_norm": 1.139279059482151, + "learning_rate": 5.381501181320619e-06, + "loss": 0.3151, + "step": 16943 + }, + { + "epoch": 0.49, + "grad_norm": 1.2720833264707905, + "learning_rate": 5.381032841237229e-06, + "loss": 0.304, + "step": 16944 + }, + { + "epoch": 0.49, + "grad_norm": 1.4397405611901357, + "learning_rate": 5.380564497791222e-06, + "loss": 0.3, + "step": 16945 + }, + { + "epoch": 0.49, + "grad_norm": 1.7790036131696292, + "learning_rate": 5.3800961509867275e-06, + "loss": 0.3132, + "step": 16946 + }, + { + "epoch": 0.49, + "grad_norm": 1.2740744163292423, + "learning_rate": 5.379627800827881e-06, + "loss": 0.3027, + "step": 16947 + }, + { + "epoch": 0.49, + "grad_norm": 1.5966642116199232, + "learning_rate": 5.379159447318815e-06, + "loss": 0.3056, + "step": 16948 + }, + { + "epoch": 0.49, + "grad_norm": 1.249896629083869, + "learning_rate": 5.378691090463663e-06, + "loss": 0.3104, + "step": 16949 + }, + { + "epoch": 0.49, + "grad_norm": 1.443082413615871, + "learning_rate": 5.378222730266557e-06, + "loss": 0.3217, + "step": 16950 + }, + { + "epoch": 0.49, + "grad_norm": 1.3357374797760877, + "learning_rate": 5.377754366731633e-06, + "loss": 0.3248, + "step": 16951 + }, + { + "epoch": 0.49, + "grad_norm": 1.3461488304954783, + "learning_rate": 5.37728599986302e-06, + "loss": 0.3363, + "step": 16952 + }, + { + "epoch": 0.49, + "grad_norm": 1.3286883851789015, + "learning_rate": 5.376817629664854e-06, + "loss": 0.3021, + "step": 16953 + }, + { + "epoch": 0.49, + "grad_norm": 1.4687728185949058, + "learning_rate": 5.376349256141269e-06, + "loss": 0.3412, + "step": 16954 + }, + { + "epoch": 0.49, + "grad_norm": 1.2956015585227822, + "learning_rate": 5.375880879296399e-06, + "loss": 0.3305, + "step": 16955 + }, + { + "epoch": 0.49, + "grad_norm": 1.4867519742241249, + "learning_rate": 5.375412499134375e-06, + "loss": 0.3032, + "step": 16956 + }, + { + "epoch": 0.49, + "grad_norm": 1.444913903071611, + "learning_rate": 5.3749441156593305e-06, + "loss": 0.3294, + "step": 16957 + }, + { + "epoch": 0.49, + "grad_norm": 2.2642947453001705, + "learning_rate": 5.374475728875401e-06, + "loss": 0.3235, + "step": 16958 + }, + { + "epoch": 0.49, + "grad_norm": 1.2474581469339465, + "learning_rate": 5.37400733878672e-06, + "loss": 0.313, + "step": 16959 + }, + { + "epoch": 0.49, + "grad_norm": 1.2125338988785195, + "learning_rate": 5.373538945397418e-06, + "loss": 0.2973, + "step": 16960 + }, + { + "epoch": 0.49, + "grad_norm": 1.398196608779041, + "learning_rate": 5.373070548711632e-06, + "loss": 0.315, + "step": 16961 + }, + { + "epoch": 0.49, + "grad_norm": 1.3067054049129196, + "learning_rate": 5.372602148733493e-06, + "loss": 0.313, + "step": 16962 + }, + { + "epoch": 0.49, + "grad_norm": 1.35483654610359, + "learning_rate": 5.372133745467138e-06, + "loss": 0.3548, + "step": 16963 + }, + { + "epoch": 0.49, + "grad_norm": 1.3854379477497036, + "learning_rate": 5.371665338916698e-06, + "loss": 0.3058, + "step": 16964 + }, + { + "epoch": 0.49, + "grad_norm": 1.162399641659278, + "learning_rate": 5.371196929086306e-06, + "loss": 0.3087, + "step": 16965 + }, + { + "epoch": 0.49, + "grad_norm": 1.3535637035780124, + "learning_rate": 5.370728515980098e-06, + "loss": 0.2975, + "step": 16966 + }, + { + "epoch": 0.49, + "grad_norm": 1.318580138275256, + "learning_rate": 5.370260099602207e-06, + "loss": 0.3487, + "step": 16967 + }, + { + "epoch": 0.49, + "grad_norm": 1.3790189438401546, + "learning_rate": 5.369791679956766e-06, + "loss": 0.3195, + "step": 16968 + }, + { + "epoch": 0.49, + "grad_norm": 1.3418127829557622, + "learning_rate": 5.369323257047909e-06, + "loss": 0.3173, + "step": 16969 + }, + { + "epoch": 0.49, + "grad_norm": 1.3056954343972764, + "learning_rate": 5.36885483087977e-06, + "loss": 0.316, + "step": 16970 + }, + { + "epoch": 0.49, + "grad_norm": 2.9156727123793917, + "learning_rate": 5.368386401456483e-06, + "loss": 0.3335, + "step": 16971 + }, + { + "epoch": 0.49, + "grad_norm": 1.5092659458402644, + "learning_rate": 5.367917968782182e-06, + "loss": 0.2989, + "step": 16972 + }, + { + "epoch": 0.49, + "grad_norm": 1.2344330042621665, + "learning_rate": 5.367449532861003e-06, + "loss": 0.3094, + "step": 16973 + }, + { + "epoch": 0.49, + "grad_norm": 1.2354043234513046, + "learning_rate": 5.3669810936970755e-06, + "loss": 0.2977, + "step": 16974 + }, + { + "epoch": 0.49, + "grad_norm": 1.3151525192716937, + "learning_rate": 5.366512651294537e-06, + "loss": 0.3095, + "step": 16975 + }, + { + "epoch": 0.49, + "grad_norm": 1.2215348715694594, + "learning_rate": 5.366044205657518e-06, + "loss": 0.3, + "step": 16976 + }, + { + "epoch": 0.49, + "grad_norm": 1.3031283961428872, + "learning_rate": 5.365575756790158e-06, + "loss": 0.3183, + "step": 16977 + }, + { + "epoch": 0.49, + "grad_norm": 1.3830456822446777, + "learning_rate": 5.365107304696586e-06, + "loss": 0.2892, + "step": 16978 + }, + { + "epoch": 0.49, + "grad_norm": 1.2387069537259172, + "learning_rate": 5.364638849380939e-06, + "loss": 0.2934, + "step": 16979 + }, + { + "epoch": 0.49, + "grad_norm": 1.2077574046993254, + "learning_rate": 5.364170390847349e-06, + "loss": 0.2935, + "step": 16980 + }, + { + "epoch": 0.49, + "grad_norm": 1.3904908451229483, + "learning_rate": 5.363701929099951e-06, + "loss": 0.3035, + "step": 16981 + }, + { + "epoch": 0.49, + "grad_norm": 1.2790258478616903, + "learning_rate": 5.363233464142881e-06, + "loss": 0.335, + "step": 16982 + }, + { + "epoch": 0.49, + "grad_norm": 1.3093335399139163, + "learning_rate": 5.362764995980271e-06, + "loss": 0.3066, + "step": 16983 + }, + { + "epoch": 0.49, + "grad_norm": 1.2703775975749623, + "learning_rate": 5.362296524616255e-06, + "loss": 0.3174, + "step": 16984 + }, + { + "epoch": 0.49, + "grad_norm": 1.3735594260206851, + "learning_rate": 5.361828050054968e-06, + "loss": 0.3181, + "step": 16985 + }, + { + "epoch": 0.49, + "grad_norm": 1.358311200231085, + "learning_rate": 5.361359572300546e-06, + "loss": 0.3112, + "step": 16986 + }, + { + "epoch": 0.49, + "grad_norm": 2.0885310339544176, + "learning_rate": 5.360891091357121e-06, + "loss": 0.2987, + "step": 16987 + }, + { + "epoch": 0.49, + "grad_norm": 1.243824432412975, + "learning_rate": 5.3604226072288276e-06, + "loss": 0.295, + "step": 16988 + }, + { + "epoch": 0.49, + "grad_norm": 1.2936752192040906, + "learning_rate": 5.359954119919801e-06, + "loss": 0.3404, + "step": 16989 + }, + { + "epoch": 0.49, + "grad_norm": 1.0007141597876754, + "learning_rate": 5.359485629434176e-06, + "loss": 0.5711, + "step": 16990 + }, + { + "epoch": 0.49, + "grad_norm": 1.3341093289295833, + "learning_rate": 5.359017135776085e-06, + "loss": 0.3081, + "step": 16991 + }, + { + "epoch": 0.49, + "grad_norm": 1.2362547714985084, + "learning_rate": 5.358548638949666e-06, + "loss": 0.3047, + "step": 16992 + }, + { + "epoch": 0.49, + "grad_norm": 1.879037030104337, + "learning_rate": 5.35808013895905e-06, + "loss": 0.2989, + "step": 16993 + }, + { + "epoch": 0.49, + "grad_norm": 1.2193039354482342, + "learning_rate": 5.357611635808373e-06, + "loss": 0.2998, + "step": 16994 + }, + { + "epoch": 0.49, + "grad_norm": 1.3494666208310109, + "learning_rate": 5.357143129501769e-06, + "loss": 0.3089, + "step": 16995 + }, + { + "epoch": 0.49, + "grad_norm": 1.3187171136810114, + "learning_rate": 5.356674620043373e-06, + "loss": 0.3146, + "step": 16996 + }, + { + "epoch": 0.49, + "grad_norm": 1.3475359248419068, + "learning_rate": 5.35620610743732e-06, + "loss": 0.3321, + "step": 16997 + }, + { + "epoch": 0.49, + "grad_norm": 1.2562361939851476, + "learning_rate": 5.355737591687743e-06, + "loss": 0.3171, + "step": 16998 + }, + { + "epoch": 0.49, + "grad_norm": 1.450803841235583, + "learning_rate": 5.355269072798777e-06, + "loss": 0.3288, + "step": 16999 + }, + { + "epoch": 0.49, + "grad_norm": 1.8564383062388345, + "learning_rate": 5.3548005507745604e-06, + "loss": 0.3186, + "step": 17000 + }, + { + "epoch": 0.49, + "grad_norm": 1.3016431055270534, + "learning_rate": 5.354332025619224e-06, + "loss": 0.2747, + "step": 17001 + }, + { + "epoch": 0.49, + "grad_norm": 1.1750394465753162, + "learning_rate": 5.353863497336902e-06, + "loss": 0.3151, + "step": 17002 + }, + { + "epoch": 0.49, + "grad_norm": 1.2371357602598676, + "learning_rate": 5.353394965931732e-06, + "loss": 0.3113, + "step": 17003 + }, + { + "epoch": 0.49, + "grad_norm": 1.2640000462619791, + "learning_rate": 5.352926431407848e-06, + "loss": 0.3056, + "step": 17004 + }, + { + "epoch": 0.49, + "grad_norm": 1.165320625490065, + "learning_rate": 5.352457893769385e-06, + "loss": 0.2874, + "step": 17005 + }, + { + "epoch": 0.49, + "grad_norm": 1.194666904243183, + "learning_rate": 5.351989353020475e-06, + "loss": 0.2929, + "step": 17006 + }, + { + "epoch": 0.49, + "grad_norm": 1.3133252748117556, + "learning_rate": 5.351520809165256e-06, + "loss": 0.3046, + "step": 17007 + }, + { + "epoch": 0.49, + "grad_norm": 1.343474749262745, + "learning_rate": 5.351052262207861e-06, + "loss": 0.3286, + "step": 17008 + }, + { + "epoch": 0.49, + "grad_norm": 1.2302474230468459, + "learning_rate": 5.350583712152426e-06, + "loss": 0.2848, + "step": 17009 + }, + { + "epoch": 0.49, + "grad_norm": 1.231107894466977, + "learning_rate": 5.350115159003088e-06, + "loss": 0.3017, + "step": 17010 + }, + { + "epoch": 0.49, + "grad_norm": 1.3525949152741645, + "learning_rate": 5.349646602763978e-06, + "loss": 0.3072, + "step": 17011 + }, + { + "epoch": 0.49, + "grad_norm": 3.0312358784993445, + "learning_rate": 5.349178043439233e-06, + "loss": 0.3345, + "step": 17012 + }, + { + "epoch": 0.49, + "grad_norm": 1.2743682081540146, + "learning_rate": 5.348709481032988e-06, + "loss": 0.2834, + "step": 17013 + }, + { + "epoch": 0.49, + "grad_norm": 1.3771585548616208, + "learning_rate": 5.348240915549379e-06, + "loss": 0.3219, + "step": 17014 + }, + { + "epoch": 0.49, + "grad_norm": 2.0345362813302477, + "learning_rate": 5.347772346992539e-06, + "loss": 0.286, + "step": 17015 + }, + { + "epoch": 0.49, + "grad_norm": 1.6615102175728227, + "learning_rate": 5.347303775366604e-06, + "loss": 0.2972, + "step": 17016 + }, + { + "epoch": 0.49, + "grad_norm": 1.3072615459447703, + "learning_rate": 5.3468352006757095e-06, + "loss": 0.3149, + "step": 17017 + }, + { + "epoch": 0.49, + "grad_norm": 1.2511104957985035, + "learning_rate": 5.3463666229239906e-06, + "loss": 0.2996, + "step": 17018 + }, + { + "epoch": 0.49, + "grad_norm": 1.2954358842727969, + "learning_rate": 5.345898042115584e-06, + "loss": 0.312, + "step": 17019 + }, + { + "epoch": 0.49, + "grad_norm": 1.4510865323769933, + "learning_rate": 5.345429458254622e-06, + "loss": 0.3469, + "step": 17020 + }, + { + "epoch": 0.49, + "grad_norm": 1.231157992484864, + "learning_rate": 5.34496087134524e-06, + "loss": 0.3165, + "step": 17021 + }, + { + "epoch": 0.49, + "grad_norm": 1.2168737188617627, + "learning_rate": 5.344492281391575e-06, + "loss": 0.3041, + "step": 17022 + }, + { + "epoch": 0.49, + "grad_norm": 1.327897940523532, + "learning_rate": 5.344023688397764e-06, + "loss": 0.3261, + "step": 17023 + }, + { + "epoch": 0.49, + "grad_norm": 1.9833906647156125, + "learning_rate": 5.343555092367938e-06, + "loss": 0.305, + "step": 17024 + }, + { + "epoch": 0.49, + "grad_norm": 1.254811732193306, + "learning_rate": 5.343086493306234e-06, + "loss": 0.3099, + "step": 17025 + }, + { + "epoch": 0.49, + "grad_norm": 1.2427655625225293, + "learning_rate": 5.342617891216789e-06, + "loss": 0.2975, + "step": 17026 + }, + { + "epoch": 0.49, + "grad_norm": 1.1956882701194633, + "learning_rate": 5.342149286103737e-06, + "loss": 0.2879, + "step": 17027 + }, + { + "epoch": 0.49, + "grad_norm": 1.3637967747643287, + "learning_rate": 5.341680677971215e-06, + "loss": 0.3114, + "step": 17028 + }, + { + "epoch": 0.49, + "grad_norm": 1.3256544419130103, + "learning_rate": 5.341212066823356e-06, + "loss": 0.3011, + "step": 17029 + }, + { + "epoch": 0.49, + "grad_norm": 1.2654861553000762, + "learning_rate": 5.340743452664297e-06, + "loss": 0.2925, + "step": 17030 + }, + { + "epoch": 0.49, + "grad_norm": 1.435314788663934, + "learning_rate": 5.340274835498171e-06, + "loss": 0.3009, + "step": 17031 + }, + { + "epoch": 0.49, + "grad_norm": 1.3493314148713873, + "learning_rate": 5.33980621532912e-06, + "loss": 0.2901, + "step": 17032 + }, + { + "epoch": 0.49, + "grad_norm": 1.4038999363572617, + "learning_rate": 5.339337592161273e-06, + "loss": 0.3534, + "step": 17033 + }, + { + "epoch": 0.49, + "grad_norm": 1.2421052798120997, + "learning_rate": 5.338868965998769e-06, + "loss": 0.2978, + "step": 17034 + }, + { + "epoch": 0.49, + "grad_norm": 1.6530200510842976, + "learning_rate": 5.338400336845741e-06, + "loss": 0.2869, + "step": 17035 + }, + { + "epoch": 0.49, + "grad_norm": 1.2522784855973075, + "learning_rate": 5.337931704706327e-06, + "loss": 0.3025, + "step": 17036 + }, + { + "epoch": 0.49, + "grad_norm": 1.3057539613514595, + "learning_rate": 5.337463069584663e-06, + "loss": 0.3249, + "step": 17037 + }, + { + "epoch": 0.49, + "grad_norm": 1.794170089426391, + "learning_rate": 5.336994431484883e-06, + "loss": 0.3231, + "step": 17038 + }, + { + "epoch": 0.49, + "grad_norm": 1.3179561527114312, + "learning_rate": 5.336525790411122e-06, + "loss": 0.3117, + "step": 17039 + }, + { + "epoch": 0.49, + "grad_norm": 1.2580423845400261, + "learning_rate": 5.336057146367518e-06, + "loss": 0.313, + "step": 17040 + }, + { + "epoch": 0.49, + "grad_norm": 1.5444779127919832, + "learning_rate": 5.335588499358208e-06, + "loss": 0.2973, + "step": 17041 + }, + { + "epoch": 0.49, + "grad_norm": 1.2157512114758577, + "learning_rate": 5.335119849387324e-06, + "loss": 0.3146, + "step": 17042 + }, + { + "epoch": 0.49, + "grad_norm": 1.3430337376427826, + "learning_rate": 5.334651196459003e-06, + "loss": 0.3123, + "step": 17043 + }, + { + "epoch": 0.49, + "grad_norm": 1.6434660099777474, + "learning_rate": 5.334182540577382e-06, + "loss": 0.309, + "step": 17044 + }, + { + "epoch": 0.49, + "grad_norm": 3.3026658218896214, + "learning_rate": 5.333713881746596e-06, + "loss": 0.3027, + "step": 17045 + }, + { + "epoch": 0.49, + "grad_norm": 1.2988296263398258, + "learning_rate": 5.333245219970783e-06, + "loss": 0.3062, + "step": 17046 + }, + { + "epoch": 0.49, + "grad_norm": 1.2178101287865006, + "learning_rate": 5.332776555254077e-06, + "loss": 0.3131, + "step": 17047 + }, + { + "epoch": 0.49, + "grad_norm": 1.1920286795905668, + "learning_rate": 5.3323078876006125e-06, + "loss": 0.3092, + "step": 17048 + }, + { + "epoch": 0.49, + "grad_norm": 1.9086343998991788, + "learning_rate": 5.331839217014528e-06, + "loss": 0.3033, + "step": 17049 + }, + { + "epoch": 0.49, + "grad_norm": 0.9481211923795914, + "learning_rate": 5.3313705434999585e-06, + "loss": 0.6134, + "step": 17050 + }, + { + "epoch": 0.49, + "grad_norm": 1.3860890688121734, + "learning_rate": 5.330901867061042e-06, + "loss": 0.3152, + "step": 17051 + }, + { + "epoch": 0.49, + "grad_norm": 1.2425471424828964, + "learning_rate": 5.33043318770191e-06, + "loss": 0.2962, + "step": 17052 + }, + { + "epoch": 0.49, + "grad_norm": 1.3095223496261728, + "learning_rate": 5.329964505426702e-06, + "loss": 0.3387, + "step": 17053 + }, + { + "epoch": 0.49, + "grad_norm": 3.012577870908305, + "learning_rate": 5.329495820239555e-06, + "loss": 0.3094, + "step": 17054 + }, + { + "epoch": 0.49, + "grad_norm": 1.3091904168461195, + "learning_rate": 5.3290271321446045e-06, + "loss": 0.2992, + "step": 17055 + }, + { + "epoch": 0.49, + "grad_norm": 1.7535004543585637, + "learning_rate": 5.328558441145983e-06, + "loss": 0.2867, + "step": 17056 + }, + { + "epoch": 0.49, + "grad_norm": 6.386177781294939, + "learning_rate": 5.328089747247831e-06, + "loss": 0.3097, + "step": 17057 + }, + { + "epoch": 0.49, + "grad_norm": 1.2274383117822887, + "learning_rate": 5.327621050454284e-06, + "loss": 0.306, + "step": 17058 + }, + { + "epoch": 0.49, + "grad_norm": 1.3772884249020787, + "learning_rate": 5.3271523507694775e-06, + "loss": 0.2783, + "step": 17059 + }, + { + "epoch": 0.49, + "grad_norm": 1.3610185791617373, + "learning_rate": 5.326683648197547e-06, + "loss": 0.3403, + "step": 17060 + }, + { + "epoch": 0.49, + "grad_norm": 1.327554846919282, + "learning_rate": 5.326214942742631e-06, + "loss": 0.3076, + "step": 17061 + }, + { + "epoch": 0.49, + "grad_norm": 0.9400271929251377, + "learning_rate": 5.325746234408863e-06, + "loss": 0.6053, + "step": 17062 + }, + { + "epoch": 0.49, + "grad_norm": 1.369578193045702, + "learning_rate": 5.325277523200382e-06, + "loss": 0.3209, + "step": 17063 + }, + { + "epoch": 0.49, + "grad_norm": 1.3361413095053463, + "learning_rate": 5.3248088091213226e-06, + "loss": 0.3191, + "step": 17064 + }, + { + "epoch": 0.49, + "grad_norm": 1.572701258450822, + "learning_rate": 5.324340092175822e-06, + "loss": 0.3149, + "step": 17065 + }, + { + "epoch": 0.49, + "grad_norm": 1.2549914144398706, + "learning_rate": 5.323871372368017e-06, + "loss": 0.3187, + "step": 17066 + }, + { + "epoch": 0.5, + "grad_norm": 2.707303337733352, + "learning_rate": 5.3234026497020416e-06, + "loss": 0.3172, + "step": 17067 + }, + { + "epoch": 0.5, + "grad_norm": 1.3796029504187413, + "learning_rate": 5.322933924182035e-06, + "loss": 0.3242, + "step": 17068 + }, + { + "epoch": 0.5, + "grad_norm": 1.3455657543082913, + "learning_rate": 5.3224651958121355e-06, + "loss": 0.3798, + "step": 17069 + }, + { + "epoch": 0.5, + "grad_norm": 1.3963632242010748, + "learning_rate": 5.321996464596475e-06, + "loss": 0.3086, + "step": 17070 + }, + { + "epoch": 0.5, + "grad_norm": 1.4013059944555228, + "learning_rate": 5.321527730539192e-06, + "loss": 0.311, + "step": 17071 + }, + { + "epoch": 0.5, + "grad_norm": 1.5274026489064363, + "learning_rate": 5.321058993644423e-06, + "loss": 0.3176, + "step": 17072 + }, + { + "epoch": 0.5, + "grad_norm": 1.2608873724935905, + "learning_rate": 5.3205902539163055e-06, + "loss": 0.3274, + "step": 17073 + }, + { + "epoch": 0.5, + "grad_norm": 1.340565690583205, + "learning_rate": 5.320121511358976e-06, + "loss": 0.3016, + "step": 17074 + }, + { + "epoch": 0.5, + "grad_norm": 1.279709773584564, + "learning_rate": 5.31965276597657e-06, + "loss": 0.3129, + "step": 17075 + }, + { + "epoch": 0.5, + "grad_norm": 0.8405053960337217, + "learning_rate": 5.319184017773225e-06, + "loss": 0.5057, + "step": 17076 + }, + { + "epoch": 0.5, + "grad_norm": 1.5630118977582517, + "learning_rate": 5.318715266753076e-06, + "loss": 0.3157, + "step": 17077 + }, + { + "epoch": 0.5, + "grad_norm": 1.445419023771088, + "learning_rate": 5.318246512920264e-06, + "loss": 0.3225, + "step": 17078 + }, + { + "epoch": 0.5, + "grad_norm": 1.3049347802169657, + "learning_rate": 5.3177777562789215e-06, + "loss": 0.3245, + "step": 17079 + }, + { + "epoch": 0.5, + "grad_norm": 1.448067834851523, + "learning_rate": 5.3173089968331856e-06, + "loss": 0.3077, + "step": 17080 + }, + { + "epoch": 0.5, + "grad_norm": 1.2607920432365791, + "learning_rate": 5.316840234587195e-06, + "loss": 0.2972, + "step": 17081 + }, + { + "epoch": 0.5, + "grad_norm": 1.2843752948949405, + "learning_rate": 5.316371469545086e-06, + "loss": 0.3069, + "step": 17082 + }, + { + "epoch": 0.5, + "grad_norm": 1.6188432155464523, + "learning_rate": 5.315902701710997e-06, + "loss": 0.3448, + "step": 17083 + }, + { + "epoch": 0.5, + "grad_norm": 1.2629659862438767, + "learning_rate": 5.315433931089062e-06, + "loss": 0.3032, + "step": 17084 + }, + { + "epoch": 0.5, + "grad_norm": 1.3807292573286736, + "learning_rate": 5.314965157683417e-06, + "loss": 0.3198, + "step": 17085 + }, + { + "epoch": 0.5, + "grad_norm": 1.7767031898618246, + "learning_rate": 5.314496381498203e-06, + "loss": 0.3169, + "step": 17086 + }, + { + "epoch": 0.5, + "grad_norm": 1.3141126804518097, + "learning_rate": 5.3140276025375545e-06, + "loss": 0.3028, + "step": 17087 + }, + { + "epoch": 0.5, + "grad_norm": 1.4956815216729191, + "learning_rate": 5.31355882080561e-06, + "loss": 0.3195, + "step": 17088 + }, + { + "epoch": 0.5, + "grad_norm": 1.2331668352362144, + "learning_rate": 5.3130900363065055e-06, + "loss": 0.3358, + "step": 17089 + }, + { + "epoch": 0.5, + "grad_norm": 1.275061231990434, + "learning_rate": 5.312621249044376e-06, + "loss": 0.3036, + "step": 17090 + }, + { + "epoch": 0.5, + "grad_norm": 1.4891437714768916, + "learning_rate": 5.3121524590233605e-06, + "loss": 0.3152, + "step": 17091 + }, + { + "epoch": 0.5, + "grad_norm": 1.3714534847271678, + "learning_rate": 5.311683666247598e-06, + "loss": 0.2938, + "step": 17092 + }, + { + "epoch": 0.5, + "grad_norm": 1.4923789433494088, + "learning_rate": 5.311214870721223e-06, + "loss": 0.3192, + "step": 17093 + }, + { + "epoch": 0.5, + "grad_norm": 1.335998342456834, + "learning_rate": 5.310746072448373e-06, + "loss": 0.3058, + "step": 17094 + }, + { + "epoch": 0.5, + "grad_norm": 1.3729390721760288, + "learning_rate": 5.310277271433185e-06, + "loss": 0.3157, + "step": 17095 + }, + { + "epoch": 0.5, + "grad_norm": 1.3975489246714283, + "learning_rate": 5.309808467679798e-06, + "loss": 0.3119, + "step": 17096 + }, + { + "epoch": 0.5, + "grad_norm": 1.2007875604537315, + "learning_rate": 5.309339661192347e-06, + "loss": 0.3115, + "step": 17097 + }, + { + "epoch": 0.5, + "grad_norm": 1.375527586620026, + "learning_rate": 5.308870851974969e-06, + "loss": 0.3237, + "step": 17098 + }, + { + "epoch": 0.5, + "grad_norm": 1.3028700975495353, + "learning_rate": 5.308402040031804e-06, + "loss": 0.3096, + "step": 17099 + }, + { + "epoch": 0.5, + "grad_norm": 1.249816136809004, + "learning_rate": 5.307933225366986e-06, + "loss": 0.2946, + "step": 17100 + }, + { + "epoch": 0.5, + "grad_norm": 1.548600179224193, + "learning_rate": 5.307464407984655e-06, + "loss": 0.3253, + "step": 17101 + }, + { + "epoch": 0.5, + "grad_norm": 1.5456995394533246, + "learning_rate": 5.306995587888947e-06, + "loss": 0.3306, + "step": 17102 + }, + { + "epoch": 0.5, + "grad_norm": 1.3877646010823796, + "learning_rate": 5.306526765084001e-06, + "loss": 0.3064, + "step": 17103 + }, + { + "epoch": 0.5, + "grad_norm": 1.49651810318266, + "learning_rate": 5.306057939573951e-06, + "loss": 0.3285, + "step": 17104 + }, + { + "epoch": 0.5, + "grad_norm": 2.718556253809425, + "learning_rate": 5.3055891113629355e-06, + "loss": 0.3334, + "step": 17105 + }, + { + "epoch": 0.5, + "grad_norm": 1.3557453051089412, + "learning_rate": 5.305120280455095e-06, + "loss": 0.299, + "step": 17106 + }, + { + "epoch": 0.5, + "grad_norm": 1.3427813085185294, + "learning_rate": 5.304651446854563e-06, + "loss": 0.2748, + "step": 17107 + }, + { + "epoch": 0.5, + "grad_norm": 1.4032008484870315, + "learning_rate": 5.304182610565479e-06, + "loss": 0.3382, + "step": 17108 + }, + { + "epoch": 0.5, + "grad_norm": 1.164667164824449, + "learning_rate": 5.303713771591979e-06, + "loss": 0.3019, + "step": 17109 + }, + { + "epoch": 0.5, + "grad_norm": 1.439825162918473, + "learning_rate": 5.303244929938205e-06, + "loss": 0.2914, + "step": 17110 + }, + { + "epoch": 0.5, + "grad_norm": 1.231576182274259, + "learning_rate": 5.302776085608288e-06, + "loss": 0.3064, + "step": 17111 + }, + { + "epoch": 0.5, + "grad_norm": 1.4033851255751706, + "learning_rate": 5.30230723860637e-06, + "loss": 0.361, + "step": 17112 + }, + { + "epoch": 0.5, + "grad_norm": 1.5826891882167167, + "learning_rate": 5.301838388936587e-06, + "loss": 0.2902, + "step": 17113 + }, + { + "epoch": 0.5, + "grad_norm": 1.1933597814764003, + "learning_rate": 5.301369536603077e-06, + "loss": 0.298, + "step": 17114 + }, + { + "epoch": 0.5, + "grad_norm": 1.288541116014173, + "learning_rate": 5.300900681609976e-06, + "loss": 0.3201, + "step": 17115 + }, + { + "epoch": 0.5, + "grad_norm": 1.1954024006377122, + "learning_rate": 5.300431823961425e-06, + "loss": 0.3017, + "step": 17116 + }, + { + "epoch": 0.5, + "grad_norm": 1.6510126857505167, + "learning_rate": 5.29996296366156e-06, + "loss": 0.3352, + "step": 17117 + }, + { + "epoch": 0.5, + "grad_norm": 1.2053539420234907, + "learning_rate": 5.299494100714517e-06, + "loss": 0.3354, + "step": 17118 + }, + { + "epoch": 0.5, + "grad_norm": 1.4185851770370965, + "learning_rate": 5.2990252351244365e-06, + "loss": 0.3093, + "step": 17119 + }, + { + "epoch": 0.5, + "grad_norm": 1.274298942572941, + "learning_rate": 5.298556366895454e-06, + "loss": 0.3077, + "step": 17120 + }, + { + "epoch": 0.5, + "grad_norm": 1.690451671873913, + "learning_rate": 5.29808749603171e-06, + "loss": 0.2891, + "step": 17121 + }, + { + "epoch": 0.5, + "grad_norm": 1.319103237166238, + "learning_rate": 5.297618622537338e-06, + "loss": 0.3233, + "step": 17122 + }, + { + "epoch": 0.5, + "grad_norm": 1.2215314510054531, + "learning_rate": 5.297149746416479e-06, + "loss": 0.3023, + "step": 17123 + }, + { + "epoch": 0.5, + "grad_norm": 1.599025042829015, + "learning_rate": 5.2966808676732715e-06, + "loss": 0.3186, + "step": 17124 + }, + { + "epoch": 0.5, + "grad_norm": 1.517685624735434, + "learning_rate": 5.296211986311851e-06, + "loss": 0.3155, + "step": 17125 + }, + { + "epoch": 0.5, + "grad_norm": 1.4620324270926093, + "learning_rate": 5.295743102336357e-06, + "loss": 0.324, + "step": 17126 + }, + { + "epoch": 0.5, + "grad_norm": 1.355687806082902, + "learning_rate": 5.295274215750926e-06, + "loss": 0.2988, + "step": 17127 + }, + { + "epoch": 0.5, + "grad_norm": 1.4970574713445284, + "learning_rate": 5.294805326559699e-06, + "loss": 0.3208, + "step": 17128 + }, + { + "epoch": 0.5, + "grad_norm": 1.2204288943705817, + "learning_rate": 5.294336434766809e-06, + "loss": 0.3286, + "step": 17129 + }, + { + "epoch": 0.5, + "grad_norm": 1.4227491605409266, + "learning_rate": 5.293867540376398e-06, + "loss": 0.3131, + "step": 17130 + }, + { + "epoch": 0.5, + "grad_norm": 1.3861968006511238, + "learning_rate": 5.2933986433926034e-06, + "loss": 0.3055, + "step": 17131 + }, + { + "epoch": 0.5, + "grad_norm": 1.5802422422276658, + "learning_rate": 5.292929743819562e-06, + "loss": 0.3003, + "step": 17132 + }, + { + "epoch": 0.5, + "grad_norm": 1.4434182990293454, + "learning_rate": 5.292460841661412e-06, + "loss": 0.3213, + "step": 17133 + }, + { + "epoch": 0.5, + "grad_norm": 1.2313783489142391, + "learning_rate": 5.291991936922292e-06, + "loss": 0.2791, + "step": 17134 + }, + { + "epoch": 0.5, + "grad_norm": 1.3728472256584932, + "learning_rate": 5.291523029606339e-06, + "loss": 0.304, + "step": 17135 + }, + { + "epoch": 0.5, + "grad_norm": 1.1482196801243134, + "learning_rate": 5.291054119717692e-06, + "loss": 0.3036, + "step": 17136 + }, + { + "epoch": 0.5, + "grad_norm": 1.2680163914886409, + "learning_rate": 5.29058520726049e-06, + "loss": 0.2873, + "step": 17137 + }, + { + "epoch": 0.5, + "grad_norm": 1.3057097209883934, + "learning_rate": 5.2901162922388715e-06, + "loss": 0.3123, + "step": 17138 + }, + { + "epoch": 0.5, + "grad_norm": 1.3325296871588397, + "learning_rate": 5.289647374656972e-06, + "loss": 0.3305, + "step": 17139 + }, + { + "epoch": 0.5, + "grad_norm": 2.0058689702365204, + "learning_rate": 5.289178454518931e-06, + "loss": 0.307, + "step": 17140 + }, + { + "epoch": 0.5, + "grad_norm": 1.3011381200787175, + "learning_rate": 5.288709531828888e-06, + "loss": 0.3408, + "step": 17141 + }, + { + "epoch": 0.5, + "grad_norm": 1.1882322227158546, + "learning_rate": 5.28824060659098e-06, + "loss": 0.2855, + "step": 17142 + }, + { + "epoch": 0.5, + "grad_norm": 1.2792072593043793, + "learning_rate": 5.287771678809344e-06, + "loss": 0.3353, + "step": 17143 + }, + { + "epoch": 0.5, + "grad_norm": 1.283163201633871, + "learning_rate": 5.28730274848812e-06, + "loss": 0.3573, + "step": 17144 + }, + { + "epoch": 0.5, + "grad_norm": 1.3283412419654275, + "learning_rate": 5.286833815631448e-06, + "loss": 0.295, + "step": 17145 + }, + { + "epoch": 0.5, + "grad_norm": 1.3457828565056271, + "learning_rate": 5.286364880243463e-06, + "loss": 0.3052, + "step": 17146 + }, + { + "epoch": 0.5, + "grad_norm": 1.340314071222909, + "learning_rate": 5.285895942328306e-06, + "loss": 0.3018, + "step": 17147 + }, + { + "epoch": 0.5, + "grad_norm": 1.2537589181681164, + "learning_rate": 5.2854270018901125e-06, + "loss": 0.2954, + "step": 17148 + }, + { + "epoch": 0.5, + "grad_norm": 1.568688481994215, + "learning_rate": 5.284958058933023e-06, + "loss": 0.306, + "step": 17149 + }, + { + "epoch": 0.5, + "grad_norm": 1.3485821928651163, + "learning_rate": 5.284489113461175e-06, + "loss": 0.2953, + "step": 17150 + }, + { + "epoch": 0.5, + "grad_norm": 1.4202526725493632, + "learning_rate": 5.284020165478709e-06, + "loss": 0.3106, + "step": 17151 + }, + { + "epoch": 0.5, + "grad_norm": 1.4499627089664597, + "learning_rate": 5.2835512149897605e-06, + "loss": 0.3119, + "step": 17152 + }, + { + "epoch": 0.5, + "grad_norm": 1.3623323883502887, + "learning_rate": 5.28308226199847e-06, + "loss": 0.3111, + "step": 17153 + }, + { + "epoch": 0.5, + "grad_norm": 1.3059934054264153, + "learning_rate": 5.282613306508974e-06, + "loss": 0.319, + "step": 17154 + }, + { + "epoch": 0.5, + "grad_norm": 1.4181196106173433, + "learning_rate": 5.282144348525414e-06, + "loss": 0.3234, + "step": 17155 + }, + { + "epoch": 0.5, + "grad_norm": 1.1989985901243505, + "learning_rate": 5.281675388051928e-06, + "loss": 0.2932, + "step": 17156 + }, + { + "epoch": 0.5, + "grad_norm": 1.322055334842008, + "learning_rate": 5.281206425092651e-06, + "loss": 0.3072, + "step": 17157 + }, + { + "epoch": 0.5, + "grad_norm": 1.493721545637182, + "learning_rate": 5.2807374596517255e-06, + "loss": 0.3204, + "step": 17158 + }, + { + "epoch": 0.5, + "grad_norm": 1.510896303077716, + "learning_rate": 5.280268491733289e-06, + "loss": 0.3391, + "step": 17159 + }, + { + "epoch": 0.5, + "grad_norm": 1.330554063601117, + "learning_rate": 5.27979952134148e-06, + "loss": 0.3356, + "step": 17160 + }, + { + "epoch": 0.5, + "grad_norm": 1.393040254920078, + "learning_rate": 5.279330548480437e-06, + "loss": 0.3013, + "step": 17161 + }, + { + "epoch": 0.5, + "grad_norm": 1.778023756548033, + "learning_rate": 5.2788615731542996e-06, + "loss": 0.307, + "step": 17162 + }, + { + "epoch": 0.5, + "grad_norm": 1.305032829511103, + "learning_rate": 5.278392595367204e-06, + "loss": 0.3245, + "step": 17163 + }, + { + "epoch": 0.5, + "grad_norm": 1.2051702563233209, + "learning_rate": 5.27792361512329e-06, + "loss": 0.3206, + "step": 17164 + }, + { + "epoch": 0.5, + "grad_norm": 1.4651496382625426, + "learning_rate": 5.277454632426699e-06, + "loss": 0.3085, + "step": 17165 + }, + { + "epoch": 0.5, + "grad_norm": 1.4682671637316957, + "learning_rate": 5.276985647281567e-06, + "loss": 0.3092, + "step": 17166 + }, + { + "epoch": 0.5, + "grad_norm": 1.2182677766955428, + "learning_rate": 5.276516659692033e-06, + "loss": 0.3046, + "step": 17167 + }, + { + "epoch": 0.5, + "grad_norm": 1.2997114460817278, + "learning_rate": 5.2760476696622375e-06, + "loss": 0.3205, + "step": 17168 + }, + { + "epoch": 0.5, + "grad_norm": 1.3962111126434813, + "learning_rate": 5.275578677196319e-06, + "loss": 0.3121, + "step": 17169 + }, + { + "epoch": 0.5, + "grad_norm": 1.2280476378267506, + "learning_rate": 5.275109682298415e-06, + "loss": 0.3219, + "step": 17170 + }, + { + "epoch": 0.5, + "grad_norm": 1.813948737072494, + "learning_rate": 5.274640684972664e-06, + "loss": 0.3029, + "step": 17171 + }, + { + "epoch": 0.5, + "grad_norm": 1.2623269350195867, + "learning_rate": 5.274171685223206e-06, + "loss": 0.3068, + "step": 17172 + }, + { + "epoch": 0.5, + "grad_norm": 1.2389508776740408, + "learning_rate": 5.27370268305418e-06, + "loss": 0.3166, + "step": 17173 + }, + { + "epoch": 0.5, + "grad_norm": 1.3318354132258445, + "learning_rate": 5.2732336784697265e-06, + "loss": 0.3014, + "step": 17174 + }, + { + "epoch": 0.5, + "grad_norm": 1.6198767199546855, + "learning_rate": 5.272764671473981e-06, + "loss": 0.301, + "step": 17175 + }, + { + "epoch": 0.5, + "grad_norm": 1.326995557212613, + "learning_rate": 5.272295662071084e-06, + "loss": 0.318, + "step": 17176 + }, + { + "epoch": 0.5, + "grad_norm": 1.239688425638496, + "learning_rate": 5.271826650265176e-06, + "loss": 0.3094, + "step": 17177 + }, + { + "epoch": 0.5, + "grad_norm": 1.2497162550741574, + "learning_rate": 5.2713576360603936e-06, + "loss": 0.3053, + "step": 17178 + }, + { + "epoch": 0.5, + "grad_norm": 1.3638159888281087, + "learning_rate": 5.2708886194608775e-06, + "loss": 0.3328, + "step": 17179 + }, + { + "epoch": 0.5, + "grad_norm": 1.3138938883297504, + "learning_rate": 5.270419600470766e-06, + "loss": 0.3259, + "step": 17180 + }, + { + "epoch": 0.5, + "grad_norm": 1.231184892415013, + "learning_rate": 5.269950579094199e-06, + "loss": 0.3013, + "step": 17181 + }, + { + "epoch": 0.5, + "grad_norm": 1.2915159148740294, + "learning_rate": 5.269481555335315e-06, + "loss": 0.2976, + "step": 17182 + }, + { + "epoch": 0.5, + "grad_norm": 1.7991715032901754, + "learning_rate": 5.269012529198254e-06, + "loss": 0.3212, + "step": 17183 + }, + { + "epoch": 0.5, + "grad_norm": 1.7483926890539228, + "learning_rate": 5.268543500687154e-06, + "loss": 0.3204, + "step": 17184 + }, + { + "epoch": 0.5, + "grad_norm": 1.319936239691046, + "learning_rate": 5.2680744698061535e-06, + "loss": 0.318, + "step": 17185 + }, + { + "epoch": 0.5, + "grad_norm": 1.2847207087168953, + "learning_rate": 5.267605436559394e-06, + "loss": 0.295, + "step": 17186 + }, + { + "epoch": 0.5, + "grad_norm": 1.2797237372664485, + "learning_rate": 5.267136400951014e-06, + "loss": 0.3304, + "step": 17187 + }, + { + "epoch": 0.5, + "grad_norm": 2.388333702975635, + "learning_rate": 5.2666673629851526e-06, + "loss": 0.3224, + "step": 17188 + }, + { + "epoch": 0.5, + "grad_norm": 1.2073551166309138, + "learning_rate": 5.266198322665948e-06, + "loss": 0.3308, + "step": 17189 + }, + { + "epoch": 0.5, + "grad_norm": 1.2679238996311406, + "learning_rate": 5.265729279997539e-06, + "loss": 0.3071, + "step": 17190 + }, + { + "epoch": 0.5, + "grad_norm": 1.3889254851743658, + "learning_rate": 5.265260234984067e-06, + "loss": 0.3104, + "step": 17191 + }, + { + "epoch": 0.5, + "grad_norm": 1.2917327088883268, + "learning_rate": 5.264791187629672e-06, + "loss": 0.3095, + "step": 17192 + }, + { + "epoch": 0.5, + "grad_norm": 1.2497374641021404, + "learning_rate": 5.26432213793849e-06, + "loss": 0.3063, + "step": 17193 + }, + { + "epoch": 0.5, + "grad_norm": 1.3059344418845427, + "learning_rate": 5.263853085914662e-06, + "loss": 0.3285, + "step": 17194 + }, + { + "epoch": 0.5, + "grad_norm": 1.2548548204922434, + "learning_rate": 5.263384031562329e-06, + "loss": 0.2859, + "step": 17195 + }, + { + "epoch": 0.5, + "grad_norm": 1.2086129918776722, + "learning_rate": 5.262914974885627e-06, + "loss": 0.3872, + "step": 17196 + }, + { + "epoch": 0.5, + "grad_norm": 1.3778488025673816, + "learning_rate": 5.2624459158887e-06, + "loss": 0.3105, + "step": 17197 + }, + { + "epoch": 0.5, + "grad_norm": 1.337370524947533, + "learning_rate": 5.261976854575683e-06, + "loss": 0.3461, + "step": 17198 + }, + { + "epoch": 0.5, + "grad_norm": 2.3646273327674447, + "learning_rate": 5.2615077909507174e-06, + "loss": 0.3044, + "step": 17199 + }, + { + "epoch": 0.5, + "grad_norm": 1.9710395570160664, + "learning_rate": 5.2610387250179426e-06, + "loss": 0.3197, + "step": 17200 + }, + { + "epoch": 0.5, + "grad_norm": 1.3777832586356316, + "learning_rate": 5.2605696567815e-06, + "loss": 0.3143, + "step": 17201 + }, + { + "epoch": 0.5, + "grad_norm": 1.4720167781048814, + "learning_rate": 5.260100586245527e-06, + "loss": 0.3213, + "step": 17202 + }, + { + "epoch": 0.5, + "grad_norm": 1.3161897252311232, + "learning_rate": 5.259631513414162e-06, + "loss": 0.3072, + "step": 17203 + }, + { + "epoch": 0.5, + "grad_norm": 1.3646287929996233, + "learning_rate": 5.259162438291546e-06, + "loss": 0.3139, + "step": 17204 + }, + { + "epoch": 0.5, + "grad_norm": 4.3425702067430985, + "learning_rate": 5.2586933608818195e-06, + "loss": 0.3037, + "step": 17205 + }, + { + "epoch": 0.5, + "grad_norm": 1.345988877569255, + "learning_rate": 5.2582242811891215e-06, + "loss": 0.3276, + "step": 17206 + }, + { + "epoch": 0.5, + "grad_norm": 1.357118412515483, + "learning_rate": 5.25775519921759e-06, + "loss": 0.3019, + "step": 17207 + }, + { + "epoch": 0.5, + "grad_norm": 1.8362606651645381, + "learning_rate": 5.257286114971366e-06, + "loss": 0.3179, + "step": 17208 + }, + { + "epoch": 0.5, + "grad_norm": 1.471413778249388, + "learning_rate": 5.2568170284545905e-06, + "loss": 0.3064, + "step": 17209 + }, + { + "epoch": 0.5, + "grad_norm": 1.3350997657771162, + "learning_rate": 5.256347939671401e-06, + "loss": 0.3484, + "step": 17210 + }, + { + "epoch": 0.5, + "grad_norm": 1.2867238964384262, + "learning_rate": 5.255878848625938e-06, + "loss": 0.3217, + "step": 17211 + }, + { + "epoch": 0.5, + "grad_norm": 1.388208589126048, + "learning_rate": 5.255409755322342e-06, + "loss": 0.313, + "step": 17212 + }, + { + "epoch": 0.5, + "grad_norm": 1.258234009224529, + "learning_rate": 5.254940659764751e-06, + "loss": 0.2905, + "step": 17213 + }, + { + "epoch": 0.5, + "grad_norm": 1.6207288536350983, + "learning_rate": 5.254471561957307e-06, + "loss": 0.3205, + "step": 17214 + }, + { + "epoch": 0.5, + "grad_norm": 1.456542126706171, + "learning_rate": 5.254002461904149e-06, + "loss": 0.3026, + "step": 17215 + }, + { + "epoch": 0.5, + "grad_norm": 1.4494626449692392, + "learning_rate": 5.253533359609417e-06, + "loss": 0.3022, + "step": 17216 + }, + { + "epoch": 0.5, + "grad_norm": 1.3599280993375031, + "learning_rate": 5.253064255077248e-06, + "loss": 0.3153, + "step": 17217 + }, + { + "epoch": 0.5, + "grad_norm": 1.225281388915489, + "learning_rate": 5.2525951483117844e-06, + "loss": 0.2849, + "step": 17218 + }, + { + "epoch": 0.5, + "grad_norm": 0.9897712224007532, + "learning_rate": 5.252126039317167e-06, + "loss": 0.5758, + "step": 17219 + }, + { + "epoch": 0.5, + "grad_norm": 1.21393297002426, + "learning_rate": 5.251656928097535e-06, + "loss": 0.3071, + "step": 17220 + }, + { + "epoch": 0.5, + "grad_norm": 1.32680639101295, + "learning_rate": 5.251187814657026e-06, + "loss": 0.3068, + "step": 17221 + }, + { + "epoch": 0.5, + "grad_norm": 2.9471594659791758, + "learning_rate": 5.250718698999783e-06, + "loss": 0.3475, + "step": 17222 + }, + { + "epoch": 0.5, + "grad_norm": 1.5574260297787306, + "learning_rate": 5.250249581129944e-06, + "loss": 0.3049, + "step": 17223 + }, + { + "epoch": 0.5, + "grad_norm": 1.2596289793619582, + "learning_rate": 5.24978046105165e-06, + "loss": 0.3289, + "step": 17224 + }, + { + "epoch": 0.5, + "grad_norm": 1.372164332618948, + "learning_rate": 5.249311338769041e-06, + "loss": 0.3101, + "step": 17225 + }, + { + "epoch": 0.5, + "grad_norm": 1.8702402837277148, + "learning_rate": 5.248842214286255e-06, + "loss": 0.3299, + "step": 17226 + }, + { + "epoch": 0.5, + "grad_norm": 1.4684647282477048, + "learning_rate": 5.248373087607434e-06, + "loss": 0.2862, + "step": 17227 + }, + { + "epoch": 0.5, + "grad_norm": 1.27880203998993, + "learning_rate": 5.247903958736721e-06, + "loss": 0.2917, + "step": 17228 + }, + { + "epoch": 0.5, + "grad_norm": 1.219718577051057, + "learning_rate": 5.247434827678249e-06, + "loss": 0.3085, + "step": 17229 + }, + { + "epoch": 0.5, + "grad_norm": 1.5746795843166053, + "learning_rate": 5.2469656944361646e-06, + "loss": 0.2999, + "step": 17230 + }, + { + "epoch": 0.5, + "grad_norm": 1.2766637811030552, + "learning_rate": 5.246496559014604e-06, + "loss": 0.3237, + "step": 17231 + }, + { + "epoch": 0.5, + "grad_norm": 1.4329258899712707, + "learning_rate": 5.246027421417709e-06, + "loss": 0.3348, + "step": 17232 + }, + { + "epoch": 0.5, + "grad_norm": 5.355974314739818, + "learning_rate": 5.245558281649618e-06, + "loss": 0.3014, + "step": 17233 + }, + { + "epoch": 0.5, + "grad_norm": 1.3504665116354813, + "learning_rate": 5.2450891397144745e-06, + "loss": 0.3066, + "step": 17234 + }, + { + "epoch": 0.5, + "grad_norm": 1.403454778103199, + "learning_rate": 5.244619995616414e-06, + "loss": 0.3101, + "step": 17235 + }, + { + "epoch": 0.5, + "grad_norm": 1.4832731929844538, + "learning_rate": 5.244150849359582e-06, + "loss": 0.3105, + "step": 17236 + }, + { + "epoch": 0.5, + "grad_norm": 2.204968362619013, + "learning_rate": 5.243681700948115e-06, + "loss": 0.2946, + "step": 17237 + }, + { + "epoch": 0.5, + "grad_norm": 1.6224234724249265, + "learning_rate": 5.243212550386154e-06, + "loss": 0.2946, + "step": 17238 + }, + { + "epoch": 0.5, + "grad_norm": 1.5098948072290386, + "learning_rate": 5.242743397677839e-06, + "loss": 0.3034, + "step": 17239 + }, + { + "epoch": 0.5, + "grad_norm": 1.2402108536699485, + "learning_rate": 5.242274242827311e-06, + "loss": 0.3084, + "step": 17240 + }, + { + "epoch": 0.5, + "grad_norm": 1.2763509251514145, + "learning_rate": 5.24180508583871e-06, + "loss": 0.2956, + "step": 17241 + }, + { + "epoch": 0.5, + "grad_norm": 1.2058155543461329, + "learning_rate": 5.241335926716179e-06, + "loss": 0.3094, + "step": 17242 + }, + { + "epoch": 0.5, + "grad_norm": 1.790151013519953, + "learning_rate": 5.2408667654638536e-06, + "loss": 0.2989, + "step": 17243 + }, + { + "epoch": 0.5, + "grad_norm": 1.4323056741694575, + "learning_rate": 5.2403976020858775e-06, + "loss": 0.304, + "step": 17244 + }, + { + "epoch": 0.5, + "grad_norm": 1.3882460340180416, + "learning_rate": 5.239928436586388e-06, + "loss": 0.315, + "step": 17245 + }, + { + "epoch": 0.5, + "grad_norm": 1.455076398620317, + "learning_rate": 5.239459268969528e-06, + "loss": 0.2962, + "step": 17246 + }, + { + "epoch": 0.5, + "grad_norm": 1.211508892837057, + "learning_rate": 5.238990099239438e-06, + "loss": 0.3072, + "step": 17247 + }, + { + "epoch": 0.5, + "grad_norm": 1.4706350964778467, + "learning_rate": 5.238520927400257e-06, + "loss": 0.3094, + "step": 17248 + }, + { + "epoch": 0.5, + "grad_norm": 1.4770342436981063, + "learning_rate": 5.238051753456126e-06, + "loss": 0.3207, + "step": 17249 + }, + { + "epoch": 0.5, + "grad_norm": 1.3358498252131528, + "learning_rate": 5.2375825774111865e-06, + "loss": 0.3282, + "step": 17250 + }, + { + "epoch": 0.5, + "grad_norm": 1.2514240471029874, + "learning_rate": 5.237113399269577e-06, + "loss": 0.3173, + "step": 17251 + }, + { + "epoch": 0.5, + "grad_norm": 1.1728673780548822, + "learning_rate": 5.23664421903544e-06, + "loss": 0.2731, + "step": 17252 + }, + { + "epoch": 0.5, + "grad_norm": 3.403340551729005, + "learning_rate": 5.236175036712915e-06, + "loss": 0.3311, + "step": 17253 + }, + { + "epoch": 0.5, + "grad_norm": 1.34809046862506, + "learning_rate": 5.235705852306143e-06, + "loss": 0.3262, + "step": 17254 + }, + { + "epoch": 0.5, + "grad_norm": 1.1741033035019846, + "learning_rate": 5.235236665819264e-06, + "loss": 0.3043, + "step": 17255 + }, + { + "epoch": 0.5, + "grad_norm": 1.6331785565360184, + "learning_rate": 5.23476747725642e-06, + "loss": 0.3172, + "step": 17256 + }, + { + "epoch": 0.5, + "grad_norm": 1.273790696197737, + "learning_rate": 5.2342982866217475e-06, + "loss": 0.3227, + "step": 17257 + }, + { + "epoch": 0.5, + "grad_norm": 1.4652038873055626, + "learning_rate": 5.233829093919393e-06, + "loss": 0.3021, + "step": 17258 + }, + { + "epoch": 0.5, + "grad_norm": 2.549123327974537, + "learning_rate": 5.233359899153492e-06, + "loss": 0.3225, + "step": 17259 + }, + { + "epoch": 0.5, + "grad_norm": 2.726867244826506, + "learning_rate": 5.232890702328189e-06, + "loss": 0.3022, + "step": 17260 + }, + { + "epoch": 0.5, + "grad_norm": 1.3089337156766014, + "learning_rate": 5.232421503447622e-06, + "loss": 0.3222, + "step": 17261 + }, + { + "epoch": 0.5, + "grad_norm": 1.5218571461193822, + "learning_rate": 5.231952302515932e-06, + "loss": 0.3311, + "step": 17262 + }, + { + "epoch": 0.5, + "grad_norm": 1.1924288767371696, + "learning_rate": 5.2314830995372616e-06, + "loss": 0.2938, + "step": 17263 + }, + { + "epoch": 0.5, + "grad_norm": 1.2756531428427966, + "learning_rate": 5.231013894515748e-06, + "loss": 0.3348, + "step": 17264 + }, + { + "epoch": 0.5, + "grad_norm": 1.2861395852248545, + "learning_rate": 5.230544687455538e-06, + "loss": 0.3016, + "step": 17265 + }, + { + "epoch": 0.5, + "grad_norm": 1.2792186933376894, + "learning_rate": 5.230075478360767e-06, + "loss": 0.2954, + "step": 17266 + }, + { + "epoch": 0.5, + "grad_norm": 1.4515267519321569, + "learning_rate": 5.229606267235575e-06, + "loss": 0.2983, + "step": 17267 + }, + { + "epoch": 0.5, + "grad_norm": 1.300347142382546, + "learning_rate": 5.229137054084108e-06, + "loss": 0.3276, + "step": 17268 + }, + { + "epoch": 0.5, + "grad_norm": 1.3445541859053203, + "learning_rate": 5.228667838910502e-06, + "loss": 0.3166, + "step": 17269 + }, + { + "epoch": 0.5, + "grad_norm": 1.2292407231636464, + "learning_rate": 5.228198621718902e-06, + "loss": 0.3102, + "step": 17270 + }, + { + "epoch": 0.5, + "grad_norm": 1.413073547372504, + "learning_rate": 5.227729402513445e-06, + "loss": 0.3411, + "step": 17271 + }, + { + "epoch": 0.5, + "grad_norm": 1.3283013650481812, + "learning_rate": 5.227260181298275e-06, + "loss": 0.2995, + "step": 17272 + }, + { + "epoch": 0.5, + "grad_norm": 1.2291345119982997, + "learning_rate": 5.22679095807753e-06, + "loss": 0.299, + "step": 17273 + }, + { + "epoch": 0.5, + "grad_norm": 1.3035156945162059, + "learning_rate": 5.2263217328553516e-06, + "loss": 0.3224, + "step": 17274 + }, + { + "epoch": 0.5, + "grad_norm": 1.3878515255698491, + "learning_rate": 5.225852505635883e-06, + "loss": 0.3517, + "step": 17275 + }, + { + "epoch": 0.5, + "grad_norm": 1.2430699890925865, + "learning_rate": 5.225383276423264e-06, + "loss": 0.3643, + "step": 17276 + }, + { + "epoch": 0.5, + "grad_norm": 1.3125805607430951, + "learning_rate": 5.224914045221633e-06, + "loss": 0.3202, + "step": 17277 + }, + { + "epoch": 0.5, + "grad_norm": 1.1753419818525277, + "learning_rate": 5.224444812035133e-06, + "loss": 0.2926, + "step": 17278 + }, + { + "epoch": 0.5, + "grad_norm": 1.3289090457026829, + "learning_rate": 5.223975576867908e-06, + "loss": 0.3203, + "step": 17279 + }, + { + "epoch": 0.5, + "grad_norm": 1.2686139518247186, + "learning_rate": 5.223506339724094e-06, + "loss": 0.2959, + "step": 17280 + }, + { + "epoch": 0.5, + "grad_norm": 1.2264197122673353, + "learning_rate": 5.223037100607833e-06, + "loss": 0.3119, + "step": 17281 + }, + { + "epoch": 0.5, + "grad_norm": 1.3153364955085218, + "learning_rate": 5.222567859523268e-06, + "loss": 0.3296, + "step": 17282 + }, + { + "epoch": 0.5, + "grad_norm": 1.271073697032564, + "learning_rate": 5.2220986164745405e-06, + "loss": 0.2983, + "step": 17283 + }, + { + "epoch": 0.5, + "grad_norm": 1.4101843971888968, + "learning_rate": 5.221629371465789e-06, + "loss": 0.3082, + "step": 17284 + }, + { + "epoch": 0.5, + "grad_norm": 1.2061703959023284, + "learning_rate": 5.221160124501155e-06, + "loss": 0.3137, + "step": 17285 + }, + { + "epoch": 0.5, + "grad_norm": 1.104439962453844, + "learning_rate": 5.220690875584783e-06, + "loss": 0.2937, + "step": 17286 + }, + { + "epoch": 0.5, + "grad_norm": 1.2208873531229478, + "learning_rate": 5.220221624720811e-06, + "loss": 0.3166, + "step": 17287 + }, + { + "epoch": 0.5, + "grad_norm": 1.4337715837350307, + "learning_rate": 5.219752371913378e-06, + "loss": 0.3251, + "step": 17288 + }, + { + "epoch": 0.5, + "grad_norm": 1.660199932566104, + "learning_rate": 5.2192831171666315e-06, + "loss": 0.3083, + "step": 17289 + }, + { + "epoch": 0.5, + "grad_norm": 1.158688147880719, + "learning_rate": 5.218813860484706e-06, + "loss": 0.2974, + "step": 17290 + }, + { + "epoch": 0.5, + "grad_norm": 1.1882196550101614, + "learning_rate": 5.218344601871748e-06, + "loss": 0.3097, + "step": 17291 + }, + { + "epoch": 0.5, + "grad_norm": 1.2922607077113437, + "learning_rate": 5.217875341331895e-06, + "loss": 0.3014, + "step": 17292 + }, + { + "epoch": 0.5, + "grad_norm": 1.2374600493154366, + "learning_rate": 5.21740607886929e-06, + "loss": 0.3143, + "step": 17293 + }, + { + "epoch": 0.5, + "grad_norm": 1.2554662046508878, + "learning_rate": 5.216936814488074e-06, + "loss": 0.303, + "step": 17294 + }, + { + "epoch": 0.5, + "grad_norm": 1.208036019684504, + "learning_rate": 5.216467548192389e-06, + "loss": 0.3021, + "step": 17295 + }, + { + "epoch": 0.5, + "grad_norm": 1.3513534838668508, + "learning_rate": 5.215998279986374e-06, + "loss": 0.2941, + "step": 17296 + }, + { + "epoch": 0.5, + "grad_norm": 1.3393943311103664, + "learning_rate": 5.215529009874173e-06, + "loss": 0.3269, + "step": 17297 + }, + { + "epoch": 0.5, + "grad_norm": 1.5434613931142278, + "learning_rate": 5.215059737859926e-06, + "loss": 0.3073, + "step": 17298 + }, + { + "epoch": 0.5, + "grad_norm": 1.2681026635042927, + "learning_rate": 5.214590463947773e-06, + "loss": 0.297, + "step": 17299 + }, + { + "epoch": 0.5, + "grad_norm": 0.9181452102321371, + "learning_rate": 5.214121188141859e-06, + "loss": 0.5961, + "step": 17300 + }, + { + "epoch": 0.5, + "grad_norm": 1.3871722933445572, + "learning_rate": 5.213651910446322e-06, + "loss": 0.3321, + "step": 17301 + }, + { + "epoch": 0.5, + "grad_norm": 1.6190328870139636, + "learning_rate": 5.213182630865305e-06, + "loss": 0.3125, + "step": 17302 + }, + { + "epoch": 0.5, + "grad_norm": 1.2593754023275727, + "learning_rate": 5.212713349402948e-06, + "loss": 0.329, + "step": 17303 + }, + { + "epoch": 0.5, + "grad_norm": 1.5410711323006683, + "learning_rate": 5.212244066063395e-06, + "loss": 0.3469, + "step": 17304 + }, + { + "epoch": 0.5, + "grad_norm": 1.7541077509289198, + "learning_rate": 5.211774780850784e-06, + "loss": 0.3267, + "step": 17305 + }, + { + "epoch": 0.5, + "grad_norm": 1.2769977143802227, + "learning_rate": 5.2113054937692574e-06, + "loss": 0.3189, + "step": 17306 + }, + { + "epoch": 0.5, + "grad_norm": 1.1663432488991254, + "learning_rate": 5.21083620482296e-06, + "loss": 0.3109, + "step": 17307 + }, + { + "epoch": 0.5, + "grad_norm": 1.1904016286688048, + "learning_rate": 5.210366914016029e-06, + "loss": 0.3239, + "step": 17308 + }, + { + "epoch": 0.5, + "grad_norm": 1.2363262468513205, + "learning_rate": 5.209897621352608e-06, + "loss": 0.3175, + "step": 17309 + }, + { + "epoch": 0.5, + "grad_norm": 1.4806846883345126, + "learning_rate": 5.209428326836838e-06, + "loss": 0.329, + "step": 17310 + }, + { + "epoch": 0.5, + "grad_norm": 1.1508773326808133, + "learning_rate": 5.208959030472862e-06, + "loss": 0.2797, + "step": 17311 + }, + { + "epoch": 0.5, + "grad_norm": 1.295758717768704, + "learning_rate": 5.2084897322648196e-06, + "loss": 0.305, + "step": 17312 + }, + { + "epoch": 0.5, + "grad_norm": 1.35237666572733, + "learning_rate": 5.208020432216853e-06, + "loss": 0.3533, + "step": 17313 + }, + { + "epoch": 0.5, + "grad_norm": 0.9943845340266817, + "learning_rate": 5.207551130333104e-06, + "loss": 0.6133, + "step": 17314 + }, + { + "epoch": 0.5, + "grad_norm": 1.2125215176992818, + "learning_rate": 5.207081826617713e-06, + "loss": 0.3056, + "step": 17315 + }, + { + "epoch": 0.5, + "grad_norm": 1.270039614889822, + "learning_rate": 5.206612521074823e-06, + "loss": 0.3103, + "step": 17316 + }, + { + "epoch": 0.5, + "grad_norm": 1.3026439909122618, + "learning_rate": 5.206143213708575e-06, + "loss": 0.309, + "step": 17317 + }, + { + "epoch": 0.5, + "grad_norm": 1.2200424223762596, + "learning_rate": 5.205673904523112e-06, + "loss": 0.3209, + "step": 17318 + }, + { + "epoch": 0.5, + "grad_norm": 1.568078071393226, + "learning_rate": 5.2052045935225725e-06, + "loss": 0.3314, + "step": 17319 + }, + { + "epoch": 0.5, + "grad_norm": 1.3594692055612008, + "learning_rate": 5.204735280711103e-06, + "loss": 0.2938, + "step": 17320 + }, + { + "epoch": 0.5, + "grad_norm": 1.2684018285779937, + "learning_rate": 5.20426596609284e-06, + "loss": 0.374, + "step": 17321 + }, + { + "epoch": 0.5, + "grad_norm": 1.1833922485810826, + "learning_rate": 5.2037966496719284e-06, + "loss": 0.3084, + "step": 17322 + }, + { + "epoch": 0.5, + "grad_norm": 1.6296259572581104, + "learning_rate": 5.203327331452508e-06, + "loss": 0.3011, + "step": 17323 + }, + { + "epoch": 0.5, + "grad_norm": 4.40687445443471, + "learning_rate": 5.2028580114387224e-06, + "loss": 0.3045, + "step": 17324 + }, + { + "epoch": 0.5, + "grad_norm": 1.897180814403466, + "learning_rate": 5.202388689634714e-06, + "loss": 0.3101, + "step": 17325 + }, + { + "epoch": 0.5, + "grad_norm": 1.2497807513325514, + "learning_rate": 5.201919366044621e-06, + "loss": 0.3197, + "step": 17326 + }, + { + "epoch": 0.5, + "grad_norm": 1.4008252721531809, + "learning_rate": 5.201450040672588e-06, + "loss": 0.3333, + "step": 17327 + }, + { + "epoch": 0.5, + "grad_norm": 1.2907037043153342, + "learning_rate": 5.200980713522757e-06, + "loss": 0.3199, + "step": 17328 + }, + { + "epoch": 0.5, + "grad_norm": 1.2315595773643202, + "learning_rate": 5.200511384599269e-06, + "loss": 0.3059, + "step": 17329 + }, + { + "epoch": 0.5, + "grad_norm": 1.149641817032805, + "learning_rate": 5.200042053906267e-06, + "loss": 0.3105, + "step": 17330 + }, + { + "epoch": 0.5, + "grad_norm": 1.421176769790754, + "learning_rate": 5.199572721447889e-06, + "loss": 0.3034, + "step": 17331 + }, + { + "epoch": 0.5, + "grad_norm": 1.1798065231059889, + "learning_rate": 5.19910338722828e-06, + "loss": 0.3118, + "step": 17332 + }, + { + "epoch": 0.5, + "grad_norm": 1.1874910881554428, + "learning_rate": 5.198634051251582e-06, + "loss": 0.3287, + "step": 17333 + }, + { + "epoch": 0.5, + "grad_norm": 1.2744099015543606, + "learning_rate": 5.198164713521937e-06, + "loss": 0.3052, + "step": 17334 + }, + { + "epoch": 0.5, + "grad_norm": 1.245339280990128, + "learning_rate": 5.197695374043485e-06, + "loss": 0.3119, + "step": 17335 + }, + { + "epoch": 0.5, + "grad_norm": 1.3586798986459605, + "learning_rate": 5.19722603282037e-06, + "loss": 0.3404, + "step": 17336 + }, + { + "epoch": 0.5, + "grad_norm": 1.5305088272798772, + "learning_rate": 5.196756689856731e-06, + "loss": 0.3077, + "step": 17337 + }, + { + "epoch": 0.5, + "grad_norm": 1.4986249240789533, + "learning_rate": 5.1962873451567145e-06, + "loss": 0.303, + "step": 17338 + }, + { + "epoch": 0.5, + "grad_norm": 1.144366037793171, + "learning_rate": 5.195817998724459e-06, + "loss": 0.3121, + "step": 17339 + }, + { + "epoch": 0.5, + "grad_norm": 1.271430605318691, + "learning_rate": 5.195348650564107e-06, + "loss": 0.3032, + "step": 17340 + }, + { + "epoch": 0.5, + "grad_norm": 1.249455577427133, + "learning_rate": 5.1948793006798014e-06, + "loss": 0.2979, + "step": 17341 + }, + { + "epoch": 0.5, + "grad_norm": 1.2337511461284962, + "learning_rate": 5.194409949075685e-06, + "loss": 0.2932, + "step": 17342 + }, + { + "epoch": 0.5, + "grad_norm": 1.243614593602113, + "learning_rate": 5.1939405957558985e-06, + "loss": 0.3164, + "step": 17343 + }, + { + "epoch": 0.5, + "grad_norm": 1.4488749561205991, + "learning_rate": 5.193471240724583e-06, + "loss": 0.3003, + "step": 17344 + }, + { + "epoch": 0.5, + "grad_norm": 1.7408304597271664, + "learning_rate": 5.193001883985882e-06, + "loss": 0.2966, + "step": 17345 + }, + { + "epoch": 0.5, + "grad_norm": 1.2639023785044523, + "learning_rate": 5.192532525543937e-06, + "loss": 0.3131, + "step": 17346 + }, + { + "epoch": 0.5, + "grad_norm": 1.3273971485928078, + "learning_rate": 5.1920631654028894e-06, + "loss": 0.3423, + "step": 17347 + }, + { + "epoch": 0.5, + "grad_norm": 1.365462185863385, + "learning_rate": 5.1915938035668845e-06, + "loss": 0.2986, + "step": 17348 + }, + { + "epoch": 0.5, + "grad_norm": 1.2813993596092779, + "learning_rate": 5.19112444004006e-06, + "loss": 0.3112, + "step": 17349 + }, + { + "epoch": 0.5, + "grad_norm": 3.887259847940012, + "learning_rate": 5.1906550748265614e-06, + "loss": 0.305, + "step": 17350 + }, + { + "epoch": 0.5, + "grad_norm": 1.45472864660297, + "learning_rate": 5.190185707930528e-06, + "loss": 0.317, + "step": 17351 + }, + { + "epoch": 0.5, + "grad_norm": 1.1365461780300377, + "learning_rate": 5.189716339356107e-06, + "loss": 0.3064, + "step": 17352 + }, + { + "epoch": 0.5, + "grad_norm": 1.3468321789956967, + "learning_rate": 5.189246969107434e-06, + "loss": 0.309, + "step": 17353 + }, + { + "epoch": 0.5, + "grad_norm": 1.396535743245245, + "learning_rate": 5.1887775971886555e-06, + "loss": 0.3064, + "step": 17354 + }, + { + "epoch": 0.5, + "grad_norm": 1.1852542907546664, + "learning_rate": 5.188308223603913e-06, + "loss": 0.3041, + "step": 17355 + }, + { + "epoch": 0.5, + "grad_norm": 1.16506833482267, + "learning_rate": 5.187838848357348e-06, + "loss": 0.2985, + "step": 17356 + }, + { + "epoch": 0.5, + "grad_norm": 1.2740732490082372, + "learning_rate": 5.187369471453105e-06, + "loss": 0.2832, + "step": 17357 + }, + { + "epoch": 0.5, + "grad_norm": 1.5255217345263359, + "learning_rate": 5.186900092895322e-06, + "loss": 0.3483, + "step": 17358 + }, + { + "epoch": 0.5, + "grad_norm": 1.201513780337883, + "learning_rate": 5.1864307126881444e-06, + "loss": 0.3161, + "step": 17359 + }, + { + "epoch": 0.5, + "grad_norm": 1.233686398838031, + "learning_rate": 5.1859613308357125e-06, + "loss": 0.3127, + "step": 17360 + }, + { + "epoch": 0.5, + "grad_norm": 1.3591699858138797, + "learning_rate": 5.185491947342172e-06, + "loss": 0.3477, + "step": 17361 + }, + { + "epoch": 0.5, + "grad_norm": 1.3718322822645694, + "learning_rate": 5.1850225622116615e-06, + "loss": 0.3231, + "step": 17362 + }, + { + "epoch": 0.5, + "grad_norm": 1.365417632070314, + "learning_rate": 5.184553175448324e-06, + "loss": 0.2989, + "step": 17363 + }, + { + "epoch": 0.5, + "grad_norm": 1.3247118217308114, + "learning_rate": 5.184083787056303e-06, + "loss": 0.2907, + "step": 17364 + }, + { + "epoch": 0.5, + "grad_norm": 2.134572280879774, + "learning_rate": 5.183614397039741e-06, + "loss": 0.3034, + "step": 17365 + }, + { + "epoch": 0.5, + "grad_norm": 1.2494333814401033, + "learning_rate": 5.183145005402781e-06, + "loss": 0.3179, + "step": 17366 + }, + { + "epoch": 0.5, + "grad_norm": 1.1870824649176088, + "learning_rate": 5.1826756121495636e-06, + "loss": 0.2948, + "step": 17367 + }, + { + "epoch": 0.5, + "grad_norm": 1.2209853629913312, + "learning_rate": 5.182206217284231e-06, + "loss": 0.2919, + "step": 17368 + }, + { + "epoch": 0.5, + "grad_norm": 1.4293381073483444, + "learning_rate": 5.181736820810927e-06, + "loss": 0.3299, + "step": 17369 + }, + { + "epoch": 0.5, + "grad_norm": 1.2308983540670988, + "learning_rate": 5.181267422733794e-06, + "loss": 0.3178, + "step": 17370 + }, + { + "epoch": 0.5, + "grad_norm": 1.2852957900260968, + "learning_rate": 5.180798023056975e-06, + "loss": 0.3134, + "step": 17371 + }, + { + "epoch": 0.5, + "grad_norm": 1.5181226413063924, + "learning_rate": 5.1803286217846106e-06, + "loss": 0.3428, + "step": 17372 + }, + { + "epoch": 0.5, + "grad_norm": 1.4751188534854993, + "learning_rate": 5.179859218920843e-06, + "loss": 0.2927, + "step": 17373 + }, + { + "epoch": 0.5, + "grad_norm": 1.4088065578942348, + "learning_rate": 5.179389814469817e-06, + "loss": 0.3308, + "step": 17374 + }, + { + "epoch": 0.5, + "grad_norm": 1.2669678445851018, + "learning_rate": 5.178920408435674e-06, + "loss": 0.3182, + "step": 17375 + }, + { + "epoch": 0.5, + "grad_norm": 1.2988241921360157, + "learning_rate": 5.178451000822555e-06, + "loss": 0.3107, + "step": 17376 + }, + { + "epoch": 0.5, + "grad_norm": 2.219384364640137, + "learning_rate": 5.177981591634605e-06, + "loss": 0.2901, + "step": 17377 + }, + { + "epoch": 0.5, + "grad_norm": 1.3195767064027233, + "learning_rate": 5.177512180875964e-06, + "loss": 0.2934, + "step": 17378 + }, + { + "epoch": 0.5, + "grad_norm": 1.4080298390886579, + "learning_rate": 5.177042768550779e-06, + "loss": 0.3208, + "step": 17379 + }, + { + "epoch": 0.5, + "grad_norm": 4.080121511189659, + "learning_rate": 5.176573354663187e-06, + "loss": 0.3351, + "step": 17380 + }, + { + "epoch": 0.5, + "grad_norm": 1.2360869265756678, + "learning_rate": 5.176103939217334e-06, + "loss": 0.3098, + "step": 17381 + }, + { + "epoch": 0.5, + "grad_norm": 1.1821625484882425, + "learning_rate": 5.175634522217361e-06, + "loss": 0.2921, + "step": 17382 + }, + { + "epoch": 0.5, + "grad_norm": 1.2706636736414882, + "learning_rate": 5.175165103667412e-06, + "loss": 0.2984, + "step": 17383 + }, + { + "epoch": 0.5, + "grad_norm": 1.2140742489201466, + "learning_rate": 5.17469568357163e-06, + "loss": 0.3268, + "step": 17384 + }, + { + "epoch": 0.5, + "grad_norm": 1.11376076893658, + "learning_rate": 5.174226261934157e-06, + "loss": 0.2855, + "step": 17385 + }, + { + "epoch": 0.5, + "grad_norm": 1.4046424156029882, + "learning_rate": 5.1737568387591345e-06, + "loss": 0.2892, + "step": 17386 + }, + { + "epoch": 0.5, + "grad_norm": 0.9455179214972619, + "learning_rate": 5.173287414050705e-06, + "loss": 0.624, + "step": 17387 + }, + { + "epoch": 0.5, + "grad_norm": 1.23648972096057, + "learning_rate": 5.172817987813013e-06, + "loss": 0.2849, + "step": 17388 + }, + { + "epoch": 0.5, + "grad_norm": 1.2871179797348826, + "learning_rate": 5.1723485600502e-06, + "loss": 0.3187, + "step": 17389 + }, + { + "epoch": 0.5, + "grad_norm": 1.4508958145985034, + "learning_rate": 5.17187913076641e-06, + "loss": 0.3273, + "step": 17390 + }, + { + "epoch": 0.5, + "grad_norm": 1.5641944452740333, + "learning_rate": 5.171409699965783e-06, + "loss": 0.2864, + "step": 17391 + }, + { + "epoch": 0.5, + "grad_norm": 1.36735653253766, + "learning_rate": 5.170940267652465e-06, + "loss": 0.3017, + "step": 17392 + }, + { + "epoch": 0.5, + "grad_norm": 1.2451835624489027, + "learning_rate": 5.1704708338305965e-06, + "loss": 0.3372, + "step": 17393 + }, + { + "epoch": 0.5, + "grad_norm": 1.208798926777564, + "learning_rate": 5.170001398504321e-06, + "loss": 0.3193, + "step": 17394 + }, + { + "epoch": 0.5, + "grad_norm": 1.2676389325055042, + "learning_rate": 5.169531961677782e-06, + "loss": 0.3104, + "step": 17395 + }, + { + "epoch": 0.5, + "grad_norm": 1.4493593717476227, + "learning_rate": 5.169062523355119e-06, + "loss": 0.3119, + "step": 17396 + }, + { + "epoch": 0.5, + "grad_norm": 1.5292538611802762, + "learning_rate": 5.168593083540481e-06, + "loss": 0.317, + "step": 17397 + }, + { + "epoch": 0.5, + "grad_norm": 1.267411336852366, + "learning_rate": 5.168123642238007e-06, + "loss": 0.3149, + "step": 17398 + }, + { + "epoch": 0.5, + "grad_norm": 1.4154025538236847, + "learning_rate": 5.167654199451838e-06, + "loss": 0.2909, + "step": 17399 + }, + { + "epoch": 0.5, + "grad_norm": 1.4716164484209675, + "learning_rate": 5.167184755186119e-06, + "loss": 0.2877, + "step": 17400 + }, + { + "epoch": 0.5, + "grad_norm": 1.0183806522772623, + "learning_rate": 5.1667153094449925e-06, + "loss": 0.5564, + "step": 17401 + }, + { + "epoch": 0.5, + "grad_norm": 1.2630376453591472, + "learning_rate": 5.1662458622326015e-06, + "loss": 0.2981, + "step": 17402 + }, + { + "epoch": 0.5, + "grad_norm": 1.2764703197845773, + "learning_rate": 5.165776413553091e-06, + "loss": 0.3107, + "step": 17403 + }, + { + "epoch": 0.5, + "grad_norm": 1.5030453411152418, + "learning_rate": 5.1653069634106e-06, + "loss": 0.3015, + "step": 17404 + }, + { + "epoch": 0.5, + "grad_norm": 1.3151720911546017, + "learning_rate": 5.164837511809273e-06, + "loss": 0.3069, + "step": 17405 + }, + { + "epoch": 0.5, + "grad_norm": 1.3292321099406854, + "learning_rate": 5.164368058753254e-06, + "loss": 0.3016, + "step": 17406 + }, + { + "epoch": 0.5, + "grad_norm": 1.3407549965518812, + "learning_rate": 5.163898604246685e-06, + "loss": 0.3332, + "step": 17407 + }, + { + "epoch": 0.5, + "grad_norm": 1.2856783202846203, + "learning_rate": 5.1634291482937085e-06, + "loss": 0.3101, + "step": 17408 + }, + { + "epoch": 0.5, + "grad_norm": 1.3345660635576277, + "learning_rate": 5.162959690898468e-06, + "loss": 0.2945, + "step": 17409 + }, + { + "epoch": 0.5, + "grad_norm": 1.2917103411576625, + "learning_rate": 5.162490232065106e-06, + "loss": 0.3319, + "step": 17410 + }, + { + "epoch": 0.51, + "grad_norm": 1.2530031331171434, + "learning_rate": 5.162020771797768e-06, + "loss": 0.3049, + "step": 17411 + }, + { + "epoch": 0.51, + "grad_norm": 1.3144126553022268, + "learning_rate": 5.161551310100594e-06, + "loss": 0.305, + "step": 17412 + }, + { + "epoch": 0.51, + "grad_norm": 1.2313344797996169, + "learning_rate": 5.161081846977727e-06, + "loss": 0.3046, + "step": 17413 + }, + { + "epoch": 0.51, + "grad_norm": 1.2268056857738616, + "learning_rate": 5.160612382433311e-06, + "loss": 0.3164, + "step": 17414 + }, + { + "epoch": 0.51, + "grad_norm": 1.3150390299945551, + "learning_rate": 5.160142916471489e-06, + "loss": 0.3209, + "step": 17415 + }, + { + "epoch": 0.51, + "grad_norm": 2.0148491968024183, + "learning_rate": 5.159673449096405e-06, + "loss": 0.3374, + "step": 17416 + }, + { + "epoch": 0.51, + "grad_norm": 1.5525242916640793, + "learning_rate": 5.1592039803121995e-06, + "loss": 0.2963, + "step": 17417 + }, + { + "epoch": 0.51, + "grad_norm": 1.34338328332968, + "learning_rate": 5.158734510123018e-06, + "loss": 0.3251, + "step": 17418 + }, + { + "epoch": 0.51, + "grad_norm": 1.4352613857837437, + "learning_rate": 5.158265038533001e-06, + "loss": 0.3116, + "step": 17419 + }, + { + "epoch": 0.51, + "grad_norm": 1.2990120995795058, + "learning_rate": 5.1577955655462955e-06, + "loss": 0.3104, + "step": 17420 + }, + { + "epoch": 0.51, + "grad_norm": 1.3471377839541991, + "learning_rate": 5.157326091167042e-06, + "loss": 0.3119, + "step": 17421 + }, + { + "epoch": 0.51, + "grad_norm": 1.3299670285267888, + "learning_rate": 5.156856615399384e-06, + "loss": 0.2979, + "step": 17422 + }, + { + "epoch": 0.51, + "grad_norm": 1.2887024400400977, + "learning_rate": 5.156387138247464e-06, + "loss": 0.2947, + "step": 17423 + }, + { + "epoch": 0.51, + "grad_norm": 1.2927031271755813, + "learning_rate": 5.155917659715426e-06, + "loss": 0.3263, + "step": 17424 + }, + { + "epoch": 0.51, + "grad_norm": 1.489010231545207, + "learning_rate": 5.155448179807413e-06, + "loss": 0.2816, + "step": 17425 + }, + { + "epoch": 0.51, + "grad_norm": 1.2421796533560443, + "learning_rate": 5.15497869852757e-06, + "loss": 0.3122, + "step": 17426 + }, + { + "epoch": 0.51, + "grad_norm": 1.1913253907978736, + "learning_rate": 5.154509215880035e-06, + "loss": 0.2865, + "step": 17427 + }, + { + "epoch": 0.51, + "grad_norm": 1.3219255956408504, + "learning_rate": 5.154039731868956e-06, + "loss": 0.3141, + "step": 17428 + }, + { + "epoch": 0.51, + "grad_norm": 1.5107797616733292, + "learning_rate": 5.153570246498474e-06, + "loss": 0.3437, + "step": 17429 + }, + { + "epoch": 0.51, + "grad_norm": 1.3491165330674453, + "learning_rate": 5.1531007597727335e-06, + "loss": 0.3268, + "step": 17430 + }, + { + "epoch": 0.51, + "grad_norm": 1.4424793123219124, + "learning_rate": 5.1526312716958774e-06, + "loss": 0.3245, + "step": 17431 + }, + { + "epoch": 0.51, + "grad_norm": 1.2958293219378458, + "learning_rate": 5.152161782272047e-06, + "loss": 0.3116, + "step": 17432 + }, + { + "epoch": 0.51, + "grad_norm": 1.256703850965711, + "learning_rate": 5.1516922915053875e-06, + "loss": 0.2952, + "step": 17433 + }, + { + "epoch": 0.51, + "grad_norm": 1.1772095533636788, + "learning_rate": 5.1512227994000445e-06, + "loss": 0.2939, + "step": 17434 + }, + { + "epoch": 0.51, + "grad_norm": 1.253571256527519, + "learning_rate": 5.150753305960157e-06, + "loss": 0.301, + "step": 17435 + }, + { + "epoch": 0.51, + "grad_norm": 1.4372740070898855, + "learning_rate": 5.150283811189869e-06, + "loss": 0.2937, + "step": 17436 + }, + { + "epoch": 0.51, + "grad_norm": 1.5357218623785496, + "learning_rate": 5.149814315093326e-06, + "loss": 0.2892, + "step": 17437 + }, + { + "epoch": 0.51, + "grad_norm": 1.301730176701491, + "learning_rate": 5.149344817674669e-06, + "loss": 0.3038, + "step": 17438 + }, + { + "epoch": 0.51, + "grad_norm": 1.2265089244310625, + "learning_rate": 5.148875318938043e-06, + "loss": 0.3033, + "step": 17439 + }, + { + "epoch": 0.51, + "grad_norm": 1.2541908563792996, + "learning_rate": 5.148405818887591e-06, + "loss": 0.3004, + "step": 17440 + }, + { + "epoch": 0.51, + "grad_norm": 1.2932850277860077, + "learning_rate": 5.147936317527456e-06, + "loss": 0.3193, + "step": 17441 + }, + { + "epoch": 0.51, + "grad_norm": 1.1832135992591064, + "learning_rate": 5.147466814861779e-06, + "loss": 0.2928, + "step": 17442 + }, + { + "epoch": 0.51, + "grad_norm": 1.2930153452763757, + "learning_rate": 5.146997310894708e-06, + "loss": 0.3241, + "step": 17443 + }, + { + "epoch": 0.51, + "grad_norm": 1.2283685959629167, + "learning_rate": 5.146527805630384e-06, + "loss": 0.2915, + "step": 17444 + }, + { + "epoch": 0.51, + "grad_norm": 1.2193223667716169, + "learning_rate": 5.14605829907295e-06, + "loss": 0.3094, + "step": 17445 + }, + { + "epoch": 0.51, + "grad_norm": 1.360797789656671, + "learning_rate": 5.145588791226549e-06, + "loss": 0.3123, + "step": 17446 + }, + { + "epoch": 0.51, + "grad_norm": 1.888733249846036, + "learning_rate": 5.145119282095326e-06, + "loss": 0.3049, + "step": 17447 + }, + { + "epoch": 0.51, + "grad_norm": 1.4923599793813163, + "learning_rate": 5.144649771683425e-06, + "loss": 0.3086, + "step": 17448 + }, + { + "epoch": 0.51, + "grad_norm": 1.4476341322248052, + "learning_rate": 5.144180259994988e-06, + "loss": 0.2965, + "step": 17449 + }, + { + "epoch": 0.51, + "grad_norm": 1.246573716109053, + "learning_rate": 5.143710747034156e-06, + "loss": 0.3229, + "step": 17450 + }, + { + "epoch": 0.51, + "grad_norm": 1.4873792836690802, + "learning_rate": 5.1432412328050775e-06, + "loss": 0.3209, + "step": 17451 + }, + { + "epoch": 0.51, + "grad_norm": 1.2449840970735229, + "learning_rate": 5.142771717311893e-06, + "loss": 0.3204, + "step": 17452 + }, + { + "epoch": 0.51, + "grad_norm": 1.6603237976272889, + "learning_rate": 5.142302200558746e-06, + "loss": 0.3245, + "step": 17453 + }, + { + "epoch": 0.51, + "grad_norm": 1.205402896833124, + "learning_rate": 5.141832682549782e-06, + "loss": 0.2996, + "step": 17454 + }, + { + "epoch": 0.51, + "grad_norm": 1.9675386330027969, + "learning_rate": 5.141363163289141e-06, + "loss": 0.2985, + "step": 17455 + }, + { + "epoch": 0.51, + "grad_norm": 1.2351850296445317, + "learning_rate": 5.14089364278097e-06, + "loss": 0.2999, + "step": 17456 + }, + { + "epoch": 0.51, + "grad_norm": 1.1819833298623308, + "learning_rate": 5.1404241210294095e-06, + "loss": 0.2919, + "step": 17457 + }, + { + "epoch": 0.51, + "grad_norm": 1.3440648362313978, + "learning_rate": 5.139954598038607e-06, + "loss": 0.3349, + "step": 17458 + }, + { + "epoch": 0.51, + "grad_norm": 4.660385278683695, + "learning_rate": 5.139485073812701e-06, + "loss": 0.3092, + "step": 17459 + }, + { + "epoch": 0.51, + "grad_norm": 1.379602745035103, + "learning_rate": 5.139015548355839e-06, + "loss": 0.3124, + "step": 17460 + }, + { + "epoch": 0.51, + "grad_norm": 0.9691470624389487, + "learning_rate": 5.138546021672163e-06, + "loss": 0.6187, + "step": 17461 + }, + { + "epoch": 0.51, + "grad_norm": 1.258195313045904, + "learning_rate": 5.138076493765817e-06, + "loss": 0.2945, + "step": 17462 + }, + { + "epoch": 0.51, + "grad_norm": 1.2880125826130167, + "learning_rate": 5.137606964640944e-06, + "loss": 0.2954, + "step": 17463 + }, + { + "epoch": 0.51, + "grad_norm": 1.1406557530151105, + "learning_rate": 5.137137434301688e-06, + "loss": 0.3076, + "step": 17464 + }, + { + "epoch": 0.51, + "grad_norm": 1.276862079797179, + "learning_rate": 5.1366679027521925e-06, + "loss": 0.2995, + "step": 17465 + }, + { + "epoch": 0.51, + "grad_norm": 1.37634829576279, + "learning_rate": 5.136198369996602e-06, + "loss": 0.3314, + "step": 17466 + }, + { + "epoch": 0.51, + "grad_norm": 1.2070473526555716, + "learning_rate": 5.135728836039059e-06, + "loss": 0.3208, + "step": 17467 + }, + { + "epoch": 0.51, + "grad_norm": 1.2830213192472215, + "learning_rate": 5.1352593008837086e-06, + "loss": 0.3057, + "step": 17468 + }, + { + "epoch": 0.51, + "grad_norm": 0.9289528793100799, + "learning_rate": 5.134789764534692e-06, + "loss": 0.6135, + "step": 17469 + }, + { + "epoch": 0.51, + "grad_norm": 1.6537508521221953, + "learning_rate": 5.134320226996154e-06, + "loss": 0.3218, + "step": 17470 + }, + { + "epoch": 0.51, + "grad_norm": 1.2027222508599589, + "learning_rate": 5.13385068827224e-06, + "loss": 0.2815, + "step": 17471 + }, + { + "epoch": 0.51, + "grad_norm": 0.9094742697024216, + "learning_rate": 5.133381148367092e-06, + "loss": 0.6224, + "step": 17472 + }, + { + "epoch": 0.51, + "grad_norm": 1.3020381468172988, + "learning_rate": 5.132911607284852e-06, + "loss": 0.3021, + "step": 17473 + }, + { + "epoch": 0.51, + "grad_norm": 1.4826941761337513, + "learning_rate": 5.132442065029665e-06, + "loss": 0.3279, + "step": 17474 + }, + { + "epoch": 0.51, + "grad_norm": 1.2508945596158656, + "learning_rate": 5.1319725216056784e-06, + "loss": 0.3002, + "step": 17475 + }, + { + "epoch": 0.51, + "grad_norm": 1.202057119829189, + "learning_rate": 5.131502977017032e-06, + "loss": 0.3028, + "step": 17476 + }, + { + "epoch": 0.51, + "grad_norm": 1.3597726398377366, + "learning_rate": 5.131033431267869e-06, + "loss": 0.3073, + "step": 17477 + }, + { + "epoch": 0.51, + "grad_norm": 1.6184542156772328, + "learning_rate": 5.130563884362335e-06, + "loss": 0.3137, + "step": 17478 + }, + { + "epoch": 0.51, + "grad_norm": 1.2648423449208281, + "learning_rate": 5.130094336304573e-06, + "loss": 0.3157, + "step": 17479 + }, + { + "epoch": 0.51, + "grad_norm": 1.2217357590025233, + "learning_rate": 5.1296247870987295e-06, + "loss": 0.3126, + "step": 17480 + }, + { + "epoch": 0.51, + "grad_norm": 1.3938508294316163, + "learning_rate": 5.129155236748943e-06, + "loss": 0.3125, + "step": 17481 + }, + { + "epoch": 0.51, + "grad_norm": 1.224011241452551, + "learning_rate": 5.128685685259362e-06, + "loss": 0.3296, + "step": 17482 + }, + { + "epoch": 0.51, + "grad_norm": 1.326407347544653, + "learning_rate": 5.128216132634127e-06, + "loss": 0.3235, + "step": 17483 + }, + { + "epoch": 0.51, + "grad_norm": 1.3060062935290082, + "learning_rate": 5.127746578877384e-06, + "loss": 0.3053, + "step": 17484 + }, + { + "epoch": 0.51, + "grad_norm": 1.1749330670289082, + "learning_rate": 5.127277023993276e-06, + "loss": 0.2915, + "step": 17485 + }, + { + "epoch": 0.51, + "grad_norm": 1.551722640777582, + "learning_rate": 5.126807467985946e-06, + "loss": 0.3118, + "step": 17486 + }, + { + "epoch": 0.51, + "grad_norm": 1.7168297874826586, + "learning_rate": 5.126337910859539e-06, + "loss": 0.2985, + "step": 17487 + }, + { + "epoch": 0.51, + "grad_norm": 1.1688087058919294, + "learning_rate": 5.125868352618198e-06, + "loss": 0.2889, + "step": 17488 + }, + { + "epoch": 0.51, + "grad_norm": 1.1805380885320642, + "learning_rate": 5.125398793266069e-06, + "loss": 0.308, + "step": 17489 + }, + { + "epoch": 0.51, + "grad_norm": 1.3648737748097033, + "learning_rate": 5.124929232807294e-06, + "loss": 0.297, + "step": 17490 + }, + { + "epoch": 0.51, + "grad_norm": 0.9197189045519248, + "learning_rate": 5.124459671246016e-06, + "loss": 0.5791, + "step": 17491 + }, + { + "epoch": 0.51, + "grad_norm": 1.2836584412365142, + "learning_rate": 5.123990108586379e-06, + "loss": 0.3045, + "step": 17492 + }, + { + "epoch": 0.51, + "grad_norm": 1.4289041756390564, + "learning_rate": 5.123520544832531e-06, + "loss": 0.3047, + "step": 17493 + }, + { + "epoch": 0.51, + "grad_norm": 1.247086931998917, + "learning_rate": 5.123050979988612e-06, + "loss": 0.2989, + "step": 17494 + }, + { + "epoch": 0.51, + "grad_norm": 1.2669361274208673, + "learning_rate": 5.122581414058765e-06, + "loss": 0.3127, + "step": 17495 + }, + { + "epoch": 0.51, + "grad_norm": 1.2038385118198003, + "learning_rate": 5.1221118470471365e-06, + "loss": 0.2923, + "step": 17496 + }, + { + "epoch": 0.51, + "grad_norm": 1.2273770868595708, + "learning_rate": 5.121642278957871e-06, + "loss": 0.3013, + "step": 17497 + }, + { + "epoch": 0.51, + "grad_norm": 1.3159770015299845, + "learning_rate": 5.121172709795109e-06, + "loss": 0.2879, + "step": 17498 + }, + { + "epoch": 0.51, + "grad_norm": 1.3129252755741596, + "learning_rate": 5.120703139562998e-06, + "loss": 0.3226, + "step": 17499 + }, + { + "epoch": 0.51, + "grad_norm": 1.2048524888180543, + "learning_rate": 5.12023356826568e-06, + "loss": 0.2973, + "step": 17500 + }, + { + "epoch": 0.51, + "grad_norm": 1.190271912769569, + "learning_rate": 5.119763995907299e-06, + "loss": 0.2944, + "step": 17501 + }, + { + "epoch": 0.51, + "grad_norm": 1.2275765222662618, + "learning_rate": 5.119294422492e-06, + "loss": 0.3187, + "step": 17502 + }, + { + "epoch": 0.51, + "grad_norm": 0.9342016951509914, + "learning_rate": 5.118824848023926e-06, + "loss": 0.5789, + "step": 17503 + }, + { + "epoch": 0.51, + "grad_norm": 1.4066745301159655, + "learning_rate": 5.118355272507221e-06, + "loss": 0.3189, + "step": 17504 + }, + { + "epoch": 0.51, + "grad_norm": 1.5476406949075385, + "learning_rate": 5.11788569594603e-06, + "loss": 0.3035, + "step": 17505 + }, + { + "epoch": 0.51, + "grad_norm": 1.3912394783148443, + "learning_rate": 5.117416118344496e-06, + "loss": 0.301, + "step": 17506 + }, + { + "epoch": 0.51, + "grad_norm": 1.2309358939189274, + "learning_rate": 5.116946539706764e-06, + "loss": 0.2828, + "step": 17507 + }, + { + "epoch": 0.51, + "grad_norm": 1.3962014767878297, + "learning_rate": 5.116476960036977e-06, + "loss": 0.2949, + "step": 17508 + }, + { + "epoch": 0.51, + "grad_norm": 1.2525105244548342, + "learning_rate": 5.1160073793392795e-06, + "loss": 0.2951, + "step": 17509 + }, + { + "epoch": 0.51, + "grad_norm": 1.3283884945871516, + "learning_rate": 5.115537797617816e-06, + "loss": 0.318, + "step": 17510 + }, + { + "epoch": 0.51, + "grad_norm": 1.443522936934456, + "learning_rate": 5.11506821487673e-06, + "loss": 0.3251, + "step": 17511 + }, + { + "epoch": 0.51, + "grad_norm": 1.3639808474741821, + "learning_rate": 5.114598631120167e-06, + "loss": 0.3163, + "step": 17512 + }, + { + "epoch": 0.51, + "grad_norm": 1.4629083843559412, + "learning_rate": 5.114129046352268e-06, + "loss": 0.2935, + "step": 17513 + }, + { + "epoch": 0.51, + "grad_norm": 1.2527564767484851, + "learning_rate": 5.11365946057718e-06, + "loss": 0.3128, + "step": 17514 + }, + { + "epoch": 0.51, + "grad_norm": 1.2588618324123384, + "learning_rate": 5.113189873799044e-06, + "loss": 0.306, + "step": 17515 + }, + { + "epoch": 0.51, + "grad_norm": 1.4132486883028983, + "learning_rate": 5.112720286022007e-06, + "loss": 0.3202, + "step": 17516 + }, + { + "epoch": 0.51, + "grad_norm": 1.4023308949577302, + "learning_rate": 5.112250697250214e-06, + "loss": 0.3148, + "step": 17517 + }, + { + "epoch": 0.51, + "grad_norm": 1.5945252143263964, + "learning_rate": 5.1117811074878055e-06, + "loss": 0.2965, + "step": 17518 + }, + { + "epoch": 0.51, + "grad_norm": 0.9998650371054705, + "learning_rate": 5.111311516738928e-06, + "loss": 0.6823, + "step": 17519 + }, + { + "epoch": 0.51, + "grad_norm": 1.9985392008190888, + "learning_rate": 5.110841925007725e-06, + "loss": 0.3213, + "step": 17520 + }, + { + "epoch": 0.51, + "grad_norm": 1.2390152323223185, + "learning_rate": 5.110372332298341e-06, + "loss": 0.3009, + "step": 17521 + }, + { + "epoch": 0.51, + "grad_norm": 1.3236988378261716, + "learning_rate": 5.10990273861492e-06, + "loss": 0.3133, + "step": 17522 + }, + { + "epoch": 0.51, + "grad_norm": 1.3030912006501494, + "learning_rate": 5.109433143961606e-06, + "loss": 0.3296, + "step": 17523 + }, + { + "epoch": 0.51, + "grad_norm": 1.2457418815036547, + "learning_rate": 5.108963548342543e-06, + "loss": 0.3227, + "step": 17524 + }, + { + "epoch": 0.51, + "grad_norm": 1.3534012872406789, + "learning_rate": 5.108493951761877e-06, + "loss": 0.3007, + "step": 17525 + }, + { + "epoch": 0.51, + "grad_norm": 1.201410307541842, + "learning_rate": 5.10802435422375e-06, + "loss": 0.3097, + "step": 17526 + }, + { + "epoch": 0.51, + "grad_norm": 1.1883512171268504, + "learning_rate": 5.107554755732306e-06, + "loss": 0.3089, + "step": 17527 + }, + { + "epoch": 0.51, + "grad_norm": 1.612302963271736, + "learning_rate": 5.10708515629169e-06, + "loss": 0.3073, + "step": 17528 + }, + { + "epoch": 0.51, + "grad_norm": 1.1688655186072077, + "learning_rate": 5.106615555906047e-06, + "loss": 0.2937, + "step": 17529 + }, + { + "epoch": 0.51, + "grad_norm": 1.3102435570831201, + "learning_rate": 5.106145954579521e-06, + "loss": 0.3396, + "step": 17530 + }, + { + "epoch": 0.51, + "grad_norm": 1.2389454202564023, + "learning_rate": 5.105676352316255e-06, + "loss": 0.2818, + "step": 17531 + }, + { + "epoch": 0.51, + "grad_norm": 1.378464638875856, + "learning_rate": 5.105206749120393e-06, + "loss": 0.3058, + "step": 17532 + }, + { + "epoch": 0.51, + "grad_norm": 1.2614110278517332, + "learning_rate": 5.104737144996082e-06, + "loss": 0.3099, + "step": 17533 + }, + { + "epoch": 0.51, + "grad_norm": 1.3352081426713667, + "learning_rate": 5.104267539947463e-06, + "loss": 0.3011, + "step": 17534 + }, + { + "epoch": 0.51, + "grad_norm": 1.431638612740622, + "learning_rate": 5.103797933978683e-06, + "loss": 0.3098, + "step": 17535 + }, + { + "epoch": 0.51, + "grad_norm": 1.3891155138695745, + "learning_rate": 5.103328327093885e-06, + "loss": 0.3027, + "step": 17536 + }, + { + "epoch": 0.51, + "grad_norm": 1.372862527665765, + "learning_rate": 5.102858719297213e-06, + "loss": 0.3309, + "step": 17537 + }, + { + "epoch": 0.51, + "grad_norm": 1.2748196902700384, + "learning_rate": 5.102389110592811e-06, + "loss": 0.3056, + "step": 17538 + }, + { + "epoch": 0.51, + "grad_norm": 1.7382565895991018, + "learning_rate": 5.101919500984825e-06, + "loss": 0.316, + "step": 17539 + }, + { + "epoch": 0.51, + "grad_norm": 1.1998651971432321, + "learning_rate": 5.101449890477399e-06, + "loss": 0.302, + "step": 17540 + }, + { + "epoch": 0.51, + "grad_norm": 2.11894670708673, + "learning_rate": 5.100980279074675e-06, + "loss": 0.2804, + "step": 17541 + }, + { + "epoch": 0.51, + "grad_norm": 1.2995379478145195, + "learning_rate": 5.1005106667808e-06, + "loss": 0.3375, + "step": 17542 + }, + { + "epoch": 0.51, + "grad_norm": 1.4185420397527768, + "learning_rate": 5.100041053599917e-06, + "loss": 0.3023, + "step": 17543 + }, + { + "epoch": 0.51, + "grad_norm": 1.4244880402861895, + "learning_rate": 5.099571439536171e-06, + "loss": 0.3304, + "step": 17544 + }, + { + "epoch": 0.51, + "grad_norm": 1.253443890621236, + "learning_rate": 5.099101824593706e-06, + "loss": 0.2938, + "step": 17545 + }, + { + "epoch": 0.51, + "grad_norm": 1.2756816694512816, + "learning_rate": 5.098632208776664e-06, + "loss": 0.3179, + "step": 17546 + }, + { + "epoch": 0.51, + "grad_norm": 1.4329892764470922, + "learning_rate": 5.098162592089193e-06, + "loss": 0.3052, + "step": 17547 + }, + { + "epoch": 0.51, + "grad_norm": 1.275686341738141, + "learning_rate": 5.097692974535438e-06, + "loss": 0.3132, + "step": 17548 + }, + { + "epoch": 0.51, + "grad_norm": 1.5569475889213908, + "learning_rate": 5.097223356119538e-06, + "loss": 0.3152, + "step": 17549 + }, + { + "epoch": 0.51, + "grad_norm": 1.246784318316103, + "learning_rate": 5.096753736845643e-06, + "loss": 0.3125, + "step": 17550 + }, + { + "epoch": 0.51, + "grad_norm": 1.2755328582481043, + "learning_rate": 5.0962841167178945e-06, + "loss": 0.3081, + "step": 17551 + }, + { + "epoch": 0.51, + "grad_norm": 1.2606653260663367, + "learning_rate": 5.095814495740437e-06, + "loss": 0.3146, + "step": 17552 + }, + { + "epoch": 0.51, + "grad_norm": 0.9794279282824742, + "learning_rate": 5.095344873917417e-06, + "loss": 0.6442, + "step": 17553 + }, + { + "epoch": 0.51, + "grad_norm": 1.348198612438172, + "learning_rate": 5.0948752512529785e-06, + "loss": 0.3097, + "step": 17554 + }, + { + "epoch": 0.51, + "grad_norm": 1.2504551113101035, + "learning_rate": 5.094405627751262e-06, + "loss": 0.3031, + "step": 17555 + }, + { + "epoch": 0.51, + "grad_norm": 1.3039672242967757, + "learning_rate": 5.0939360034164165e-06, + "loss": 0.3031, + "step": 17556 + }, + { + "epoch": 0.51, + "grad_norm": 1.2466655877478892, + "learning_rate": 5.093466378252584e-06, + "loss": 0.2878, + "step": 17557 + }, + { + "epoch": 0.51, + "grad_norm": 1.775393684476223, + "learning_rate": 5.09299675226391e-06, + "loss": 0.2986, + "step": 17558 + }, + { + "epoch": 0.51, + "grad_norm": 1.2364026366945242, + "learning_rate": 5.092527125454538e-06, + "loss": 0.3225, + "step": 17559 + }, + { + "epoch": 0.51, + "grad_norm": 1.2875446567550326, + "learning_rate": 5.0920574978286144e-06, + "loss": 0.3031, + "step": 17560 + }, + { + "epoch": 0.51, + "grad_norm": 1.2995703665231912, + "learning_rate": 5.091587869390281e-06, + "loss": 0.2914, + "step": 17561 + }, + { + "epoch": 0.51, + "grad_norm": 3.5868747210796808, + "learning_rate": 5.091118240143684e-06, + "loss": 0.3179, + "step": 17562 + }, + { + "epoch": 0.51, + "grad_norm": 1.283631882723004, + "learning_rate": 5.090648610092968e-06, + "loss": 0.2965, + "step": 17563 + }, + { + "epoch": 0.51, + "grad_norm": 0.9334609513637714, + "learning_rate": 5.090178979242276e-06, + "loss": 0.5867, + "step": 17564 + }, + { + "epoch": 0.51, + "grad_norm": 1.4306326264308338, + "learning_rate": 5.089709347595754e-06, + "loss": 0.3087, + "step": 17565 + }, + { + "epoch": 0.51, + "grad_norm": 1.3166928887531513, + "learning_rate": 5.089239715157547e-06, + "loss": 0.2892, + "step": 17566 + }, + { + "epoch": 0.51, + "grad_norm": 1.2460509386921856, + "learning_rate": 5.088770081931799e-06, + "loss": 0.2883, + "step": 17567 + }, + { + "epoch": 0.51, + "grad_norm": 1.4244679040869355, + "learning_rate": 5.0883004479226515e-06, + "loss": 0.3229, + "step": 17568 + }, + { + "epoch": 0.51, + "grad_norm": 1.3408058491925703, + "learning_rate": 5.0878308131342535e-06, + "loss": 0.3272, + "step": 17569 + }, + { + "epoch": 0.51, + "grad_norm": 1.3510569365245795, + "learning_rate": 5.087361177570747e-06, + "loss": 0.3125, + "step": 17570 + }, + { + "epoch": 0.51, + "grad_norm": 1.2515728303078535, + "learning_rate": 5.0868915412362765e-06, + "loss": 0.2897, + "step": 17571 + }, + { + "epoch": 0.51, + "grad_norm": 1.408900226566829, + "learning_rate": 5.086421904134988e-06, + "loss": 0.3157, + "step": 17572 + }, + { + "epoch": 0.51, + "grad_norm": 1.2742570975597531, + "learning_rate": 5.085952266271025e-06, + "loss": 0.3209, + "step": 17573 + }, + { + "epoch": 0.51, + "grad_norm": 1.2491022358973884, + "learning_rate": 5.085482627648531e-06, + "loss": 0.2868, + "step": 17574 + }, + { + "epoch": 0.51, + "grad_norm": 1.3771767566371178, + "learning_rate": 5.085012988271653e-06, + "loss": 0.2896, + "step": 17575 + }, + { + "epoch": 0.51, + "grad_norm": 1.2252329713299943, + "learning_rate": 5.084543348144536e-06, + "loss": 0.3256, + "step": 17576 + }, + { + "epoch": 0.51, + "grad_norm": 1.2608419376608793, + "learning_rate": 5.0840737072713205e-06, + "loss": 0.2844, + "step": 17577 + }, + { + "epoch": 0.51, + "grad_norm": 1.3164144407346499, + "learning_rate": 5.0836040656561555e-06, + "loss": 0.3207, + "step": 17578 + }, + { + "epoch": 0.51, + "grad_norm": 1.2411044259856139, + "learning_rate": 5.0831344233031824e-06, + "loss": 0.3305, + "step": 17579 + }, + { + "epoch": 0.51, + "grad_norm": 1.2859611150460093, + "learning_rate": 5.082664780216548e-06, + "loss": 0.3028, + "step": 17580 + }, + { + "epoch": 0.51, + "grad_norm": 1.1739006844703397, + "learning_rate": 5.082195136400396e-06, + "loss": 0.3061, + "step": 17581 + }, + { + "epoch": 0.51, + "grad_norm": 0.9322323589571325, + "learning_rate": 5.081725491858871e-06, + "loss": 0.5954, + "step": 17582 + }, + { + "epoch": 0.51, + "grad_norm": 1.5423282888822825, + "learning_rate": 5.081255846596117e-06, + "loss": 0.3527, + "step": 17583 + }, + { + "epoch": 0.51, + "grad_norm": 1.2020342750062705, + "learning_rate": 5.080786200616279e-06, + "loss": 0.3238, + "step": 17584 + }, + { + "epoch": 0.51, + "grad_norm": 1.5133387790470743, + "learning_rate": 5.080316553923504e-06, + "loss": 0.3339, + "step": 17585 + }, + { + "epoch": 0.51, + "grad_norm": 0.9288029015633518, + "learning_rate": 5.079846906521932e-06, + "loss": 0.6307, + "step": 17586 + }, + { + "epoch": 0.51, + "grad_norm": 1.4522012091642829, + "learning_rate": 5.0793772584157105e-06, + "loss": 0.3607, + "step": 17587 + }, + { + "epoch": 0.51, + "grad_norm": 1.3523863503867337, + "learning_rate": 5.078907609608984e-06, + "loss": 0.3105, + "step": 17588 + }, + { + "epoch": 0.51, + "grad_norm": 1.4061559123202156, + "learning_rate": 5.078437960105898e-06, + "loss": 0.3117, + "step": 17589 + }, + { + "epoch": 0.51, + "grad_norm": 1.3207305649019485, + "learning_rate": 5.077968309910596e-06, + "loss": 0.3149, + "step": 17590 + }, + { + "epoch": 0.51, + "grad_norm": 6.038789147019887, + "learning_rate": 5.0774986590272215e-06, + "loss": 0.298, + "step": 17591 + }, + { + "epoch": 0.51, + "grad_norm": 1.1517705278303692, + "learning_rate": 5.077029007459921e-06, + "loss": 0.2909, + "step": 17592 + }, + { + "epoch": 0.51, + "grad_norm": 2.164748339071936, + "learning_rate": 5.076559355212839e-06, + "loss": 0.3314, + "step": 17593 + }, + { + "epoch": 0.51, + "grad_norm": 1.3729186721183295, + "learning_rate": 5.07608970229012e-06, + "loss": 0.33, + "step": 17594 + }, + { + "epoch": 0.51, + "grad_norm": 1.2755896388566763, + "learning_rate": 5.07562004869591e-06, + "loss": 0.3045, + "step": 17595 + }, + { + "epoch": 0.51, + "grad_norm": 1.248589876518641, + "learning_rate": 5.0751503944343496e-06, + "loss": 0.3151, + "step": 17596 + }, + { + "epoch": 0.51, + "grad_norm": 1.276666518160317, + "learning_rate": 5.0746807395095865e-06, + "loss": 0.312, + "step": 17597 + }, + { + "epoch": 0.51, + "grad_norm": 1.121087224554264, + "learning_rate": 5.074211083925766e-06, + "loss": 0.2983, + "step": 17598 + }, + { + "epoch": 0.51, + "grad_norm": 1.2874160006065636, + "learning_rate": 5.073741427687031e-06, + "loss": 0.2998, + "step": 17599 + }, + { + "epoch": 0.51, + "grad_norm": 1.2929427048368574, + "learning_rate": 5.073271770797527e-06, + "loss": 0.3006, + "step": 17600 + }, + { + "epoch": 0.51, + "grad_norm": 1.2712430587676515, + "learning_rate": 5.072802113261401e-06, + "loss": 0.2929, + "step": 17601 + }, + { + "epoch": 0.51, + "grad_norm": 1.3311777569828986, + "learning_rate": 5.072332455082793e-06, + "loss": 0.3161, + "step": 17602 + }, + { + "epoch": 0.51, + "grad_norm": 1.4581628980476964, + "learning_rate": 5.071862796265852e-06, + "loss": 0.3183, + "step": 17603 + }, + { + "epoch": 0.51, + "grad_norm": 1.3653491584703374, + "learning_rate": 5.071393136814719e-06, + "loss": 0.3113, + "step": 17604 + }, + { + "epoch": 0.51, + "grad_norm": 1.3243195049800673, + "learning_rate": 5.070923476733542e-06, + "loss": 0.3095, + "step": 17605 + }, + { + "epoch": 0.51, + "grad_norm": 1.483541546955266, + "learning_rate": 5.070453816026464e-06, + "loss": 0.2946, + "step": 17606 + }, + { + "epoch": 0.51, + "grad_norm": 1.4528649803756062, + "learning_rate": 5.06998415469763e-06, + "loss": 0.3121, + "step": 17607 + }, + { + "epoch": 0.51, + "grad_norm": 1.243896426946209, + "learning_rate": 5.069514492751188e-06, + "loss": 0.2966, + "step": 17608 + }, + { + "epoch": 0.51, + "grad_norm": 1.2303603443595459, + "learning_rate": 5.069044830191277e-06, + "loss": 0.3044, + "step": 17609 + }, + { + "epoch": 0.51, + "grad_norm": 1.5961089007466678, + "learning_rate": 5.068575167022046e-06, + "loss": 0.3307, + "step": 17610 + }, + { + "epoch": 0.51, + "grad_norm": 1.2721570861383003, + "learning_rate": 5.068105503247637e-06, + "loss": 0.2931, + "step": 17611 + }, + { + "epoch": 0.51, + "grad_norm": 1.2525297566100406, + "learning_rate": 5.067635838872197e-06, + "loss": 0.3147, + "step": 17612 + }, + { + "epoch": 0.51, + "grad_norm": 1.5811904969338295, + "learning_rate": 5.06716617389987e-06, + "loss": 0.3091, + "step": 17613 + }, + { + "epoch": 0.51, + "grad_norm": 1.2695021603251337, + "learning_rate": 5.0666965083348005e-06, + "loss": 0.3075, + "step": 17614 + }, + { + "epoch": 0.51, + "grad_norm": 1.2581752426765982, + "learning_rate": 5.066226842181133e-06, + "loss": 0.3135, + "step": 17615 + }, + { + "epoch": 0.51, + "grad_norm": 1.2455513875241344, + "learning_rate": 5.0657571754430135e-06, + "loss": 0.3218, + "step": 17616 + }, + { + "epoch": 0.51, + "grad_norm": 1.1836768446463417, + "learning_rate": 5.065287508124586e-06, + "loss": 0.3138, + "step": 17617 + }, + { + "epoch": 0.51, + "grad_norm": 1.6049043979297848, + "learning_rate": 5.064817840229996e-06, + "loss": 0.3335, + "step": 17618 + }, + { + "epoch": 0.51, + "grad_norm": 2.279328300065431, + "learning_rate": 5.064348171763388e-06, + "loss": 0.33, + "step": 17619 + }, + { + "epoch": 0.51, + "grad_norm": 1.202156438234863, + "learning_rate": 5.063878502728906e-06, + "loss": 0.3085, + "step": 17620 + }, + { + "epoch": 0.51, + "grad_norm": 1.4112374287486413, + "learning_rate": 5.063408833130696e-06, + "loss": 0.3143, + "step": 17621 + }, + { + "epoch": 0.51, + "grad_norm": 1.1839563937707633, + "learning_rate": 5.062939162972902e-06, + "loss": 0.2809, + "step": 17622 + }, + { + "epoch": 0.51, + "grad_norm": 1.39662285662392, + "learning_rate": 5.062469492259671e-06, + "loss": 0.3095, + "step": 17623 + }, + { + "epoch": 0.51, + "grad_norm": 1.654938700632991, + "learning_rate": 5.061999820995144e-06, + "loss": 0.295, + "step": 17624 + }, + { + "epoch": 0.51, + "grad_norm": 1.2184961801580851, + "learning_rate": 5.061530149183468e-06, + "loss": 0.3169, + "step": 17625 + }, + { + "epoch": 0.51, + "grad_norm": 1.9752564746777437, + "learning_rate": 5.061060476828788e-06, + "loss": 0.3191, + "step": 17626 + }, + { + "epoch": 0.51, + "grad_norm": 0.9641109073618888, + "learning_rate": 5.060590803935248e-06, + "loss": 0.5721, + "step": 17627 + }, + { + "epoch": 0.51, + "grad_norm": 1.2945200339276441, + "learning_rate": 5.060121130506994e-06, + "loss": 0.2928, + "step": 17628 + }, + { + "epoch": 0.51, + "grad_norm": 1.6438032156627205, + "learning_rate": 5.05965145654817e-06, + "loss": 0.2959, + "step": 17629 + }, + { + "epoch": 0.51, + "grad_norm": 1.3987774665409987, + "learning_rate": 5.059181782062922e-06, + "loss": 0.2789, + "step": 17630 + }, + { + "epoch": 0.51, + "grad_norm": 1.3519307836407244, + "learning_rate": 5.058712107055395e-06, + "loss": 0.3433, + "step": 17631 + }, + { + "epoch": 0.51, + "grad_norm": 1.3945239758536305, + "learning_rate": 5.058242431529731e-06, + "loss": 0.3096, + "step": 17632 + }, + { + "epoch": 0.51, + "grad_norm": 1.2774494867699042, + "learning_rate": 5.057772755490077e-06, + "loss": 0.3389, + "step": 17633 + }, + { + "epoch": 0.51, + "grad_norm": 1.3369083949361011, + "learning_rate": 5.057303078940579e-06, + "loss": 0.3239, + "step": 17634 + }, + { + "epoch": 0.51, + "grad_norm": 1.3963806524366313, + "learning_rate": 5.056833401885381e-06, + "loss": 0.3154, + "step": 17635 + }, + { + "epoch": 0.51, + "grad_norm": 1.4059798919922495, + "learning_rate": 5.056363724328626e-06, + "loss": 0.307, + "step": 17636 + }, + { + "epoch": 0.51, + "grad_norm": 1.249369867535773, + "learning_rate": 5.055894046274462e-06, + "loss": 0.314, + "step": 17637 + }, + { + "epoch": 0.51, + "grad_norm": 1.2718729075278434, + "learning_rate": 5.055424367727031e-06, + "loss": 0.3272, + "step": 17638 + }, + { + "epoch": 0.51, + "grad_norm": 1.4192241740678313, + "learning_rate": 5.05495468869048e-06, + "loss": 0.3138, + "step": 17639 + }, + { + "epoch": 0.51, + "grad_norm": 1.6277383948096265, + "learning_rate": 5.054485009168955e-06, + "loss": 0.2951, + "step": 17640 + }, + { + "epoch": 0.51, + "grad_norm": 1.1774836708886394, + "learning_rate": 5.054015329166596e-06, + "loss": 0.3022, + "step": 17641 + }, + { + "epoch": 0.51, + "grad_norm": 1.3622264469242003, + "learning_rate": 5.053545648687553e-06, + "loss": 0.2952, + "step": 17642 + }, + { + "epoch": 0.51, + "grad_norm": 1.1946650625452795, + "learning_rate": 5.053075967735969e-06, + "loss": 0.3081, + "step": 17643 + }, + { + "epoch": 0.51, + "grad_norm": 1.2388501693223488, + "learning_rate": 5.052606286315987e-06, + "loss": 0.3204, + "step": 17644 + }, + { + "epoch": 0.51, + "grad_norm": 1.3415563612899786, + "learning_rate": 5.052136604431757e-06, + "loss": 0.314, + "step": 17645 + }, + { + "epoch": 0.51, + "grad_norm": 2.4634408452201813, + "learning_rate": 5.051666922087419e-06, + "loss": 0.3065, + "step": 17646 + }, + { + "epoch": 0.51, + "grad_norm": 1.3777544091550016, + "learning_rate": 5.05119723928712e-06, + "loss": 0.2939, + "step": 17647 + }, + { + "epoch": 0.51, + "grad_norm": 1.3080221300422195, + "learning_rate": 5.0507275560350055e-06, + "loss": 0.2895, + "step": 17648 + }, + { + "epoch": 0.51, + "grad_norm": 1.2088902176305294, + "learning_rate": 5.05025787233522e-06, + "loss": 0.3327, + "step": 17649 + }, + { + "epoch": 0.51, + "grad_norm": 1.2437047095118556, + "learning_rate": 5.0497881881919074e-06, + "loss": 0.3045, + "step": 17650 + }, + { + "epoch": 0.51, + "grad_norm": 1.3789431818076756, + "learning_rate": 5.049318503609214e-06, + "loss": 0.3412, + "step": 17651 + }, + { + "epoch": 0.51, + "grad_norm": 2.2930374467858563, + "learning_rate": 5.048848818591285e-06, + "loss": 0.3205, + "step": 17652 + }, + { + "epoch": 0.51, + "grad_norm": 1.40444046792068, + "learning_rate": 5.048379133142262e-06, + "loss": 0.3186, + "step": 17653 + }, + { + "epoch": 0.51, + "grad_norm": 1.2717381448650618, + "learning_rate": 5.047909447266295e-06, + "loss": 0.2859, + "step": 17654 + }, + { + "epoch": 0.51, + "grad_norm": 1.5780711671038314, + "learning_rate": 5.0474397609675255e-06, + "loss": 0.2962, + "step": 17655 + }, + { + "epoch": 0.51, + "grad_norm": 1.2835350668782939, + "learning_rate": 5.046970074250099e-06, + "loss": 0.2994, + "step": 17656 + }, + { + "epoch": 0.51, + "grad_norm": 1.4208722210897138, + "learning_rate": 5.0465003871181625e-06, + "loss": 0.3151, + "step": 17657 + }, + { + "epoch": 0.51, + "grad_norm": 1.410476350597971, + "learning_rate": 5.04603069957586e-06, + "loss": 0.3283, + "step": 17658 + }, + { + "epoch": 0.51, + "grad_norm": 1.2355543082433735, + "learning_rate": 5.045561011627334e-06, + "loss": 0.3137, + "step": 17659 + }, + { + "epoch": 0.51, + "grad_norm": 1.2767458209385405, + "learning_rate": 5.045091323276733e-06, + "loss": 0.2985, + "step": 17660 + }, + { + "epoch": 0.51, + "grad_norm": 1.3496196390466781, + "learning_rate": 5.0446216345282e-06, + "loss": 0.3115, + "step": 17661 + }, + { + "epoch": 0.51, + "grad_norm": 1.411957011442903, + "learning_rate": 5.04415194538588e-06, + "loss": 0.3332, + "step": 17662 + }, + { + "epoch": 0.51, + "grad_norm": 1.3666926897372753, + "learning_rate": 5.04368225585392e-06, + "loss": 0.322, + "step": 17663 + }, + { + "epoch": 0.51, + "grad_norm": 1.9555905517654673, + "learning_rate": 5.043212565936463e-06, + "loss": 0.2955, + "step": 17664 + }, + { + "epoch": 0.51, + "grad_norm": 0.9519617156580034, + "learning_rate": 5.042742875637655e-06, + "loss": 0.6002, + "step": 17665 + }, + { + "epoch": 0.51, + "grad_norm": 1.4679005629904103, + "learning_rate": 5.04227318496164e-06, + "loss": 0.3209, + "step": 17666 + }, + { + "epoch": 0.51, + "grad_norm": 1.8034730499845468, + "learning_rate": 5.041803493912564e-06, + "loss": 0.3246, + "step": 17667 + }, + { + "epoch": 0.51, + "grad_norm": 1.4630572735663807, + "learning_rate": 5.041333802494572e-06, + "loss": 0.3049, + "step": 17668 + }, + { + "epoch": 0.51, + "grad_norm": 1.4668246849192461, + "learning_rate": 5.040864110711808e-06, + "loss": 0.3085, + "step": 17669 + }, + { + "epoch": 0.51, + "grad_norm": 1.4612496467432023, + "learning_rate": 5.040394418568418e-06, + "loss": 0.3045, + "step": 17670 + }, + { + "epoch": 0.51, + "grad_norm": 1.2374512450153212, + "learning_rate": 5.039924726068547e-06, + "loss": 0.3081, + "step": 17671 + }, + { + "epoch": 0.51, + "grad_norm": 1.4624043815071528, + "learning_rate": 5.03945503321634e-06, + "loss": 0.2915, + "step": 17672 + }, + { + "epoch": 0.51, + "grad_norm": 1.39121928665549, + "learning_rate": 5.038985340015943e-06, + "loss": 0.2978, + "step": 17673 + }, + { + "epoch": 0.51, + "grad_norm": 1.3475420123384074, + "learning_rate": 5.038515646471498e-06, + "loss": 0.3733, + "step": 17674 + }, + { + "epoch": 0.51, + "grad_norm": 1.1805424531166042, + "learning_rate": 5.038045952587152e-06, + "loss": 0.3008, + "step": 17675 + }, + { + "epoch": 0.51, + "grad_norm": 1.4174097216713355, + "learning_rate": 5.0375762583670515e-06, + "loss": 0.3194, + "step": 17676 + }, + { + "epoch": 0.51, + "grad_norm": 1.287890125690916, + "learning_rate": 5.037106563815339e-06, + "loss": 0.3181, + "step": 17677 + }, + { + "epoch": 0.51, + "grad_norm": 1.3848908644011142, + "learning_rate": 5.036636868936161e-06, + "loss": 0.298, + "step": 17678 + }, + { + "epoch": 0.51, + "grad_norm": 1.305048792564267, + "learning_rate": 5.036167173733662e-06, + "loss": 0.3072, + "step": 17679 + }, + { + "epoch": 0.51, + "grad_norm": 1.3866532032176562, + "learning_rate": 5.035697478211988e-06, + "loss": 0.32, + "step": 17680 + }, + { + "epoch": 0.51, + "grad_norm": 1.2131671994932132, + "learning_rate": 5.035227782375284e-06, + "loss": 0.2983, + "step": 17681 + }, + { + "epoch": 0.51, + "grad_norm": 1.291117355097336, + "learning_rate": 5.034758086227692e-06, + "loss": 0.3109, + "step": 17682 + }, + { + "epoch": 0.51, + "grad_norm": 1.23507991901226, + "learning_rate": 5.034288389773361e-06, + "loss": 0.3106, + "step": 17683 + }, + { + "epoch": 0.51, + "grad_norm": 1.4537279735341404, + "learning_rate": 5.033818693016433e-06, + "loss": 0.333, + "step": 17684 + }, + { + "epoch": 0.51, + "grad_norm": 1.1851241966757184, + "learning_rate": 5.033348995961057e-06, + "loss": 0.3099, + "step": 17685 + }, + { + "epoch": 0.51, + "grad_norm": 1.277043602565441, + "learning_rate": 5.032879298611375e-06, + "loss": 0.3315, + "step": 17686 + }, + { + "epoch": 0.51, + "grad_norm": 1.148208664483527, + "learning_rate": 5.032409600971533e-06, + "loss": 0.309, + "step": 17687 + }, + { + "epoch": 0.51, + "grad_norm": 1.2588589685869584, + "learning_rate": 5.031939903045675e-06, + "loss": 0.3192, + "step": 17688 + }, + { + "epoch": 0.51, + "grad_norm": 1.348513443544384, + "learning_rate": 5.031470204837947e-06, + "loss": 0.3122, + "step": 17689 + }, + { + "epoch": 0.51, + "grad_norm": 1.214535665065464, + "learning_rate": 5.031000506352495e-06, + "loss": 0.3206, + "step": 17690 + }, + { + "epoch": 0.51, + "grad_norm": 1.2827695138540802, + "learning_rate": 5.030530807593463e-06, + "loss": 0.3052, + "step": 17691 + }, + { + "epoch": 0.51, + "grad_norm": 1.3508742980360162, + "learning_rate": 5.030061108564995e-06, + "loss": 0.3181, + "step": 17692 + }, + { + "epoch": 0.51, + "grad_norm": 1.227370517230203, + "learning_rate": 5.029591409271239e-06, + "loss": 0.2858, + "step": 17693 + }, + { + "epoch": 0.51, + "grad_norm": 1.4855661160567248, + "learning_rate": 5.029121709716338e-06, + "loss": 0.3288, + "step": 17694 + }, + { + "epoch": 0.51, + "grad_norm": 1.2446455258298628, + "learning_rate": 5.028652009904439e-06, + "loss": 0.2907, + "step": 17695 + }, + { + "epoch": 0.51, + "grad_norm": 1.217622734259837, + "learning_rate": 5.028182309839683e-06, + "loss": 0.3232, + "step": 17696 + }, + { + "epoch": 0.51, + "grad_norm": 1.1926344954266817, + "learning_rate": 5.02771260952622e-06, + "loss": 0.3079, + "step": 17697 + }, + { + "epoch": 0.51, + "grad_norm": 1.479822715511866, + "learning_rate": 5.027242908968191e-06, + "loss": 0.2821, + "step": 17698 + }, + { + "epoch": 0.51, + "grad_norm": 1.4076954093712168, + "learning_rate": 5.0267732081697455e-06, + "loss": 0.312, + "step": 17699 + }, + { + "epoch": 0.51, + "grad_norm": 1.3982190796621472, + "learning_rate": 5.026303507135023e-06, + "loss": 0.3135, + "step": 17700 + }, + { + "epoch": 0.51, + "grad_norm": 1.2073525454143708, + "learning_rate": 5.025833805868173e-06, + "loss": 0.305, + "step": 17701 + }, + { + "epoch": 0.51, + "grad_norm": 1.4048528889563507, + "learning_rate": 5.02536410437334e-06, + "loss": 0.3001, + "step": 17702 + }, + { + "epoch": 0.51, + "grad_norm": 1.260403243447449, + "learning_rate": 5.024894402654667e-06, + "loss": 0.3179, + "step": 17703 + }, + { + "epoch": 0.51, + "grad_norm": 3.8267594285408832, + "learning_rate": 5.0244247007163035e-06, + "loss": 0.2882, + "step": 17704 + }, + { + "epoch": 0.51, + "grad_norm": 1.336868229894482, + "learning_rate": 5.023954998562389e-06, + "loss": 0.2904, + "step": 17705 + }, + { + "epoch": 0.51, + "grad_norm": 1.2758193093265084, + "learning_rate": 5.023485296197073e-06, + "loss": 0.3153, + "step": 17706 + }, + { + "epoch": 0.51, + "grad_norm": 1.2471548684406513, + "learning_rate": 5.0230155936245e-06, + "loss": 0.3123, + "step": 17707 + }, + { + "epoch": 0.51, + "grad_norm": 1.2964814483837588, + "learning_rate": 5.022545890848811e-06, + "loss": 0.3178, + "step": 17708 + }, + { + "epoch": 0.51, + "grad_norm": 1.2692780774386072, + "learning_rate": 5.022076187874157e-06, + "loss": 0.3165, + "step": 17709 + }, + { + "epoch": 0.51, + "grad_norm": 1.3312657272556414, + "learning_rate": 5.021606484704679e-06, + "loss": 0.3268, + "step": 17710 + }, + { + "epoch": 0.51, + "grad_norm": 1.3932200074963106, + "learning_rate": 5.0211367813445234e-06, + "loss": 0.2768, + "step": 17711 + }, + { + "epoch": 0.51, + "grad_norm": 1.3630113043918672, + "learning_rate": 5.020667077797836e-06, + "loss": 0.3088, + "step": 17712 + }, + { + "epoch": 0.51, + "grad_norm": 1.3702795817449738, + "learning_rate": 5.020197374068761e-06, + "loss": 0.3244, + "step": 17713 + }, + { + "epoch": 0.51, + "grad_norm": 1.4623310406592123, + "learning_rate": 5.019727670161445e-06, + "loss": 0.3183, + "step": 17714 + }, + { + "epoch": 0.51, + "grad_norm": 1.193285685645512, + "learning_rate": 5.019257966080031e-06, + "loss": 0.3124, + "step": 17715 + }, + { + "epoch": 0.51, + "grad_norm": 1.2528577556786016, + "learning_rate": 5.018788261828664e-06, + "loss": 0.3284, + "step": 17716 + }, + { + "epoch": 0.51, + "grad_norm": 1.1614122514921759, + "learning_rate": 5.018318557411493e-06, + "loss": 0.3017, + "step": 17717 + }, + { + "epoch": 0.51, + "grad_norm": 1.4561865827483635, + "learning_rate": 5.01784885283266e-06, + "loss": 0.327, + "step": 17718 + }, + { + "epoch": 0.51, + "grad_norm": 1.3159984034542653, + "learning_rate": 5.01737914809631e-06, + "loss": 0.3247, + "step": 17719 + }, + { + "epoch": 0.51, + "grad_norm": 2.0147233783408156, + "learning_rate": 5.016909443206588e-06, + "loss": 0.3031, + "step": 17720 + }, + { + "epoch": 0.51, + "grad_norm": 1.2926682950071484, + "learning_rate": 5.016439738167642e-06, + "loss": 0.2961, + "step": 17721 + }, + { + "epoch": 0.51, + "grad_norm": 1.230372450640137, + "learning_rate": 5.015970032983613e-06, + "loss": 0.3023, + "step": 17722 + }, + { + "epoch": 0.51, + "grad_norm": 1.4241271377550848, + "learning_rate": 5.015500327658651e-06, + "loss": 0.3092, + "step": 17723 + }, + { + "epoch": 0.51, + "grad_norm": 1.2215581923827163, + "learning_rate": 5.015030622196896e-06, + "loss": 0.3216, + "step": 17724 + }, + { + "epoch": 0.51, + "grad_norm": 1.8081022773180597, + "learning_rate": 5.0145609166024966e-06, + "loss": 0.2776, + "step": 17725 + }, + { + "epoch": 0.51, + "grad_norm": 1.2206858845724198, + "learning_rate": 5.0140912108795955e-06, + "loss": 0.3042, + "step": 17726 + }, + { + "epoch": 0.51, + "grad_norm": 1.323767606666139, + "learning_rate": 5.013621505032342e-06, + "loss": 0.3251, + "step": 17727 + }, + { + "epoch": 0.51, + "grad_norm": 1.1880806778618296, + "learning_rate": 5.013151799064877e-06, + "loss": 0.3249, + "step": 17728 + }, + { + "epoch": 0.51, + "grad_norm": 1.6207099913277423, + "learning_rate": 5.012682092981347e-06, + "loss": 0.3139, + "step": 17729 + }, + { + "epoch": 0.51, + "grad_norm": 1.5300393864717088, + "learning_rate": 5.012212386785897e-06, + "loss": 0.319, + "step": 17730 + }, + { + "epoch": 0.51, + "grad_norm": 2.188948312641431, + "learning_rate": 5.011742680482674e-06, + "loss": 0.3097, + "step": 17731 + }, + { + "epoch": 0.51, + "grad_norm": 1.3279729347775548, + "learning_rate": 5.011272974075821e-06, + "loss": 0.3197, + "step": 17732 + }, + { + "epoch": 0.51, + "grad_norm": 1.2932356325226055, + "learning_rate": 5.010803267569483e-06, + "loss": 0.304, + "step": 17733 + }, + { + "epoch": 0.51, + "grad_norm": 1.2976669106799512, + "learning_rate": 5.010333560967807e-06, + "loss": 0.3145, + "step": 17734 + }, + { + "epoch": 0.51, + "grad_norm": 3.7238971684135436, + "learning_rate": 5.009863854274938e-06, + "loss": 0.2898, + "step": 17735 + }, + { + "epoch": 0.51, + "grad_norm": 1.2473745066310642, + "learning_rate": 5.00939414749502e-06, + "loss": 0.2999, + "step": 17736 + }, + { + "epoch": 0.51, + "grad_norm": 1.3642791111142647, + "learning_rate": 5.008924440632198e-06, + "loss": 0.3104, + "step": 17737 + }, + { + "epoch": 0.51, + "grad_norm": 1.5076864943241992, + "learning_rate": 5.008454733690618e-06, + "loss": 0.3146, + "step": 17738 + }, + { + "epoch": 0.51, + "grad_norm": 1.3309639361819148, + "learning_rate": 5.0079850266744236e-06, + "loss": 0.3162, + "step": 17739 + }, + { + "epoch": 0.51, + "grad_norm": 1.3836595343459437, + "learning_rate": 5.007515319587762e-06, + "loss": 0.3239, + "step": 17740 + }, + { + "epoch": 0.51, + "grad_norm": 1.459297577630349, + "learning_rate": 5.007045612434779e-06, + "loss": 0.3268, + "step": 17741 + }, + { + "epoch": 0.51, + "grad_norm": 1.3053742319361075, + "learning_rate": 5.006575905219618e-06, + "loss": 0.3221, + "step": 17742 + }, + { + "epoch": 0.51, + "grad_norm": 1.4575534999586628, + "learning_rate": 5.006106197946422e-06, + "loss": 0.2987, + "step": 17743 + }, + { + "epoch": 0.51, + "grad_norm": 1.3835575879008648, + "learning_rate": 5.00563649061934e-06, + "loss": 0.3035, + "step": 17744 + }, + { + "epoch": 0.51, + "grad_norm": 1.494127075077747, + "learning_rate": 5.005166783242518e-06, + "loss": 0.2906, + "step": 17745 + }, + { + "epoch": 0.51, + "grad_norm": 1.3624975890551676, + "learning_rate": 5.0046970758200975e-06, + "loss": 0.3191, + "step": 17746 + }, + { + "epoch": 0.51, + "grad_norm": 1.4285674564332842, + "learning_rate": 5.004227368356224e-06, + "loss": 0.3165, + "step": 17747 + }, + { + "epoch": 0.51, + "grad_norm": 1.244117328940389, + "learning_rate": 5.0037576608550455e-06, + "loss": 0.2988, + "step": 17748 + }, + { + "epoch": 0.51, + "grad_norm": 1.1629564786392406, + "learning_rate": 5.003287953320706e-06, + "loss": 0.3069, + "step": 17749 + }, + { + "epoch": 0.51, + "grad_norm": 1.271790851654529, + "learning_rate": 5.0028182457573506e-06, + "loss": 0.3286, + "step": 17750 + }, + { + "epoch": 0.51, + "grad_norm": 1.3450817676154718, + "learning_rate": 5.002348538169123e-06, + "loss": 0.3102, + "step": 17751 + }, + { + "epoch": 0.51, + "grad_norm": 1.2313461004574981, + "learning_rate": 5.001878830560169e-06, + "loss": 0.2976, + "step": 17752 + }, + { + "epoch": 0.51, + "grad_norm": 1.297558595863298, + "learning_rate": 5.001409122934635e-06, + "loss": 0.3052, + "step": 17753 + }, + { + "epoch": 0.51, + "grad_norm": 1.6094389231959978, + "learning_rate": 5.000939415296666e-06, + "loss": 0.3002, + "step": 17754 + }, + { + "epoch": 0.51, + "grad_norm": 0.9333232979432884, + "learning_rate": 5.000469707650406e-06, + "loss": 0.5811, + "step": 17755 + }, + { + "epoch": 0.52, + "grad_norm": 1.4520482273333413, + "learning_rate": 5e-06, + "loss": 0.2992, + "step": 17756 + }, + { + "epoch": 0.52, + "grad_norm": 1.2010324607300331, + "learning_rate": 4.999530292349595e-06, + "loss": 0.3219, + "step": 17757 + }, + { + "epoch": 0.52, + "grad_norm": 1.2140466233243952, + "learning_rate": 4.999060584703335e-06, + "loss": 0.311, + "step": 17758 + }, + { + "epoch": 0.52, + "grad_norm": 1.4288087068784483, + "learning_rate": 4.998590877065366e-06, + "loss": 0.3003, + "step": 17759 + }, + { + "epoch": 0.52, + "grad_norm": 1.2557340334310017, + "learning_rate": 4.9981211694398325e-06, + "loss": 0.2996, + "step": 17760 + }, + { + "epoch": 0.52, + "grad_norm": 1.2551981619474253, + "learning_rate": 4.997651461830879e-06, + "loss": 0.2877, + "step": 17761 + }, + { + "epoch": 0.52, + "grad_norm": 1.3429953253399394, + "learning_rate": 4.997181754242651e-06, + "loss": 0.3185, + "step": 17762 + }, + { + "epoch": 0.52, + "grad_norm": 1.3280308486225687, + "learning_rate": 4.996712046679295e-06, + "loss": 0.2908, + "step": 17763 + }, + { + "epoch": 0.52, + "grad_norm": 1.4441837890342273, + "learning_rate": 4.9962423391449545e-06, + "loss": 0.2971, + "step": 17764 + }, + { + "epoch": 0.52, + "grad_norm": 1.2909943681106006, + "learning_rate": 4.995772631643776e-06, + "loss": 0.288, + "step": 17765 + }, + { + "epoch": 0.52, + "grad_norm": 1.310561108454275, + "learning_rate": 4.995302924179903e-06, + "loss": 0.3175, + "step": 17766 + }, + { + "epoch": 0.52, + "grad_norm": 1.3299601869410005, + "learning_rate": 4.994833216757484e-06, + "loss": 0.2989, + "step": 17767 + }, + { + "epoch": 0.52, + "grad_norm": 1.3539576869777838, + "learning_rate": 4.99436350938066e-06, + "loss": 0.3021, + "step": 17768 + }, + { + "epoch": 0.52, + "grad_norm": 1.3871560340325793, + "learning_rate": 4.993893802053578e-06, + "loss": 0.3194, + "step": 17769 + }, + { + "epoch": 0.52, + "grad_norm": 1.376176847142585, + "learning_rate": 4.993424094780383e-06, + "loss": 0.2996, + "step": 17770 + }, + { + "epoch": 0.52, + "grad_norm": 1.232439388359028, + "learning_rate": 4.9929543875652235e-06, + "loss": 0.3258, + "step": 17771 + }, + { + "epoch": 0.52, + "grad_norm": 1.476755920445079, + "learning_rate": 4.9924846804122395e-06, + "loss": 0.3259, + "step": 17772 + }, + { + "epoch": 0.52, + "grad_norm": 1.0104452178645709, + "learning_rate": 4.992014973325579e-06, + "loss": 0.6241, + "step": 17773 + }, + { + "epoch": 0.52, + "grad_norm": 1.256303128543997, + "learning_rate": 4.991545266309385e-06, + "loss": 0.2857, + "step": 17774 + }, + { + "epoch": 0.52, + "grad_norm": 1.2487909158426285, + "learning_rate": 4.991075559367805e-06, + "loss": 0.3001, + "step": 17775 + }, + { + "epoch": 0.52, + "grad_norm": 1.2404293647934357, + "learning_rate": 4.990605852504982e-06, + "loss": 0.319, + "step": 17776 + }, + { + "epoch": 0.52, + "grad_norm": 1.320762315914438, + "learning_rate": 4.990136145725064e-06, + "loss": 0.3009, + "step": 17777 + }, + { + "epoch": 0.52, + "grad_norm": 1.2472153923281004, + "learning_rate": 4.989666439032194e-06, + "loss": 0.2977, + "step": 17778 + }, + { + "epoch": 0.52, + "grad_norm": 1.1936192933809833, + "learning_rate": 4.989196732430518e-06, + "loss": 0.2913, + "step": 17779 + }, + { + "epoch": 0.52, + "grad_norm": 2.1090377472438515, + "learning_rate": 4.98872702592418e-06, + "loss": 0.3277, + "step": 17780 + }, + { + "epoch": 0.52, + "grad_norm": 1.5071057155419532, + "learning_rate": 4.988257319517327e-06, + "loss": 0.3181, + "step": 17781 + }, + { + "epoch": 0.52, + "grad_norm": 1.2280432121829534, + "learning_rate": 4.987787613214104e-06, + "loss": 0.2983, + "step": 17782 + }, + { + "epoch": 0.52, + "grad_norm": 1.339426140656602, + "learning_rate": 4.9873179070186546e-06, + "loss": 0.3161, + "step": 17783 + }, + { + "epoch": 0.52, + "grad_norm": 1.967078359020218, + "learning_rate": 4.986848200935125e-06, + "loss": 0.2971, + "step": 17784 + }, + { + "epoch": 0.52, + "grad_norm": 1.3271124501312404, + "learning_rate": 4.986378494967659e-06, + "loss": 0.3173, + "step": 17785 + }, + { + "epoch": 0.52, + "grad_norm": 1.8965552900753735, + "learning_rate": 4.985908789120405e-06, + "loss": 0.3038, + "step": 17786 + }, + { + "epoch": 0.52, + "grad_norm": 1.3794624640267723, + "learning_rate": 4.985439083397505e-06, + "loss": 0.3139, + "step": 17787 + }, + { + "epoch": 0.52, + "grad_norm": 1.5192618026226572, + "learning_rate": 4.984969377803105e-06, + "loss": 0.3232, + "step": 17788 + }, + { + "epoch": 0.52, + "grad_norm": 1.3944326116343486, + "learning_rate": 4.984499672341351e-06, + "loss": 0.3193, + "step": 17789 + }, + { + "epoch": 0.52, + "grad_norm": 1.3148975288114364, + "learning_rate": 4.9840299670163876e-06, + "loss": 0.3107, + "step": 17790 + }, + { + "epoch": 0.52, + "grad_norm": 1.2456079920062966, + "learning_rate": 4.9835602618323596e-06, + "loss": 0.3132, + "step": 17791 + }, + { + "epoch": 0.52, + "grad_norm": 1.1908771122366053, + "learning_rate": 4.983090556793412e-06, + "loss": 0.2911, + "step": 17792 + }, + { + "epoch": 0.52, + "grad_norm": 1.201991058628766, + "learning_rate": 4.982620851903691e-06, + "loss": 0.2979, + "step": 17793 + }, + { + "epoch": 0.52, + "grad_norm": 1.2382969458905648, + "learning_rate": 4.982151147167341e-06, + "loss": 0.3017, + "step": 17794 + }, + { + "epoch": 0.52, + "grad_norm": 1.2520823540706028, + "learning_rate": 4.981681442588508e-06, + "loss": 0.3209, + "step": 17795 + }, + { + "epoch": 0.52, + "grad_norm": 1.343640851849536, + "learning_rate": 4.981211738171335e-06, + "loss": 0.3106, + "step": 17796 + }, + { + "epoch": 0.52, + "grad_norm": 1.191523762003055, + "learning_rate": 4.98074203391997e-06, + "loss": 0.283, + "step": 17797 + }, + { + "epoch": 0.52, + "grad_norm": 1.177690014275132, + "learning_rate": 4.980272329838556e-06, + "loss": 0.2945, + "step": 17798 + }, + { + "epoch": 0.52, + "grad_norm": 1.8004045960590591, + "learning_rate": 4.9798026259312406e-06, + "loss": 0.2735, + "step": 17799 + }, + { + "epoch": 0.52, + "grad_norm": 1.472883808560754, + "learning_rate": 4.979332922202166e-06, + "loss": 0.3159, + "step": 17800 + }, + { + "epoch": 0.52, + "grad_norm": 1.3233290722274291, + "learning_rate": 4.978863218655479e-06, + "loss": 0.3169, + "step": 17801 + }, + { + "epoch": 0.52, + "grad_norm": 1.265504456659074, + "learning_rate": 4.978393515295323e-06, + "loss": 0.3022, + "step": 17802 + }, + { + "epoch": 0.52, + "grad_norm": 1.2004939165869457, + "learning_rate": 4.977923812125845e-06, + "loss": 0.3193, + "step": 17803 + }, + { + "epoch": 0.52, + "grad_norm": 1.302422609514607, + "learning_rate": 4.97745410915119e-06, + "loss": 0.3049, + "step": 17804 + }, + { + "epoch": 0.52, + "grad_norm": 1.4354645724322574, + "learning_rate": 4.976984406375504e-06, + "loss": 0.3085, + "step": 17805 + }, + { + "epoch": 0.52, + "grad_norm": 1.5296999343941422, + "learning_rate": 4.976514703802929e-06, + "loss": 0.3002, + "step": 17806 + }, + { + "epoch": 0.52, + "grad_norm": 1.3373553724419536, + "learning_rate": 4.9760450014376115e-06, + "loss": 0.356, + "step": 17807 + }, + { + "epoch": 0.52, + "grad_norm": 1.2671174346068605, + "learning_rate": 4.975575299283698e-06, + "loss": 0.2842, + "step": 17808 + }, + { + "epoch": 0.52, + "grad_norm": 1.4321475319792538, + "learning_rate": 4.9751055973453335e-06, + "loss": 0.3397, + "step": 17809 + }, + { + "epoch": 0.52, + "grad_norm": 1.187756676482363, + "learning_rate": 4.974635895626661e-06, + "loss": 0.3079, + "step": 17810 + }, + { + "epoch": 0.52, + "grad_norm": 1.4154661402329098, + "learning_rate": 4.974166194131828e-06, + "loss": 0.3188, + "step": 17811 + }, + { + "epoch": 0.52, + "grad_norm": 1.5921672619554599, + "learning_rate": 4.973696492864978e-06, + "loss": 0.3651, + "step": 17812 + }, + { + "epoch": 0.52, + "grad_norm": 1.2902130531260392, + "learning_rate": 4.973226791830257e-06, + "loss": 0.2925, + "step": 17813 + }, + { + "epoch": 0.52, + "grad_norm": 1.2936613307218632, + "learning_rate": 4.972757091031811e-06, + "loss": 0.3095, + "step": 17814 + }, + { + "epoch": 0.52, + "grad_norm": 1.340284793297318, + "learning_rate": 4.972287390473782e-06, + "loss": 0.2982, + "step": 17815 + }, + { + "epoch": 0.52, + "grad_norm": 1.736466141655436, + "learning_rate": 4.971817690160318e-06, + "loss": 0.2794, + "step": 17816 + }, + { + "epoch": 0.52, + "grad_norm": 1.3427184623126172, + "learning_rate": 4.971347990095563e-06, + "loss": 0.3065, + "step": 17817 + }, + { + "epoch": 0.52, + "grad_norm": 1.8330063755563162, + "learning_rate": 4.970878290283664e-06, + "loss": 0.2968, + "step": 17818 + }, + { + "epoch": 0.52, + "grad_norm": 1.4885718329281357, + "learning_rate": 4.970408590728762e-06, + "loss": 0.3336, + "step": 17819 + }, + { + "epoch": 0.52, + "grad_norm": 1.5269989596867068, + "learning_rate": 4.969938891435005e-06, + "loss": 0.3186, + "step": 17820 + }, + { + "epoch": 0.52, + "grad_norm": 1.616735291118866, + "learning_rate": 4.969469192406537e-06, + "loss": 0.3227, + "step": 17821 + }, + { + "epoch": 0.52, + "grad_norm": 1.3254994525239108, + "learning_rate": 4.968999493647506e-06, + "loss": 0.3482, + "step": 17822 + }, + { + "epoch": 0.52, + "grad_norm": 1.4837120708525324, + "learning_rate": 4.968529795162054e-06, + "loss": 0.2908, + "step": 17823 + }, + { + "epoch": 0.52, + "grad_norm": 1.2699954232483837, + "learning_rate": 4.968060096954326e-06, + "loss": 0.3386, + "step": 17824 + }, + { + "epoch": 0.52, + "grad_norm": 1.2114870674982214, + "learning_rate": 4.967590399028468e-06, + "loss": 0.3177, + "step": 17825 + }, + { + "epoch": 0.52, + "grad_norm": 1.1972403920200012, + "learning_rate": 4.967120701388626e-06, + "loss": 0.3075, + "step": 17826 + }, + { + "epoch": 0.52, + "grad_norm": 1.5106473091909054, + "learning_rate": 4.966651004038945e-06, + "loss": 0.3191, + "step": 17827 + }, + { + "epoch": 0.52, + "grad_norm": 1.5002939843303187, + "learning_rate": 4.9661813069835685e-06, + "loss": 0.3001, + "step": 17828 + }, + { + "epoch": 0.52, + "grad_norm": 1.4131571554005424, + "learning_rate": 4.965711610226642e-06, + "loss": 0.2918, + "step": 17829 + }, + { + "epoch": 0.52, + "grad_norm": 1.3111745929428709, + "learning_rate": 4.9652419137723105e-06, + "loss": 0.2982, + "step": 17830 + }, + { + "epoch": 0.52, + "grad_norm": 1.3410034272743385, + "learning_rate": 4.96477221762472e-06, + "loss": 0.295, + "step": 17831 + }, + { + "epoch": 0.52, + "grad_norm": 1.4274818196043018, + "learning_rate": 4.9643025217880144e-06, + "loss": 0.2913, + "step": 17832 + }, + { + "epoch": 0.52, + "grad_norm": 1.3818352370029479, + "learning_rate": 4.96383282626634e-06, + "loss": 0.3112, + "step": 17833 + }, + { + "epoch": 0.52, + "grad_norm": 1.1946582082138633, + "learning_rate": 4.963363131063841e-06, + "loss": 0.2996, + "step": 17834 + }, + { + "epoch": 0.52, + "grad_norm": 1.481802906152943, + "learning_rate": 4.962893436184662e-06, + "loss": 0.2987, + "step": 17835 + }, + { + "epoch": 0.52, + "grad_norm": 1.2512016643832868, + "learning_rate": 4.96242374163295e-06, + "loss": 0.3023, + "step": 17836 + }, + { + "epoch": 0.52, + "grad_norm": 1.2848535705365174, + "learning_rate": 4.961954047412849e-06, + "loss": 0.3072, + "step": 17837 + }, + { + "epoch": 0.52, + "grad_norm": 1.201810295651129, + "learning_rate": 4.961484353528504e-06, + "loss": 0.3126, + "step": 17838 + }, + { + "epoch": 0.52, + "grad_norm": 1.2003206097538661, + "learning_rate": 4.9610146599840595e-06, + "loss": 0.3021, + "step": 17839 + }, + { + "epoch": 0.52, + "grad_norm": 1.3022560219797255, + "learning_rate": 4.9605449667836605e-06, + "loss": 0.2974, + "step": 17840 + }, + { + "epoch": 0.52, + "grad_norm": 1.8574483081829083, + "learning_rate": 4.960075273931454e-06, + "loss": 0.3085, + "step": 17841 + }, + { + "epoch": 0.52, + "grad_norm": 1.862483295469099, + "learning_rate": 4.959605581431583e-06, + "loss": 0.2849, + "step": 17842 + }, + { + "epoch": 0.52, + "grad_norm": 1.3001830572574875, + "learning_rate": 4.9591358892881925e-06, + "loss": 0.2957, + "step": 17843 + }, + { + "epoch": 0.52, + "grad_norm": 1.3161982155190006, + "learning_rate": 4.958666197505429e-06, + "loss": 0.3212, + "step": 17844 + }, + { + "epoch": 0.52, + "grad_norm": 1.3169083447110508, + "learning_rate": 4.958196506087437e-06, + "loss": 0.3012, + "step": 17845 + }, + { + "epoch": 0.52, + "grad_norm": 1.3827189546749845, + "learning_rate": 4.957726815038361e-06, + "loss": 0.2945, + "step": 17846 + }, + { + "epoch": 0.52, + "grad_norm": 1.559017621912269, + "learning_rate": 4.957257124362346e-06, + "loss": 0.2864, + "step": 17847 + }, + { + "epoch": 0.52, + "grad_norm": 1.2381511997742076, + "learning_rate": 4.956787434063537e-06, + "loss": 0.3152, + "step": 17848 + }, + { + "epoch": 0.52, + "grad_norm": 1.3321068697088625, + "learning_rate": 4.95631774414608e-06, + "loss": 0.3052, + "step": 17849 + }, + { + "epoch": 0.52, + "grad_norm": 1.3384532863267677, + "learning_rate": 4.95584805461412e-06, + "loss": 0.3322, + "step": 17850 + }, + { + "epoch": 0.52, + "grad_norm": 1.352516454300272, + "learning_rate": 4.955378365471801e-06, + "loss": 0.3067, + "step": 17851 + }, + { + "epoch": 0.52, + "grad_norm": 1.328045187115512, + "learning_rate": 4.954908676723267e-06, + "loss": 0.3069, + "step": 17852 + }, + { + "epoch": 0.52, + "grad_norm": 1.61493218871673, + "learning_rate": 4.954438988372666e-06, + "loss": 0.3427, + "step": 17853 + }, + { + "epoch": 0.52, + "grad_norm": 1.5903007908003084, + "learning_rate": 4.953969300424142e-06, + "loss": 0.3171, + "step": 17854 + }, + { + "epoch": 0.52, + "grad_norm": 1.399914755161308, + "learning_rate": 4.953499612881839e-06, + "loss": 0.2997, + "step": 17855 + }, + { + "epoch": 0.52, + "grad_norm": 1.3330764556740418, + "learning_rate": 4.953029925749903e-06, + "loss": 0.299, + "step": 17856 + }, + { + "epoch": 0.52, + "grad_norm": 1.2273821038483632, + "learning_rate": 4.952560239032477e-06, + "loss": 0.3163, + "step": 17857 + }, + { + "epoch": 0.52, + "grad_norm": 1.8540065775808854, + "learning_rate": 4.952090552733708e-06, + "loss": 0.2889, + "step": 17858 + }, + { + "epoch": 0.52, + "grad_norm": 1.437092853786137, + "learning_rate": 4.951620866857739e-06, + "loss": 0.2936, + "step": 17859 + }, + { + "epoch": 0.52, + "grad_norm": 1.2892652925901475, + "learning_rate": 4.951151181408719e-06, + "loss": 0.2954, + "step": 17860 + }, + { + "epoch": 0.52, + "grad_norm": 1.2006149925730185, + "learning_rate": 4.9506814963907885e-06, + "loss": 0.3068, + "step": 17861 + }, + { + "epoch": 0.52, + "grad_norm": 1.5027653940576158, + "learning_rate": 4.950211811808094e-06, + "loss": 0.2924, + "step": 17862 + }, + { + "epoch": 0.52, + "grad_norm": 1.4792809509814826, + "learning_rate": 4.949742127664781e-06, + "loss": 0.3143, + "step": 17863 + }, + { + "epoch": 0.52, + "grad_norm": 1.4056356880009497, + "learning_rate": 4.949272443964996e-06, + "loss": 0.3067, + "step": 17864 + }, + { + "epoch": 0.52, + "grad_norm": 1.2611992111330679, + "learning_rate": 4.948802760712881e-06, + "loss": 0.2985, + "step": 17865 + }, + { + "epoch": 0.52, + "grad_norm": 1.3414594480965558, + "learning_rate": 4.9483330779125824e-06, + "loss": 0.3181, + "step": 17866 + }, + { + "epoch": 0.52, + "grad_norm": 1.2325169376098017, + "learning_rate": 4.947863395568244e-06, + "loss": 0.3149, + "step": 17867 + }, + { + "epoch": 0.52, + "grad_norm": 1.4705032546437737, + "learning_rate": 4.947393713684014e-06, + "loss": 0.3232, + "step": 17868 + }, + { + "epoch": 0.52, + "grad_norm": 1.224500072168746, + "learning_rate": 4.946924032264034e-06, + "loss": 0.2987, + "step": 17869 + }, + { + "epoch": 0.52, + "grad_norm": 1.45364720104134, + "learning_rate": 4.9464543513124485e-06, + "loss": 0.3303, + "step": 17870 + }, + { + "epoch": 0.52, + "grad_norm": 1.4695638673561924, + "learning_rate": 4.9459846708334044e-06, + "loss": 0.3161, + "step": 17871 + }, + { + "epoch": 0.52, + "grad_norm": 1.2489104590046178, + "learning_rate": 4.945514990831048e-06, + "loss": 0.3, + "step": 17872 + }, + { + "epoch": 0.52, + "grad_norm": 1.384805659943131, + "learning_rate": 4.945045311309522e-06, + "loss": 0.314, + "step": 17873 + }, + { + "epoch": 0.52, + "grad_norm": 1.3401425255059793, + "learning_rate": 4.94457563227297e-06, + "loss": 0.3114, + "step": 17874 + }, + { + "epoch": 0.52, + "grad_norm": 1.9808886682861362, + "learning_rate": 4.944105953725539e-06, + "loss": 0.3074, + "step": 17875 + }, + { + "epoch": 0.52, + "grad_norm": 2.0867704182221836, + "learning_rate": 4.943636275671374e-06, + "loss": 0.302, + "step": 17876 + }, + { + "epoch": 0.52, + "grad_norm": 1.3144744461925373, + "learning_rate": 4.943166598114621e-06, + "loss": 0.291, + "step": 17877 + }, + { + "epoch": 0.52, + "grad_norm": 1.2655912120693396, + "learning_rate": 4.942696921059421e-06, + "loss": 0.3003, + "step": 17878 + }, + { + "epoch": 0.52, + "grad_norm": 1.4487118141479203, + "learning_rate": 4.942227244509923e-06, + "loss": 0.2875, + "step": 17879 + }, + { + "epoch": 0.52, + "grad_norm": 1.4136470170402522, + "learning_rate": 4.941757568470269e-06, + "loss": 0.333, + "step": 17880 + }, + { + "epoch": 0.52, + "grad_norm": 1.2660140029249736, + "learning_rate": 4.941287892944607e-06, + "loss": 0.3143, + "step": 17881 + }, + { + "epoch": 0.52, + "grad_norm": 1.2978152672399688, + "learning_rate": 4.940818217937078e-06, + "loss": 0.2973, + "step": 17882 + }, + { + "epoch": 0.52, + "grad_norm": 1.7698712959471328, + "learning_rate": 4.940348543451832e-06, + "loss": 0.3088, + "step": 17883 + }, + { + "epoch": 0.52, + "grad_norm": 1.4915063768503745, + "learning_rate": 4.939878869493008e-06, + "loss": 0.3038, + "step": 17884 + }, + { + "epoch": 0.52, + "grad_norm": 1.30862790437352, + "learning_rate": 4.9394091960647535e-06, + "loss": 0.3076, + "step": 17885 + }, + { + "epoch": 0.52, + "grad_norm": 1.258412813651649, + "learning_rate": 4.938939523171215e-06, + "loss": 0.3047, + "step": 17886 + }, + { + "epoch": 0.52, + "grad_norm": 1.4427207772172173, + "learning_rate": 4.9384698508165354e-06, + "loss": 0.2951, + "step": 17887 + }, + { + "epoch": 0.52, + "grad_norm": 2.32721129751795, + "learning_rate": 4.938000179004859e-06, + "loss": 0.3137, + "step": 17888 + }, + { + "epoch": 0.52, + "grad_norm": 0.9536464083176805, + "learning_rate": 4.937530507740333e-06, + "loss": 0.5774, + "step": 17889 + }, + { + "epoch": 0.52, + "grad_norm": 1.3625321564001198, + "learning_rate": 4.937060837027099e-06, + "loss": 0.2903, + "step": 17890 + }, + { + "epoch": 0.52, + "grad_norm": 1.3060388953019961, + "learning_rate": 4.936591166869305e-06, + "loss": 0.3184, + "step": 17891 + }, + { + "epoch": 0.52, + "grad_norm": 1.4587885845354198, + "learning_rate": 4.936121497271096e-06, + "loss": 0.302, + "step": 17892 + }, + { + "epoch": 0.52, + "grad_norm": 1.5158529903309779, + "learning_rate": 4.935651828236614e-06, + "loss": 0.3132, + "step": 17893 + }, + { + "epoch": 0.52, + "grad_norm": 1.230846150787605, + "learning_rate": 4.9351821597700054e-06, + "loss": 0.2944, + "step": 17894 + }, + { + "epoch": 0.52, + "grad_norm": 1.486558646425453, + "learning_rate": 4.9347124918754145e-06, + "loss": 0.2984, + "step": 17895 + }, + { + "epoch": 0.52, + "grad_norm": 1.6829161057405815, + "learning_rate": 4.934242824556988e-06, + "loss": 0.2905, + "step": 17896 + }, + { + "epoch": 0.52, + "grad_norm": 1.2741417192127393, + "learning_rate": 4.933773157818868e-06, + "loss": 0.2972, + "step": 17897 + }, + { + "epoch": 0.52, + "grad_norm": 1.313473473375462, + "learning_rate": 4.933303491665201e-06, + "loss": 0.3014, + "step": 17898 + }, + { + "epoch": 0.52, + "grad_norm": 4.4650178055506435, + "learning_rate": 4.932833826100131e-06, + "loss": 0.3282, + "step": 17899 + }, + { + "epoch": 0.52, + "grad_norm": 1.1981623409615896, + "learning_rate": 4.932364161127805e-06, + "loss": 0.3023, + "step": 17900 + }, + { + "epoch": 0.52, + "grad_norm": 1.2358459353841729, + "learning_rate": 4.931894496752365e-06, + "loss": 0.3173, + "step": 17901 + }, + { + "epoch": 0.52, + "grad_norm": 1.462498917539269, + "learning_rate": 4.931424832977956e-06, + "loss": 0.332, + "step": 17902 + }, + { + "epoch": 0.52, + "grad_norm": 1.2363053016004886, + "learning_rate": 4.9309551698087235e-06, + "loss": 0.3059, + "step": 17903 + }, + { + "epoch": 0.52, + "grad_norm": 1.240090322211992, + "learning_rate": 4.930485507248815e-06, + "loss": 0.2929, + "step": 17904 + }, + { + "epoch": 0.52, + "grad_norm": 1.3627709512703958, + "learning_rate": 4.93001584530237e-06, + "loss": 0.299, + "step": 17905 + }, + { + "epoch": 0.52, + "grad_norm": 1.1791602933263978, + "learning_rate": 4.929546183973537e-06, + "loss": 0.2927, + "step": 17906 + }, + { + "epoch": 0.52, + "grad_norm": 1.343085601500648, + "learning_rate": 4.929076523266458e-06, + "loss": 0.304, + "step": 17907 + }, + { + "epoch": 0.52, + "grad_norm": 1.3321463935886841, + "learning_rate": 4.928606863185281e-06, + "loss": 0.3264, + "step": 17908 + }, + { + "epoch": 0.52, + "grad_norm": 1.2436197142982173, + "learning_rate": 4.92813720373415e-06, + "loss": 0.2911, + "step": 17909 + }, + { + "epoch": 0.52, + "grad_norm": 1.3495050129211823, + "learning_rate": 4.927667544917208e-06, + "loss": 0.3072, + "step": 17910 + }, + { + "epoch": 0.52, + "grad_norm": 1.3187626613433776, + "learning_rate": 4.927197886738603e-06, + "loss": 0.3145, + "step": 17911 + }, + { + "epoch": 0.52, + "grad_norm": 1.3378390428651916, + "learning_rate": 4.926728229202475e-06, + "loss": 0.2943, + "step": 17912 + }, + { + "epoch": 0.52, + "grad_norm": 1.3282559952252373, + "learning_rate": 4.926258572312971e-06, + "loss": 0.2978, + "step": 17913 + }, + { + "epoch": 0.52, + "grad_norm": 1.5015890981491729, + "learning_rate": 4.925788916074236e-06, + "loss": 0.3265, + "step": 17914 + }, + { + "epoch": 0.52, + "grad_norm": 2.166648000507846, + "learning_rate": 4.925319260490416e-06, + "loss": 0.3144, + "step": 17915 + }, + { + "epoch": 0.52, + "grad_norm": 1.3780789951254082, + "learning_rate": 4.924849605565653e-06, + "loss": 0.2999, + "step": 17916 + }, + { + "epoch": 0.52, + "grad_norm": 1.2828779861799942, + "learning_rate": 4.924379951304094e-06, + "loss": 0.2995, + "step": 17917 + }, + { + "epoch": 0.52, + "grad_norm": 1.337845494984615, + "learning_rate": 4.923910297709881e-06, + "loss": 0.3022, + "step": 17918 + }, + { + "epoch": 0.52, + "grad_norm": 1.240802346715622, + "learning_rate": 4.923440644787163e-06, + "loss": 0.3045, + "step": 17919 + }, + { + "epoch": 0.52, + "grad_norm": 1.249548716703961, + "learning_rate": 4.92297099254008e-06, + "loss": 0.2919, + "step": 17920 + }, + { + "epoch": 0.52, + "grad_norm": 1.2634153079229882, + "learning_rate": 4.92250134097278e-06, + "loss": 0.2922, + "step": 17921 + }, + { + "epoch": 0.52, + "grad_norm": 1.3535343685792556, + "learning_rate": 4.922031690089406e-06, + "loss": 0.3033, + "step": 17922 + }, + { + "epoch": 0.52, + "grad_norm": 1.22027277926713, + "learning_rate": 4.921562039894104e-06, + "loss": 0.3292, + "step": 17923 + }, + { + "epoch": 0.52, + "grad_norm": 1.2651147851361546, + "learning_rate": 4.9210923903910174e-06, + "loss": 0.2981, + "step": 17924 + }, + { + "epoch": 0.52, + "grad_norm": 1.2553288473653443, + "learning_rate": 4.92062274158429e-06, + "loss": 0.2963, + "step": 17925 + }, + { + "epoch": 0.52, + "grad_norm": 1.3446789262709085, + "learning_rate": 4.9201530934780695e-06, + "loss": 0.3223, + "step": 17926 + }, + { + "epoch": 0.52, + "grad_norm": 1.2517071458866644, + "learning_rate": 4.919683446076498e-06, + "loss": 0.2967, + "step": 17927 + }, + { + "epoch": 0.52, + "grad_norm": 1.7491200494804773, + "learning_rate": 4.919213799383722e-06, + "loss": 0.3506, + "step": 17928 + }, + { + "epoch": 0.52, + "grad_norm": 1.247244184945486, + "learning_rate": 4.918744153403884e-06, + "loss": 0.3096, + "step": 17929 + }, + { + "epoch": 0.52, + "grad_norm": 1.3280929999304885, + "learning_rate": 4.91827450814113e-06, + "loss": 0.3111, + "step": 17930 + }, + { + "epoch": 0.52, + "grad_norm": 1.4928589108185402, + "learning_rate": 4.917804863599605e-06, + "loss": 0.2995, + "step": 17931 + }, + { + "epoch": 0.52, + "grad_norm": 1.3998135567527836, + "learning_rate": 4.917335219783453e-06, + "loss": 0.2945, + "step": 17932 + }, + { + "epoch": 0.52, + "grad_norm": 1.2957138154658967, + "learning_rate": 4.916865576696818e-06, + "loss": 0.2943, + "step": 17933 + }, + { + "epoch": 0.52, + "grad_norm": 1.4014868160931666, + "learning_rate": 4.916395934343845e-06, + "loss": 0.3128, + "step": 17934 + }, + { + "epoch": 0.52, + "grad_norm": 1.2736297244648163, + "learning_rate": 4.915926292728679e-06, + "loss": 0.3159, + "step": 17935 + }, + { + "epoch": 0.52, + "grad_norm": 1.5891729844232851, + "learning_rate": 4.915456651855465e-06, + "loss": 0.3305, + "step": 17936 + }, + { + "epoch": 0.52, + "grad_norm": 1.5044693470541202, + "learning_rate": 4.914987011728347e-06, + "loss": 0.3014, + "step": 17937 + }, + { + "epoch": 0.52, + "grad_norm": 1.275238961465051, + "learning_rate": 4.914517372351468e-06, + "loss": 0.3096, + "step": 17938 + }, + { + "epoch": 0.52, + "grad_norm": 4.429420974547866, + "learning_rate": 4.914047733728978e-06, + "loss": 0.3001, + "step": 17939 + }, + { + "epoch": 0.52, + "grad_norm": 1.599427473664545, + "learning_rate": 4.913578095865014e-06, + "loss": 0.3244, + "step": 17940 + }, + { + "epoch": 0.52, + "grad_norm": 1.3170522331043035, + "learning_rate": 4.913108458763725e-06, + "loss": 0.3085, + "step": 17941 + }, + { + "epoch": 0.52, + "grad_norm": 1.37084541237482, + "learning_rate": 4.912638822429257e-06, + "loss": 0.3224, + "step": 17942 + }, + { + "epoch": 0.52, + "grad_norm": 1.604863425386245, + "learning_rate": 4.91216918686575e-06, + "loss": 0.325, + "step": 17943 + }, + { + "epoch": 0.52, + "grad_norm": 1.343520698006707, + "learning_rate": 4.91169955207735e-06, + "loss": 0.3283, + "step": 17944 + }, + { + "epoch": 0.52, + "grad_norm": 1.347779526148395, + "learning_rate": 4.911229918068204e-06, + "loss": 0.3042, + "step": 17945 + }, + { + "epoch": 0.52, + "grad_norm": 1.2311492934674522, + "learning_rate": 4.910760284842454e-06, + "loss": 0.2738, + "step": 17946 + }, + { + "epoch": 0.52, + "grad_norm": 1.2718527299160767, + "learning_rate": 4.910290652404247e-06, + "loss": 0.3091, + "step": 17947 + }, + { + "epoch": 0.52, + "grad_norm": 1.2652368076749043, + "learning_rate": 4.909821020757725e-06, + "loss": 0.3034, + "step": 17948 + }, + { + "epoch": 0.52, + "grad_norm": 1.3727394683071197, + "learning_rate": 4.909351389907034e-06, + "loss": 0.2947, + "step": 17949 + }, + { + "epoch": 0.52, + "grad_norm": 1.9948673327298525, + "learning_rate": 4.908881759856317e-06, + "loss": 0.3163, + "step": 17950 + }, + { + "epoch": 0.52, + "grad_norm": 1.26384263249236, + "learning_rate": 4.908412130609721e-06, + "loss": 0.3226, + "step": 17951 + }, + { + "epoch": 0.52, + "grad_norm": 1.3624095836842907, + "learning_rate": 4.907942502171388e-06, + "loss": 0.3094, + "step": 17952 + }, + { + "epoch": 0.52, + "grad_norm": 1.4915006268006197, + "learning_rate": 4.9074728745454635e-06, + "loss": 0.3167, + "step": 17953 + }, + { + "epoch": 0.52, + "grad_norm": 1.6287603459271887, + "learning_rate": 4.907003247736091e-06, + "loss": 0.3144, + "step": 17954 + }, + { + "epoch": 0.52, + "grad_norm": 1.2785437056030553, + "learning_rate": 4.906533621747418e-06, + "loss": 0.2947, + "step": 17955 + }, + { + "epoch": 0.52, + "grad_norm": 1.3209840997735782, + "learning_rate": 4.906063996583585e-06, + "loss": 0.2959, + "step": 17956 + }, + { + "epoch": 0.52, + "grad_norm": 1.2981000595837018, + "learning_rate": 4.9055943722487384e-06, + "loss": 0.3011, + "step": 17957 + }, + { + "epoch": 0.52, + "grad_norm": 1.2565395154600414, + "learning_rate": 4.905124748747023e-06, + "loss": 0.2918, + "step": 17958 + }, + { + "epoch": 0.52, + "grad_norm": 1.2453340062460856, + "learning_rate": 4.904655126082584e-06, + "loss": 0.2834, + "step": 17959 + }, + { + "epoch": 0.52, + "grad_norm": 1.7976215988152198, + "learning_rate": 4.904185504259563e-06, + "loss": 0.3061, + "step": 17960 + }, + { + "epoch": 0.52, + "grad_norm": 1.2675764669686431, + "learning_rate": 4.9037158832821055e-06, + "loss": 0.3057, + "step": 17961 + }, + { + "epoch": 0.52, + "grad_norm": 1.4462389667961977, + "learning_rate": 4.903246263154358e-06, + "loss": 0.2931, + "step": 17962 + }, + { + "epoch": 0.52, + "grad_norm": 1.3534202338757235, + "learning_rate": 4.902776643880461e-06, + "loss": 0.306, + "step": 17963 + }, + { + "epoch": 0.52, + "grad_norm": 1.3211629475684514, + "learning_rate": 4.9023070254645646e-06, + "loss": 0.2823, + "step": 17964 + }, + { + "epoch": 0.52, + "grad_norm": 1.2508025274624024, + "learning_rate": 4.901837407910807e-06, + "loss": 0.3099, + "step": 17965 + }, + { + "epoch": 0.52, + "grad_norm": 1.3607597564763372, + "learning_rate": 4.901367791223336e-06, + "loss": 0.316, + "step": 17966 + }, + { + "epoch": 0.52, + "grad_norm": 2.362888828505844, + "learning_rate": 4.900898175406298e-06, + "loss": 0.2968, + "step": 17967 + }, + { + "epoch": 0.52, + "grad_norm": 1.2611458339601709, + "learning_rate": 4.900428560463832e-06, + "loss": 0.3132, + "step": 17968 + }, + { + "epoch": 0.52, + "grad_norm": 1.5396628596309183, + "learning_rate": 4.899958946400085e-06, + "loss": 0.298, + "step": 17969 + }, + { + "epoch": 0.52, + "grad_norm": 1.7871912536507781, + "learning_rate": 4.899489333219202e-06, + "loss": 0.3051, + "step": 17970 + }, + { + "epoch": 0.52, + "grad_norm": 1.4296594250641277, + "learning_rate": 4.899019720925326e-06, + "loss": 0.337, + "step": 17971 + }, + { + "epoch": 0.52, + "grad_norm": 1.2641835002389787, + "learning_rate": 4.8985501095226026e-06, + "loss": 0.3005, + "step": 17972 + }, + { + "epoch": 0.52, + "grad_norm": 1.438636512504418, + "learning_rate": 4.898080499015176e-06, + "loss": 0.3161, + "step": 17973 + }, + { + "epoch": 0.52, + "grad_norm": 1.9303574910596246, + "learning_rate": 4.89761088940719e-06, + "loss": 0.3233, + "step": 17974 + }, + { + "epoch": 0.52, + "grad_norm": 1.3122819723705423, + "learning_rate": 4.897141280702788e-06, + "loss": 0.3158, + "step": 17975 + }, + { + "epoch": 0.52, + "grad_norm": 1.3933006736890663, + "learning_rate": 4.896671672906117e-06, + "loss": 0.3001, + "step": 17976 + }, + { + "epoch": 0.52, + "grad_norm": 2.1309992543684375, + "learning_rate": 4.8962020660213184e-06, + "loss": 0.3171, + "step": 17977 + }, + { + "epoch": 0.52, + "grad_norm": 1.3381361783492878, + "learning_rate": 4.895732460052539e-06, + "loss": 0.321, + "step": 17978 + }, + { + "epoch": 0.52, + "grad_norm": 1.371645644409727, + "learning_rate": 4.89526285500392e-06, + "loss": 0.3085, + "step": 17979 + }, + { + "epoch": 0.52, + "grad_norm": 1.5176170958009885, + "learning_rate": 4.894793250879608e-06, + "loss": 0.3224, + "step": 17980 + }, + { + "epoch": 0.52, + "grad_norm": 1.272140442177049, + "learning_rate": 4.894323647683747e-06, + "loss": 0.2929, + "step": 17981 + }, + { + "epoch": 0.52, + "grad_norm": 1.6777878086028948, + "learning_rate": 4.893854045420481e-06, + "loss": 0.291, + "step": 17982 + }, + { + "epoch": 0.52, + "grad_norm": 1.4628498228318816, + "learning_rate": 4.893384444093955e-06, + "loss": 0.3076, + "step": 17983 + }, + { + "epoch": 0.52, + "grad_norm": 1.3833677993048235, + "learning_rate": 4.892914843708311e-06, + "loss": 0.317, + "step": 17984 + }, + { + "epoch": 0.52, + "grad_norm": 1.6743176163124278, + "learning_rate": 4.892445244267696e-06, + "loss": 0.3294, + "step": 17985 + }, + { + "epoch": 0.52, + "grad_norm": 1.3134594686651235, + "learning_rate": 4.891975645776251e-06, + "loss": 0.2996, + "step": 17986 + }, + { + "epoch": 0.52, + "grad_norm": 1.2006057098888092, + "learning_rate": 4.891506048238125e-06, + "loss": 0.3072, + "step": 17987 + }, + { + "epoch": 0.52, + "grad_norm": 1.776653068232362, + "learning_rate": 4.891036451657457e-06, + "loss": 0.3484, + "step": 17988 + }, + { + "epoch": 0.52, + "grad_norm": 1.209688157415977, + "learning_rate": 4.890566856038395e-06, + "loss": 0.3002, + "step": 17989 + }, + { + "epoch": 0.52, + "grad_norm": 1.3401602934060712, + "learning_rate": 4.89009726138508e-06, + "loss": 0.2871, + "step": 17990 + }, + { + "epoch": 0.52, + "grad_norm": 0.9226773093115621, + "learning_rate": 4.8896276677016595e-06, + "loss": 0.6184, + "step": 17991 + }, + { + "epoch": 0.52, + "grad_norm": 1.3570620048673774, + "learning_rate": 4.889158074992276e-06, + "loss": 0.2928, + "step": 17992 + }, + { + "epoch": 0.52, + "grad_norm": 1.3297039478576593, + "learning_rate": 4.888688483261073e-06, + "loss": 0.326, + "step": 17993 + }, + { + "epoch": 0.52, + "grad_norm": 1.2213131088705824, + "learning_rate": 4.8882188925121944e-06, + "loss": 0.3034, + "step": 17994 + }, + { + "epoch": 0.52, + "grad_norm": 1.2882507877592035, + "learning_rate": 4.887749302749789e-06, + "loss": 0.2955, + "step": 17995 + }, + { + "epoch": 0.52, + "grad_norm": 1.3390156600502647, + "learning_rate": 4.887279713977995e-06, + "loss": 0.3127, + "step": 17996 + }, + { + "epoch": 0.52, + "grad_norm": 1.245107415243777, + "learning_rate": 4.886810126200959e-06, + "loss": 0.3129, + "step": 17997 + }, + { + "epoch": 0.52, + "grad_norm": 1.3167882825968624, + "learning_rate": 4.8863405394228235e-06, + "loss": 0.3323, + "step": 17998 + }, + { + "epoch": 0.52, + "grad_norm": 1.2539102053907552, + "learning_rate": 4.885870953647735e-06, + "loss": 0.2965, + "step": 17999 + }, + { + "epoch": 0.52, + "grad_norm": 3.014408342153114, + "learning_rate": 4.885401368879836e-06, + "loss": 0.3199, + "step": 18000 + }, + { + "epoch": 0.52, + "grad_norm": 1.752280879423083, + "learning_rate": 4.884931785123272e-06, + "loss": 0.2922, + "step": 18001 + }, + { + "epoch": 0.52, + "grad_norm": 1.4553589177398991, + "learning_rate": 4.884462202382186e-06, + "loss": 0.2713, + "step": 18002 + }, + { + "epoch": 0.52, + "grad_norm": 1.3866568218455233, + "learning_rate": 4.883992620660722e-06, + "loss": 0.3067, + "step": 18003 + }, + { + "epoch": 0.52, + "grad_norm": 1.2926939586684765, + "learning_rate": 4.883523039963025e-06, + "loss": 0.2923, + "step": 18004 + }, + { + "epoch": 0.52, + "grad_norm": 1.2056987100036802, + "learning_rate": 4.883053460293237e-06, + "loss": 0.3277, + "step": 18005 + }, + { + "epoch": 0.52, + "grad_norm": 1.5343221927694104, + "learning_rate": 4.882583881655506e-06, + "loss": 0.2945, + "step": 18006 + }, + { + "epoch": 0.52, + "grad_norm": 1.3212509957278613, + "learning_rate": 4.882114304053972e-06, + "loss": 0.3268, + "step": 18007 + }, + { + "epoch": 0.52, + "grad_norm": 1.377280655061186, + "learning_rate": 4.88164472749278e-06, + "loss": 0.3179, + "step": 18008 + }, + { + "epoch": 0.52, + "grad_norm": 1.6212742808272644, + "learning_rate": 4.881175151976075e-06, + "loss": 0.3073, + "step": 18009 + }, + { + "epoch": 0.52, + "grad_norm": 3.2382774269563233, + "learning_rate": 4.880705577508003e-06, + "loss": 0.3303, + "step": 18010 + }, + { + "epoch": 0.52, + "grad_norm": 1.271950095890388, + "learning_rate": 4.880236004092703e-06, + "loss": 0.2965, + "step": 18011 + }, + { + "epoch": 0.52, + "grad_norm": 1.3295852165013453, + "learning_rate": 4.879766431734321e-06, + "loss": 0.2997, + "step": 18012 + }, + { + "epoch": 0.52, + "grad_norm": 1.20601401426565, + "learning_rate": 4.879296860437003e-06, + "loss": 0.3109, + "step": 18013 + }, + { + "epoch": 0.52, + "grad_norm": 1.3812528117488732, + "learning_rate": 4.8788272902048925e-06, + "loss": 0.328, + "step": 18014 + }, + { + "epoch": 0.52, + "grad_norm": 1.2585231669734374, + "learning_rate": 4.878357721042131e-06, + "loss": 0.2927, + "step": 18015 + }, + { + "epoch": 0.52, + "grad_norm": 1.7435037479932063, + "learning_rate": 4.8778881529528635e-06, + "loss": 0.2965, + "step": 18016 + }, + { + "epoch": 0.52, + "grad_norm": 2.6974462013238756, + "learning_rate": 4.877418585941235e-06, + "loss": 0.3118, + "step": 18017 + }, + { + "epoch": 0.52, + "grad_norm": 1.3376396171225227, + "learning_rate": 4.8769490200113886e-06, + "loss": 0.3378, + "step": 18018 + }, + { + "epoch": 0.52, + "grad_norm": 1.2439805672775615, + "learning_rate": 4.87647945516747e-06, + "loss": 0.2921, + "step": 18019 + }, + { + "epoch": 0.52, + "grad_norm": 1.2509972593433067, + "learning_rate": 4.87600989141362e-06, + "loss": 0.3111, + "step": 18020 + }, + { + "epoch": 0.52, + "grad_norm": 1.4475660027922053, + "learning_rate": 4.875540328753985e-06, + "loss": 0.3308, + "step": 18021 + }, + { + "epoch": 0.52, + "grad_norm": 1.356121985395413, + "learning_rate": 4.875070767192706e-06, + "loss": 0.3032, + "step": 18022 + }, + { + "epoch": 0.52, + "grad_norm": 1.3643872631527194, + "learning_rate": 4.874601206733933e-06, + "loss": 0.3195, + "step": 18023 + }, + { + "epoch": 0.52, + "grad_norm": 1.2535019493385087, + "learning_rate": 4.874131647381803e-06, + "loss": 0.2958, + "step": 18024 + }, + { + "epoch": 0.52, + "grad_norm": 1.3285603062706428, + "learning_rate": 4.873662089140463e-06, + "loss": 0.3335, + "step": 18025 + }, + { + "epoch": 0.52, + "grad_norm": 1.2773150776088147, + "learning_rate": 4.873192532014056e-06, + "loss": 0.3043, + "step": 18026 + }, + { + "epoch": 0.52, + "grad_norm": 1.4197107335174786, + "learning_rate": 4.8727229760067266e-06, + "loss": 0.316, + "step": 18027 + }, + { + "epoch": 0.52, + "grad_norm": 1.3073981655647893, + "learning_rate": 4.872253421122618e-06, + "loss": 0.3009, + "step": 18028 + }, + { + "epoch": 0.52, + "grad_norm": 1.5453721900405397, + "learning_rate": 4.871783867365876e-06, + "loss": 0.3155, + "step": 18029 + }, + { + "epoch": 0.52, + "grad_norm": 1.333625042026758, + "learning_rate": 4.871314314740641e-06, + "loss": 0.293, + "step": 18030 + }, + { + "epoch": 0.52, + "grad_norm": 1.3297367222453453, + "learning_rate": 4.870844763251058e-06, + "loss": 0.2994, + "step": 18031 + }, + { + "epoch": 0.52, + "grad_norm": 1.4195286233267632, + "learning_rate": 4.870375212901273e-06, + "loss": 0.2941, + "step": 18032 + }, + { + "epoch": 0.52, + "grad_norm": 1.3533161590413012, + "learning_rate": 4.869905663695428e-06, + "loss": 0.3174, + "step": 18033 + }, + { + "epoch": 0.52, + "grad_norm": 1.2642234279042697, + "learning_rate": 4.869436115637666e-06, + "loss": 0.2994, + "step": 18034 + }, + { + "epoch": 0.52, + "grad_norm": 1.247265855253385, + "learning_rate": 4.868966568732132e-06, + "loss": 0.309, + "step": 18035 + }, + { + "epoch": 0.52, + "grad_norm": 1.2452618225569199, + "learning_rate": 4.8684970229829695e-06, + "loss": 0.2948, + "step": 18036 + }, + { + "epoch": 0.52, + "grad_norm": 1.3410171758179381, + "learning_rate": 4.868027478394324e-06, + "loss": 0.2774, + "step": 18037 + }, + { + "epoch": 0.52, + "grad_norm": 1.2911337483046055, + "learning_rate": 4.867557934970335e-06, + "loss": 0.2966, + "step": 18038 + }, + { + "epoch": 0.52, + "grad_norm": 1.2582906677020937, + "learning_rate": 4.86708839271515e-06, + "loss": 0.3096, + "step": 18039 + }, + { + "epoch": 0.52, + "grad_norm": 1.3232322252932895, + "learning_rate": 4.866618851632911e-06, + "loss": 0.3049, + "step": 18040 + }, + { + "epoch": 0.52, + "grad_norm": 1.1780671488902281, + "learning_rate": 4.866149311727761e-06, + "loss": 0.2951, + "step": 18041 + }, + { + "epoch": 0.52, + "grad_norm": 1.1693498288860202, + "learning_rate": 4.865679773003847e-06, + "loss": 0.3053, + "step": 18042 + }, + { + "epoch": 0.52, + "grad_norm": 1.2731269005148853, + "learning_rate": 4.8652102354653095e-06, + "loss": 0.3038, + "step": 18043 + }, + { + "epoch": 0.52, + "grad_norm": 1.3582867423613636, + "learning_rate": 4.864740699116292e-06, + "loss": 0.3181, + "step": 18044 + }, + { + "epoch": 0.52, + "grad_norm": 1.4178586799692325, + "learning_rate": 4.864271163960941e-06, + "loss": 0.3228, + "step": 18045 + }, + { + "epoch": 0.52, + "grad_norm": 1.79560212990933, + "learning_rate": 4.863801630003399e-06, + "loss": 0.2839, + "step": 18046 + }, + { + "epoch": 0.52, + "grad_norm": 1.32658023742568, + "learning_rate": 4.8633320972478075e-06, + "loss": 0.3196, + "step": 18047 + }, + { + "epoch": 0.52, + "grad_norm": 1.3419645964059401, + "learning_rate": 4.862862565698313e-06, + "loss": 0.2846, + "step": 18048 + }, + { + "epoch": 0.52, + "grad_norm": 0.9911318888937661, + "learning_rate": 4.862393035359057e-06, + "loss": 0.5983, + "step": 18049 + }, + { + "epoch": 0.52, + "grad_norm": 1.307966915364942, + "learning_rate": 4.861923506234185e-06, + "loss": 0.3002, + "step": 18050 + }, + { + "epoch": 0.52, + "grad_norm": 1.2063519974290577, + "learning_rate": 4.861453978327838e-06, + "loss": 0.3002, + "step": 18051 + }, + { + "epoch": 0.52, + "grad_norm": 1.5498226994198048, + "learning_rate": 4.860984451644164e-06, + "loss": 0.3197, + "step": 18052 + }, + { + "epoch": 0.52, + "grad_norm": 1.6184014977812715, + "learning_rate": 4.8605149261873015e-06, + "loss": 0.3237, + "step": 18053 + }, + { + "epoch": 0.52, + "grad_norm": 1.3275556493528209, + "learning_rate": 4.860045401961397e-06, + "loss": 0.3154, + "step": 18054 + }, + { + "epoch": 0.52, + "grad_norm": 0.9340343747805335, + "learning_rate": 4.859575878970592e-06, + "loss": 0.5786, + "step": 18055 + }, + { + "epoch": 0.52, + "grad_norm": 1.289356007752792, + "learning_rate": 4.859106357219033e-06, + "loss": 0.3231, + "step": 18056 + }, + { + "epoch": 0.52, + "grad_norm": 1.5732067802237408, + "learning_rate": 4.858636836710861e-06, + "loss": 0.3223, + "step": 18057 + }, + { + "epoch": 0.52, + "grad_norm": 1.375830980700696, + "learning_rate": 4.858167317450221e-06, + "loss": 0.2915, + "step": 18058 + }, + { + "epoch": 0.52, + "grad_norm": 1.1801727127983952, + "learning_rate": 4.857697799441255e-06, + "loss": 0.2963, + "step": 18059 + }, + { + "epoch": 0.52, + "grad_norm": 1.4303076218986495, + "learning_rate": 4.857228282688108e-06, + "loss": 0.3577, + "step": 18060 + }, + { + "epoch": 0.52, + "grad_norm": 1.3973899658684639, + "learning_rate": 4.856758767194924e-06, + "loss": 0.2759, + "step": 18061 + }, + { + "epoch": 0.52, + "grad_norm": 1.411951347552949, + "learning_rate": 4.856289252965845e-06, + "loss": 0.3158, + "step": 18062 + }, + { + "epoch": 0.52, + "grad_norm": 1.3961215530044233, + "learning_rate": 4.855819740005015e-06, + "loss": 0.2955, + "step": 18063 + }, + { + "epoch": 0.52, + "grad_norm": 1.2920954786082008, + "learning_rate": 4.8553502283165765e-06, + "loss": 0.3142, + "step": 18064 + }, + { + "epoch": 0.52, + "grad_norm": 1.2586630296670016, + "learning_rate": 4.854880717904675e-06, + "loss": 0.3066, + "step": 18065 + }, + { + "epoch": 0.52, + "grad_norm": 1.9589825356957642, + "learning_rate": 4.854411208773452e-06, + "loss": 0.2927, + "step": 18066 + }, + { + "epoch": 0.52, + "grad_norm": 1.8948861130333885, + "learning_rate": 4.853941700927052e-06, + "loss": 0.3006, + "step": 18067 + }, + { + "epoch": 0.52, + "grad_norm": 1.4892466354039096, + "learning_rate": 4.853472194369617e-06, + "loss": 0.3089, + "step": 18068 + }, + { + "epoch": 0.52, + "grad_norm": 1.1961556603897603, + "learning_rate": 4.853002689105294e-06, + "loss": 0.2903, + "step": 18069 + }, + { + "epoch": 0.52, + "grad_norm": 1.3995708939955798, + "learning_rate": 4.8525331851382215e-06, + "loss": 0.3164, + "step": 18070 + }, + { + "epoch": 0.52, + "grad_norm": 1.4145958370947687, + "learning_rate": 4.852063682472547e-06, + "loss": 0.2991, + "step": 18071 + }, + { + "epoch": 0.52, + "grad_norm": 1.248553212245451, + "learning_rate": 4.851594181112411e-06, + "loss": 0.3034, + "step": 18072 + }, + { + "epoch": 0.52, + "grad_norm": 1.3653624199304082, + "learning_rate": 4.851124681061959e-06, + "loss": 0.3449, + "step": 18073 + }, + { + "epoch": 0.52, + "grad_norm": 1.2614824543165566, + "learning_rate": 4.850655182325332e-06, + "loss": 0.286, + "step": 18074 + }, + { + "epoch": 0.52, + "grad_norm": 1.2929666274135148, + "learning_rate": 4.850185684906675e-06, + "loss": 0.2797, + "step": 18075 + }, + { + "epoch": 0.52, + "grad_norm": 2.1353228443113728, + "learning_rate": 4.849716188810131e-06, + "loss": 0.2874, + "step": 18076 + }, + { + "epoch": 0.52, + "grad_norm": 1.2470026799397835, + "learning_rate": 4.849246694039844e-06, + "loss": 0.3434, + "step": 18077 + }, + { + "epoch": 0.52, + "grad_norm": 1.287036214975257, + "learning_rate": 4.848777200599956e-06, + "loss": 0.3072, + "step": 18078 + }, + { + "epoch": 0.52, + "grad_norm": 1.1705542447696409, + "learning_rate": 4.848307708494612e-06, + "loss": 0.298, + "step": 18079 + }, + { + "epoch": 0.52, + "grad_norm": 1.3827220784034155, + "learning_rate": 4.847838217727955e-06, + "loss": 0.2963, + "step": 18080 + }, + { + "epoch": 0.52, + "grad_norm": 1.3123777881372127, + "learning_rate": 4.847368728304125e-06, + "loss": 0.301, + "step": 18081 + }, + { + "epoch": 0.52, + "grad_norm": 1.2035368693861193, + "learning_rate": 4.846899240227268e-06, + "loss": 0.3141, + "step": 18082 + }, + { + "epoch": 0.52, + "grad_norm": 1.2062621625198986, + "learning_rate": 4.846429753501528e-06, + "loss": 0.2841, + "step": 18083 + }, + { + "epoch": 0.52, + "grad_norm": 2.0066195952092483, + "learning_rate": 4.845960268131047e-06, + "loss": 0.2828, + "step": 18084 + }, + { + "epoch": 0.52, + "grad_norm": 1.4210436983149595, + "learning_rate": 4.845490784119967e-06, + "loss": 0.3244, + "step": 18085 + }, + { + "epoch": 0.52, + "grad_norm": 1.5704708678590011, + "learning_rate": 4.845021301472434e-06, + "loss": 0.3182, + "step": 18086 + }, + { + "epoch": 0.52, + "grad_norm": 1.2966931427256796, + "learning_rate": 4.8445518201925885e-06, + "loss": 0.2986, + "step": 18087 + }, + { + "epoch": 0.52, + "grad_norm": 1.2666960202716508, + "learning_rate": 4.844082340284576e-06, + "loss": 0.3296, + "step": 18088 + }, + { + "epoch": 0.52, + "grad_norm": 1.2633581438895984, + "learning_rate": 4.843612861752538e-06, + "loss": 0.3207, + "step": 18089 + }, + { + "epoch": 0.52, + "grad_norm": 1.3358535018522213, + "learning_rate": 4.843143384600618e-06, + "loss": 0.2883, + "step": 18090 + }, + { + "epoch": 0.52, + "grad_norm": 1.3487628026734662, + "learning_rate": 4.842673908832959e-06, + "loss": 0.2933, + "step": 18091 + }, + { + "epoch": 0.52, + "grad_norm": 1.5683405098962688, + "learning_rate": 4.842204434453706e-06, + "loss": 0.3389, + "step": 18092 + }, + { + "epoch": 0.52, + "grad_norm": 1.3934184133535767, + "learning_rate": 4.841734961467e-06, + "loss": 0.312, + "step": 18093 + }, + { + "epoch": 0.52, + "grad_norm": 1.827465412127034, + "learning_rate": 4.841265489876984e-06, + "loss": 0.3238, + "step": 18094 + }, + { + "epoch": 0.52, + "grad_norm": 1.7471262459366843, + "learning_rate": 4.840796019687802e-06, + "loss": 0.3001, + "step": 18095 + }, + { + "epoch": 0.52, + "grad_norm": 1.2138918773678151, + "learning_rate": 4.840326550903597e-06, + "loss": 0.2922, + "step": 18096 + }, + { + "epoch": 0.52, + "grad_norm": 1.2861253206182757, + "learning_rate": 4.8398570835285124e-06, + "loss": 0.3433, + "step": 18097 + }, + { + "epoch": 0.52, + "grad_norm": 1.541153067971077, + "learning_rate": 4.8393876175666905e-06, + "loss": 0.2959, + "step": 18098 + }, + { + "epoch": 0.52, + "grad_norm": 1.2275088991056207, + "learning_rate": 4.838918153022275e-06, + "loss": 0.2938, + "step": 18099 + }, + { + "epoch": 0.52, + "grad_norm": 1.3156355259253179, + "learning_rate": 4.8384486898994076e-06, + "loss": 0.3152, + "step": 18100 + }, + { + "epoch": 0.53, + "grad_norm": 1.3917746977884535, + "learning_rate": 4.837979228202234e-06, + "loss": 0.3575, + "step": 18101 + }, + { + "epoch": 0.53, + "grad_norm": 1.41719863637929, + "learning_rate": 4.837509767934893e-06, + "loss": 0.3037, + "step": 18102 + }, + { + "epoch": 0.53, + "grad_norm": 1.2399834773505973, + "learning_rate": 4.837040309101532e-06, + "loss": 0.2997, + "step": 18103 + }, + { + "epoch": 0.53, + "grad_norm": 1.0411382836720025, + "learning_rate": 4.8365708517062915e-06, + "loss": 0.655, + "step": 18104 + }, + { + "epoch": 0.53, + "grad_norm": 1.563537283696409, + "learning_rate": 4.836101395753317e-06, + "loss": 0.3068, + "step": 18105 + }, + { + "epoch": 0.53, + "grad_norm": 1.2612238285241457, + "learning_rate": 4.835631941246747e-06, + "loss": 0.314, + "step": 18106 + }, + { + "epoch": 0.53, + "grad_norm": 1.4764488839536616, + "learning_rate": 4.8351624881907274e-06, + "loss": 0.3183, + "step": 18107 + }, + { + "epoch": 0.53, + "grad_norm": 0.9503546136907873, + "learning_rate": 4.834693036589402e-06, + "loss": 0.5586, + "step": 18108 + }, + { + "epoch": 0.53, + "grad_norm": 1.3560381134057358, + "learning_rate": 4.8342235864469116e-06, + "loss": 0.3271, + "step": 18109 + }, + { + "epoch": 0.53, + "grad_norm": 1.2141083319343478, + "learning_rate": 4.833754137767399e-06, + "loss": 0.2857, + "step": 18110 + }, + { + "epoch": 0.53, + "grad_norm": 1.8417437162193997, + "learning_rate": 4.83328469055501e-06, + "loss": 0.2989, + "step": 18111 + }, + { + "epoch": 0.53, + "grad_norm": 1.4093411468683483, + "learning_rate": 4.832815244813884e-06, + "loss": 0.3473, + "step": 18112 + }, + { + "epoch": 0.53, + "grad_norm": 1.3750012615365295, + "learning_rate": 4.8323458005481646e-06, + "loss": 0.3379, + "step": 18113 + }, + { + "epoch": 0.53, + "grad_norm": 1.4119429544319475, + "learning_rate": 4.831876357761996e-06, + "loss": 0.3071, + "step": 18114 + }, + { + "epoch": 0.53, + "grad_norm": 1.2171229258422536, + "learning_rate": 4.83140691645952e-06, + "loss": 0.3098, + "step": 18115 + }, + { + "epoch": 0.53, + "grad_norm": 1.276164606231441, + "learning_rate": 4.8309374766448815e-06, + "loss": 0.3289, + "step": 18116 + }, + { + "epoch": 0.53, + "grad_norm": 1.3046122507998839, + "learning_rate": 4.83046803832222e-06, + "loss": 0.3232, + "step": 18117 + }, + { + "epoch": 0.53, + "grad_norm": 1.241468172387908, + "learning_rate": 4.829998601495681e-06, + "loss": 0.3139, + "step": 18118 + }, + { + "epoch": 0.53, + "grad_norm": 1.4422301855342274, + "learning_rate": 4.829529166169405e-06, + "loss": 0.3314, + "step": 18119 + }, + { + "epoch": 0.53, + "grad_norm": 2.0370062519684433, + "learning_rate": 4.829059732347538e-06, + "loss": 0.3009, + "step": 18120 + }, + { + "epoch": 0.53, + "grad_norm": 1.2537584385715148, + "learning_rate": 4.828590300034219e-06, + "loss": 0.279, + "step": 18121 + }, + { + "epoch": 0.53, + "grad_norm": 1.2382325482512573, + "learning_rate": 4.828120869233592e-06, + "loss": 0.2948, + "step": 18122 + }, + { + "epoch": 0.53, + "grad_norm": 1.186281596376115, + "learning_rate": 4.827651439949801e-06, + "loss": 0.2932, + "step": 18123 + }, + { + "epoch": 0.53, + "grad_norm": 1.3345090159461943, + "learning_rate": 4.827182012186989e-06, + "loss": 0.3181, + "step": 18124 + }, + { + "epoch": 0.53, + "grad_norm": 1.2321584653452136, + "learning_rate": 4.826712585949297e-06, + "loss": 0.3258, + "step": 18125 + }, + { + "epoch": 0.53, + "grad_norm": 1.5594113692850355, + "learning_rate": 4.826243161240868e-06, + "loss": 0.3192, + "step": 18126 + }, + { + "epoch": 0.53, + "grad_norm": 2.1390827095258134, + "learning_rate": 4.825773738065844e-06, + "loss": 0.2958, + "step": 18127 + }, + { + "epoch": 0.53, + "grad_norm": 1.3358831832287412, + "learning_rate": 4.825304316428371e-06, + "loss": 0.2901, + "step": 18128 + }, + { + "epoch": 0.53, + "grad_norm": 1.365643605696553, + "learning_rate": 4.824834896332588e-06, + "loss": 0.2955, + "step": 18129 + }, + { + "epoch": 0.53, + "grad_norm": 1.2181459411060345, + "learning_rate": 4.824365477782639e-06, + "loss": 0.2837, + "step": 18130 + }, + { + "epoch": 0.53, + "grad_norm": 1.4568383195700094, + "learning_rate": 4.823896060782667e-06, + "loss": 0.3143, + "step": 18131 + }, + { + "epoch": 0.53, + "grad_norm": 1.2201646876208114, + "learning_rate": 4.823426645336814e-06, + "loss": 0.307, + "step": 18132 + }, + { + "epoch": 0.53, + "grad_norm": 1.3148664101564072, + "learning_rate": 4.822957231449223e-06, + "loss": 0.3098, + "step": 18133 + }, + { + "epoch": 0.53, + "grad_norm": 1.1876577345266666, + "learning_rate": 4.822487819124036e-06, + "loss": 0.2791, + "step": 18134 + }, + { + "epoch": 0.53, + "grad_norm": 1.3011530490414078, + "learning_rate": 4.822018408365396e-06, + "loss": 0.2842, + "step": 18135 + }, + { + "epoch": 0.53, + "grad_norm": 1.4603041432520731, + "learning_rate": 4.821548999177448e-06, + "loss": 0.2848, + "step": 18136 + }, + { + "epoch": 0.53, + "grad_norm": 1.222660690527223, + "learning_rate": 4.821079591564329e-06, + "loss": 0.2831, + "step": 18137 + }, + { + "epoch": 0.53, + "grad_norm": 1.2847835693039151, + "learning_rate": 4.820610185530185e-06, + "loss": 0.2912, + "step": 18138 + }, + { + "epoch": 0.53, + "grad_norm": 1.373769465606506, + "learning_rate": 4.82014078107916e-06, + "loss": 0.3145, + "step": 18139 + }, + { + "epoch": 0.53, + "grad_norm": 1.577996243726569, + "learning_rate": 4.819671378215393e-06, + "loss": 0.3075, + "step": 18140 + }, + { + "epoch": 0.53, + "grad_norm": 1.6323154910727864, + "learning_rate": 4.819201976943028e-06, + "loss": 0.3212, + "step": 18141 + }, + { + "epoch": 0.53, + "grad_norm": 1.6142294580395962, + "learning_rate": 4.818732577266207e-06, + "loss": 0.3215, + "step": 18142 + }, + { + "epoch": 0.53, + "grad_norm": 1.54247157332653, + "learning_rate": 4.818263179189074e-06, + "loss": 0.3215, + "step": 18143 + }, + { + "epoch": 0.53, + "grad_norm": 1.3251960938862895, + "learning_rate": 4.81779378271577e-06, + "loss": 0.3026, + "step": 18144 + }, + { + "epoch": 0.53, + "grad_norm": 1.2623751198486906, + "learning_rate": 4.817324387850438e-06, + "loss": 0.3189, + "step": 18145 + }, + { + "epoch": 0.53, + "grad_norm": 1.4093819897956517, + "learning_rate": 4.8168549945972205e-06, + "loss": 0.2929, + "step": 18146 + }, + { + "epoch": 0.53, + "grad_norm": 1.4328346713700841, + "learning_rate": 4.81638560296026e-06, + "loss": 0.3089, + "step": 18147 + }, + { + "epoch": 0.53, + "grad_norm": 1.243798410725753, + "learning_rate": 4.815916212943698e-06, + "loss": 0.2848, + "step": 18148 + }, + { + "epoch": 0.53, + "grad_norm": 1.5356642140780188, + "learning_rate": 4.815446824551678e-06, + "loss": 0.3254, + "step": 18149 + }, + { + "epoch": 0.53, + "grad_norm": 1.3295564040474401, + "learning_rate": 4.814977437788341e-06, + "loss": 0.3089, + "step": 18150 + }, + { + "epoch": 0.53, + "grad_norm": 3.2686383392438136, + "learning_rate": 4.8145080526578306e-06, + "loss": 0.3572, + "step": 18151 + }, + { + "epoch": 0.53, + "grad_norm": 3.0719819194190143, + "learning_rate": 4.814038669164289e-06, + "loss": 0.3117, + "step": 18152 + }, + { + "epoch": 0.53, + "grad_norm": 1.2470317266036375, + "learning_rate": 4.813569287311858e-06, + "loss": 0.2834, + "step": 18153 + }, + { + "epoch": 0.53, + "grad_norm": 1.2596005536403998, + "learning_rate": 4.81309990710468e-06, + "loss": 0.2884, + "step": 18154 + }, + { + "epoch": 0.53, + "grad_norm": 1.3086211832955048, + "learning_rate": 4.812630528546896e-06, + "loss": 0.307, + "step": 18155 + }, + { + "epoch": 0.53, + "grad_norm": 1.4137811678718732, + "learning_rate": 4.812161151642653e-06, + "loss": 0.3125, + "step": 18156 + }, + { + "epoch": 0.53, + "grad_norm": 1.3785310688366426, + "learning_rate": 4.811691776396088e-06, + "loss": 0.3278, + "step": 18157 + }, + { + "epoch": 0.53, + "grad_norm": 1.7699431304403175, + "learning_rate": 4.811222402811344e-06, + "loss": 0.3267, + "step": 18158 + }, + { + "epoch": 0.53, + "grad_norm": 1.6614226882052299, + "learning_rate": 4.810753030892565e-06, + "loss": 0.2955, + "step": 18159 + }, + { + "epoch": 0.53, + "grad_norm": 1.3479761827218812, + "learning_rate": 4.810283660643895e-06, + "loss": 0.3217, + "step": 18160 + }, + { + "epoch": 0.53, + "grad_norm": 2.3048792343576987, + "learning_rate": 4.809814292069471e-06, + "loss": 0.3245, + "step": 18161 + }, + { + "epoch": 0.53, + "grad_norm": 1.554299822115572, + "learning_rate": 4.809344925173439e-06, + "loss": 0.3327, + "step": 18162 + }, + { + "epoch": 0.53, + "grad_norm": 1.4132505252778345, + "learning_rate": 4.8088755599599405e-06, + "loss": 0.3102, + "step": 18163 + }, + { + "epoch": 0.53, + "grad_norm": 1.1166593995905798, + "learning_rate": 4.808406196433119e-06, + "loss": 0.2851, + "step": 18164 + }, + { + "epoch": 0.53, + "grad_norm": 1.3060017894934384, + "learning_rate": 4.807936834597111e-06, + "loss": 0.3131, + "step": 18165 + }, + { + "epoch": 0.53, + "grad_norm": 1.4307696036058695, + "learning_rate": 4.807467474456066e-06, + "loss": 0.3183, + "step": 18166 + }, + { + "epoch": 0.53, + "grad_norm": 1.4422792076971351, + "learning_rate": 4.806998116014121e-06, + "loss": 0.3168, + "step": 18167 + }, + { + "epoch": 0.53, + "grad_norm": 1.6745087285041977, + "learning_rate": 4.80652875927542e-06, + "loss": 0.3299, + "step": 18168 + }, + { + "epoch": 0.53, + "grad_norm": 1.236545851436725, + "learning_rate": 4.806059404244104e-06, + "loss": 0.2987, + "step": 18169 + }, + { + "epoch": 0.53, + "grad_norm": 1.2815466566991849, + "learning_rate": 4.805590050924317e-06, + "loss": 0.298, + "step": 18170 + }, + { + "epoch": 0.53, + "grad_norm": 1.448563352684121, + "learning_rate": 4.8051206993202e-06, + "loss": 0.3137, + "step": 18171 + }, + { + "epoch": 0.53, + "grad_norm": 1.198659849988846, + "learning_rate": 4.8046513494358945e-06, + "loss": 0.3098, + "step": 18172 + }, + { + "epoch": 0.53, + "grad_norm": 1.5972034953919862, + "learning_rate": 4.804182001275542e-06, + "loss": 0.3339, + "step": 18173 + }, + { + "epoch": 0.53, + "grad_norm": 1.383674327619657, + "learning_rate": 4.803712654843286e-06, + "loss": 0.304, + "step": 18174 + }, + { + "epoch": 0.53, + "grad_norm": 1.4792374413561538, + "learning_rate": 4.80324331014327e-06, + "loss": 0.3184, + "step": 18175 + }, + { + "epoch": 0.53, + "grad_norm": 1.36873309850582, + "learning_rate": 4.802773967179633e-06, + "loss": 0.2987, + "step": 18176 + }, + { + "epoch": 0.53, + "grad_norm": 1.6202199723875828, + "learning_rate": 4.802304625956516e-06, + "loss": 0.3097, + "step": 18177 + }, + { + "epoch": 0.53, + "grad_norm": 1.3806128354679168, + "learning_rate": 4.801835286478065e-06, + "loss": 0.3566, + "step": 18178 + }, + { + "epoch": 0.53, + "grad_norm": 1.437559148476708, + "learning_rate": 4.80136594874842e-06, + "loss": 0.3329, + "step": 18179 + }, + { + "epoch": 0.53, + "grad_norm": 1.334566161835566, + "learning_rate": 4.800896612771722e-06, + "loss": 0.3086, + "step": 18180 + }, + { + "epoch": 0.53, + "grad_norm": 1.2386811664154476, + "learning_rate": 4.800427278552112e-06, + "loss": 0.3033, + "step": 18181 + }, + { + "epoch": 0.53, + "grad_norm": 1.2017324601029824, + "learning_rate": 4.799957946093735e-06, + "loss": 0.3156, + "step": 18182 + }, + { + "epoch": 0.53, + "grad_norm": 1.3937639734468374, + "learning_rate": 4.799488615400732e-06, + "loss": 0.2885, + "step": 18183 + }, + { + "epoch": 0.53, + "grad_norm": 1.3849464364539892, + "learning_rate": 4.799019286477244e-06, + "loss": 0.3335, + "step": 18184 + }, + { + "epoch": 0.53, + "grad_norm": 1.4576831445708671, + "learning_rate": 4.798549959327412e-06, + "loss": 0.3068, + "step": 18185 + }, + { + "epoch": 0.53, + "grad_norm": 1.4514434330510093, + "learning_rate": 4.798080633955379e-06, + "loss": 0.3142, + "step": 18186 + }, + { + "epoch": 0.53, + "grad_norm": 2.268582996470401, + "learning_rate": 4.7976113103652865e-06, + "loss": 0.2828, + "step": 18187 + }, + { + "epoch": 0.53, + "grad_norm": 1.4074669208097679, + "learning_rate": 4.797141988561278e-06, + "loss": 0.3039, + "step": 18188 + }, + { + "epoch": 0.53, + "grad_norm": 1.390766985950368, + "learning_rate": 4.796672668547492e-06, + "loss": 0.3038, + "step": 18189 + }, + { + "epoch": 0.53, + "grad_norm": 1.2983715802192883, + "learning_rate": 4.796203350328072e-06, + "loss": 0.3246, + "step": 18190 + }, + { + "epoch": 0.53, + "grad_norm": 1.3721606493279925, + "learning_rate": 4.795734033907161e-06, + "loss": 0.2966, + "step": 18191 + }, + { + "epoch": 0.53, + "grad_norm": 1.1768415622473614, + "learning_rate": 4.7952647192889e-06, + "loss": 0.3027, + "step": 18192 + }, + { + "epoch": 0.53, + "grad_norm": 1.3785506768654607, + "learning_rate": 4.794795406477429e-06, + "loss": 0.3129, + "step": 18193 + }, + { + "epoch": 0.53, + "grad_norm": 1.3628498068080164, + "learning_rate": 4.794326095476891e-06, + "loss": 0.3131, + "step": 18194 + }, + { + "epoch": 0.53, + "grad_norm": 2.286764660061782, + "learning_rate": 4.793856786291427e-06, + "loss": 0.3319, + "step": 18195 + }, + { + "epoch": 0.53, + "grad_norm": 1.4592458492063005, + "learning_rate": 4.793387478925179e-06, + "loss": 0.2823, + "step": 18196 + }, + { + "epoch": 0.53, + "grad_norm": 1.5670630779455579, + "learning_rate": 4.792918173382289e-06, + "loss": 0.3249, + "step": 18197 + }, + { + "epoch": 0.53, + "grad_norm": 1.4786179866812774, + "learning_rate": 4.792448869666899e-06, + "loss": 0.327, + "step": 18198 + }, + { + "epoch": 0.53, + "grad_norm": 1.3012552632680023, + "learning_rate": 4.791979567783149e-06, + "loss": 0.3121, + "step": 18199 + }, + { + "epoch": 0.53, + "grad_norm": 1.2673688281770321, + "learning_rate": 4.791510267735182e-06, + "loss": 0.2966, + "step": 18200 + }, + { + "epoch": 0.53, + "grad_norm": 4.855508718290733, + "learning_rate": 4.791040969527139e-06, + "loss": 0.3, + "step": 18201 + }, + { + "epoch": 0.53, + "grad_norm": 1.5054312982548248, + "learning_rate": 4.790571673163163e-06, + "loss": 0.3149, + "step": 18202 + }, + { + "epoch": 0.53, + "grad_norm": 2.105078646537009, + "learning_rate": 4.790102378647393e-06, + "loss": 0.3333, + "step": 18203 + }, + { + "epoch": 0.53, + "grad_norm": 1.4572299218966436, + "learning_rate": 4.789633085983972e-06, + "loss": 0.3156, + "step": 18204 + }, + { + "epoch": 0.53, + "grad_norm": 1.2895554069025013, + "learning_rate": 4.789163795177042e-06, + "loss": 0.29, + "step": 18205 + }, + { + "epoch": 0.53, + "grad_norm": 1.253045648562298, + "learning_rate": 4.788694506230744e-06, + "loss": 0.3386, + "step": 18206 + }, + { + "epoch": 0.53, + "grad_norm": 1.4391006035624, + "learning_rate": 4.788225219149219e-06, + "loss": 0.3118, + "step": 18207 + }, + { + "epoch": 0.53, + "grad_norm": 1.4872042341615404, + "learning_rate": 4.787755933936608e-06, + "loss": 0.3418, + "step": 18208 + }, + { + "epoch": 0.53, + "grad_norm": 1.447097002598091, + "learning_rate": 4.787286650597053e-06, + "loss": 0.299, + "step": 18209 + }, + { + "epoch": 0.53, + "grad_norm": 1.4926940972103189, + "learning_rate": 4.786817369134696e-06, + "loss": 0.3398, + "step": 18210 + }, + { + "epoch": 0.53, + "grad_norm": 1.2606042623344824, + "learning_rate": 4.78634808955368e-06, + "loss": 0.2989, + "step": 18211 + }, + { + "epoch": 0.53, + "grad_norm": 1.2935203408173146, + "learning_rate": 4.785878811858142e-06, + "loss": 0.2912, + "step": 18212 + }, + { + "epoch": 0.53, + "grad_norm": 1.2838074595858364, + "learning_rate": 4.785409536052227e-06, + "loss": 0.3108, + "step": 18213 + }, + { + "epoch": 0.53, + "grad_norm": 1.2559210535124825, + "learning_rate": 4.784940262140075e-06, + "loss": 0.3067, + "step": 18214 + }, + { + "epoch": 0.53, + "grad_norm": 1.2592230671383704, + "learning_rate": 4.784470990125828e-06, + "loss": 0.3146, + "step": 18215 + }, + { + "epoch": 0.53, + "grad_norm": 1.6071662624354666, + "learning_rate": 4.784001720013626e-06, + "loss": 0.3009, + "step": 18216 + }, + { + "epoch": 0.53, + "grad_norm": 1.2446620482748942, + "learning_rate": 4.783532451807612e-06, + "loss": 0.2872, + "step": 18217 + }, + { + "epoch": 0.53, + "grad_norm": 1.2163116446548459, + "learning_rate": 4.783063185511926e-06, + "loss": 0.2913, + "step": 18218 + }, + { + "epoch": 0.53, + "grad_norm": 1.4512304809040926, + "learning_rate": 4.782593921130711e-06, + "loss": 0.3042, + "step": 18219 + }, + { + "epoch": 0.53, + "grad_norm": 1.258052636787595, + "learning_rate": 4.782124658668107e-06, + "loss": 0.3177, + "step": 18220 + }, + { + "epoch": 0.53, + "grad_norm": 3.094229097133016, + "learning_rate": 4.781655398128256e-06, + "loss": 0.347, + "step": 18221 + }, + { + "epoch": 0.53, + "grad_norm": 2.377603982529871, + "learning_rate": 4.781186139515296e-06, + "loss": 0.3053, + "step": 18222 + }, + { + "epoch": 0.53, + "grad_norm": 1.2492599233610713, + "learning_rate": 4.780716882833372e-06, + "loss": 0.2879, + "step": 18223 + }, + { + "epoch": 0.53, + "grad_norm": 1.305133649030448, + "learning_rate": 4.780247628086623e-06, + "loss": 0.3162, + "step": 18224 + }, + { + "epoch": 0.53, + "grad_norm": 1.352248711298107, + "learning_rate": 4.779778375279193e-06, + "loss": 0.3047, + "step": 18225 + }, + { + "epoch": 0.53, + "grad_norm": 1.3787674923585411, + "learning_rate": 4.7793091244152196e-06, + "loss": 0.3156, + "step": 18226 + }, + { + "epoch": 0.53, + "grad_norm": 1.2592797108225713, + "learning_rate": 4.7788398754988454e-06, + "loss": 0.2879, + "step": 18227 + }, + { + "epoch": 0.53, + "grad_norm": 1.261597630810583, + "learning_rate": 4.778370628534212e-06, + "loss": 0.2928, + "step": 18228 + }, + { + "epoch": 0.53, + "grad_norm": 2.181802957055031, + "learning_rate": 4.77790138352546e-06, + "loss": 0.2903, + "step": 18229 + }, + { + "epoch": 0.53, + "grad_norm": 1.5630806672892477, + "learning_rate": 4.777432140476733e-06, + "loss": 0.3356, + "step": 18230 + }, + { + "epoch": 0.53, + "grad_norm": 1.3073691171257835, + "learning_rate": 4.776962899392168e-06, + "loss": 0.3057, + "step": 18231 + }, + { + "epoch": 0.53, + "grad_norm": 1.2949610434076892, + "learning_rate": 4.7764936602759085e-06, + "loss": 0.3383, + "step": 18232 + }, + { + "epoch": 0.53, + "grad_norm": 1.258246519593384, + "learning_rate": 4.776024423132094e-06, + "loss": 0.308, + "step": 18233 + }, + { + "epoch": 0.53, + "grad_norm": 1.3183530357285236, + "learning_rate": 4.775555187964868e-06, + "loss": 0.319, + "step": 18234 + }, + { + "epoch": 0.53, + "grad_norm": 1.266218248283472, + "learning_rate": 4.775085954778368e-06, + "loss": 0.307, + "step": 18235 + }, + { + "epoch": 0.53, + "grad_norm": 1.2515023157238425, + "learning_rate": 4.774616723576739e-06, + "loss": 0.2897, + "step": 18236 + }, + { + "epoch": 0.53, + "grad_norm": 1.2826030859552826, + "learning_rate": 4.774147494364118e-06, + "loss": 0.2797, + "step": 18237 + }, + { + "epoch": 0.53, + "grad_norm": 1.5527385378456482, + "learning_rate": 4.773678267144649e-06, + "loss": 0.301, + "step": 18238 + }, + { + "epoch": 0.53, + "grad_norm": 1.2631561598703107, + "learning_rate": 4.773209041922472e-06, + "loss": 0.2909, + "step": 18239 + }, + { + "epoch": 0.53, + "grad_norm": 1.2594640160329063, + "learning_rate": 4.7727398187017264e-06, + "loss": 0.3036, + "step": 18240 + }, + { + "epoch": 0.53, + "grad_norm": 1.3365653772884494, + "learning_rate": 4.772270597486555e-06, + "loss": 0.303, + "step": 18241 + }, + { + "epoch": 0.53, + "grad_norm": 0.9095761711172372, + "learning_rate": 4.771801378281099e-06, + "loss": 0.5682, + "step": 18242 + }, + { + "epoch": 0.53, + "grad_norm": 1.2267105997911365, + "learning_rate": 4.771332161089498e-06, + "loss": 0.2992, + "step": 18243 + }, + { + "epoch": 0.53, + "grad_norm": 1.5527731960974709, + "learning_rate": 4.7708629459158926e-06, + "loss": 0.2923, + "step": 18244 + }, + { + "epoch": 0.53, + "grad_norm": 1.3609452846501848, + "learning_rate": 4.770393732764424e-06, + "loss": 0.3139, + "step": 18245 + }, + { + "epoch": 0.53, + "grad_norm": 1.5244552409117742, + "learning_rate": 4.769924521639234e-06, + "loss": 0.3234, + "step": 18246 + }, + { + "epoch": 0.53, + "grad_norm": 1.491340236718619, + "learning_rate": 4.769455312544464e-06, + "loss": 0.3091, + "step": 18247 + }, + { + "epoch": 0.53, + "grad_norm": 1.3147755760412474, + "learning_rate": 4.7689861054842524e-06, + "loss": 0.3311, + "step": 18248 + }, + { + "epoch": 0.53, + "grad_norm": 1.2004070748888294, + "learning_rate": 4.768516900462741e-06, + "loss": 0.2956, + "step": 18249 + }, + { + "epoch": 0.53, + "grad_norm": 1.787301099427936, + "learning_rate": 4.76804769748407e-06, + "loss": 0.3115, + "step": 18250 + }, + { + "epoch": 0.53, + "grad_norm": 1.42020370334539, + "learning_rate": 4.7675784965523805e-06, + "loss": 0.2909, + "step": 18251 + }, + { + "epoch": 0.53, + "grad_norm": 1.5230455503270777, + "learning_rate": 4.767109297671814e-06, + "loss": 0.3116, + "step": 18252 + }, + { + "epoch": 0.53, + "grad_norm": 1.24118722223752, + "learning_rate": 4.76664010084651e-06, + "loss": 0.3038, + "step": 18253 + }, + { + "epoch": 0.53, + "grad_norm": 1.588698780709691, + "learning_rate": 4.76617090608061e-06, + "loss": 0.2868, + "step": 18254 + }, + { + "epoch": 0.53, + "grad_norm": 1.6182157306672413, + "learning_rate": 4.765701713378253e-06, + "loss": 0.3053, + "step": 18255 + }, + { + "epoch": 0.53, + "grad_norm": 1.3074978961229944, + "learning_rate": 4.765232522743583e-06, + "loss": 0.3017, + "step": 18256 + }, + { + "epoch": 0.53, + "grad_norm": 1.365226557410668, + "learning_rate": 4.764763334180738e-06, + "loss": 0.3179, + "step": 18257 + }, + { + "epoch": 0.53, + "grad_norm": 1.2697490201609454, + "learning_rate": 4.764294147693858e-06, + "loss": 0.3015, + "step": 18258 + }, + { + "epoch": 0.53, + "grad_norm": 2.826464233213424, + "learning_rate": 4.763824963287086e-06, + "loss": 0.3463, + "step": 18259 + }, + { + "epoch": 0.53, + "grad_norm": 1.2607375826170506, + "learning_rate": 4.7633557809645606e-06, + "loss": 0.3236, + "step": 18260 + }, + { + "epoch": 0.53, + "grad_norm": 1.2853090411871204, + "learning_rate": 4.762886600730424e-06, + "loss": 0.3071, + "step": 18261 + }, + { + "epoch": 0.53, + "grad_norm": 1.3552153085935672, + "learning_rate": 4.762417422588815e-06, + "loss": 0.3292, + "step": 18262 + }, + { + "epoch": 0.53, + "grad_norm": 1.2695727399152301, + "learning_rate": 4.761948246543875e-06, + "loss": 0.3076, + "step": 18263 + }, + { + "epoch": 0.53, + "grad_norm": 1.2898114651620454, + "learning_rate": 4.761479072599745e-06, + "loss": 0.2973, + "step": 18264 + }, + { + "epoch": 0.53, + "grad_norm": 1.6423330731449268, + "learning_rate": 4.761009900760563e-06, + "loss": 0.325, + "step": 18265 + }, + { + "epoch": 0.53, + "grad_norm": 1.3933346864221217, + "learning_rate": 4.760540731030474e-06, + "loss": 0.3114, + "step": 18266 + }, + { + "epoch": 0.53, + "grad_norm": 1.3774873583454452, + "learning_rate": 4.760071563413614e-06, + "loss": 0.3247, + "step": 18267 + }, + { + "epoch": 0.53, + "grad_norm": 2.4726311099888227, + "learning_rate": 4.759602397914125e-06, + "loss": 0.2985, + "step": 18268 + }, + { + "epoch": 0.53, + "grad_norm": 1.436260538863685, + "learning_rate": 4.759133234536147e-06, + "loss": 0.318, + "step": 18269 + }, + { + "epoch": 0.53, + "grad_norm": 1.2175649955092562, + "learning_rate": 4.758664073283823e-06, + "loss": 0.3021, + "step": 18270 + }, + { + "epoch": 0.53, + "grad_norm": 1.072492373191575, + "learning_rate": 4.75819491416129e-06, + "loss": 0.5919, + "step": 18271 + }, + { + "epoch": 0.53, + "grad_norm": 1.2413166250040168, + "learning_rate": 4.757725757172688e-06, + "loss": 0.3142, + "step": 18272 + }, + { + "epoch": 0.53, + "grad_norm": 1.4835355377917951, + "learning_rate": 4.7572566023221614e-06, + "loss": 0.2978, + "step": 18273 + }, + { + "epoch": 0.53, + "grad_norm": 1.272560856730732, + "learning_rate": 4.756787449613848e-06, + "loss": 0.3063, + "step": 18274 + }, + { + "epoch": 0.53, + "grad_norm": 1.4964388752250795, + "learning_rate": 4.756318299051886e-06, + "loss": 0.3186, + "step": 18275 + }, + { + "epoch": 0.53, + "grad_norm": 1.6681883975146574, + "learning_rate": 4.7558491506404205e-06, + "loss": 0.3085, + "step": 18276 + }, + { + "epoch": 0.53, + "grad_norm": 1.306965613163735, + "learning_rate": 4.7553800043835875e-06, + "loss": 0.3032, + "step": 18277 + }, + { + "epoch": 0.53, + "grad_norm": 1.3216059935040638, + "learning_rate": 4.754910860285528e-06, + "loss": 0.293, + "step": 18278 + }, + { + "epoch": 0.53, + "grad_norm": 1.279780957581823, + "learning_rate": 4.754441718350383e-06, + "loss": 0.3219, + "step": 18279 + }, + { + "epoch": 0.53, + "grad_norm": 1.6417894455368198, + "learning_rate": 4.753972578582294e-06, + "loss": 0.3279, + "step": 18280 + }, + { + "epoch": 0.53, + "grad_norm": 1.4576507895844293, + "learning_rate": 4.753503440985398e-06, + "loss": 0.3065, + "step": 18281 + }, + { + "epoch": 0.53, + "grad_norm": 1.3962789520442696, + "learning_rate": 4.753034305563837e-06, + "loss": 0.3268, + "step": 18282 + }, + { + "epoch": 0.53, + "grad_norm": 2.2382014505692474, + "learning_rate": 4.752565172321752e-06, + "loss": 0.3252, + "step": 18283 + }, + { + "epoch": 0.53, + "grad_norm": 1.3017776595505512, + "learning_rate": 4.752096041263281e-06, + "loss": 0.302, + "step": 18284 + }, + { + "epoch": 0.53, + "grad_norm": 1.310853662194078, + "learning_rate": 4.7516269123925665e-06, + "loss": 0.2964, + "step": 18285 + }, + { + "epoch": 0.53, + "grad_norm": 1.319754467270563, + "learning_rate": 4.7511577857137455e-06, + "loss": 0.334, + "step": 18286 + }, + { + "epoch": 0.53, + "grad_norm": 1.3573301450344912, + "learning_rate": 4.750688661230961e-06, + "loss": 0.3098, + "step": 18287 + }, + { + "epoch": 0.53, + "grad_norm": 2.094594600927756, + "learning_rate": 4.750219538948351e-06, + "loss": 0.2949, + "step": 18288 + }, + { + "epoch": 0.53, + "grad_norm": 1.240770796462992, + "learning_rate": 4.749750418870058e-06, + "loss": 0.3033, + "step": 18289 + }, + { + "epoch": 0.53, + "grad_norm": 1.5150347294957822, + "learning_rate": 4.74928130100022e-06, + "loss": 0.2914, + "step": 18290 + }, + { + "epoch": 0.53, + "grad_norm": 1.5701464364192532, + "learning_rate": 4.748812185342975e-06, + "loss": 0.2861, + "step": 18291 + }, + { + "epoch": 0.53, + "grad_norm": 1.3256646819668325, + "learning_rate": 4.748343071902467e-06, + "loss": 0.2829, + "step": 18292 + }, + { + "epoch": 0.53, + "grad_norm": 0.888822877878934, + "learning_rate": 4.747873960682835e-06, + "loss": 0.6071, + "step": 18293 + }, + { + "epoch": 0.53, + "grad_norm": 1.5640146819776894, + "learning_rate": 4.747404851688216e-06, + "loss": 0.3345, + "step": 18294 + }, + { + "epoch": 0.53, + "grad_norm": 1.6457401125308833, + "learning_rate": 4.746935744922753e-06, + "loss": 0.331, + "step": 18295 + }, + { + "epoch": 0.53, + "grad_norm": 1.2895440235089644, + "learning_rate": 4.746466640390585e-06, + "loss": 0.3221, + "step": 18296 + }, + { + "epoch": 0.53, + "grad_norm": 1.3168190565796132, + "learning_rate": 4.745997538095853e-06, + "loss": 0.3055, + "step": 18297 + }, + { + "epoch": 0.53, + "grad_norm": 1.3193786161072878, + "learning_rate": 4.745528438042694e-06, + "loss": 0.3057, + "step": 18298 + }, + { + "epoch": 0.53, + "grad_norm": 1.2328004149952059, + "learning_rate": 4.745059340235249e-06, + "loss": 0.3089, + "step": 18299 + }, + { + "epoch": 0.53, + "grad_norm": 2.0409370182752467, + "learning_rate": 4.744590244677659e-06, + "loss": 0.2917, + "step": 18300 + }, + { + "epoch": 0.53, + "grad_norm": 1.2113079918664627, + "learning_rate": 4.744121151374062e-06, + "loss": 0.2855, + "step": 18301 + }, + { + "epoch": 0.53, + "grad_norm": 1.299607979475765, + "learning_rate": 4.7436520603286e-06, + "loss": 0.3251, + "step": 18302 + }, + { + "epoch": 0.53, + "grad_norm": 1.3864066104551327, + "learning_rate": 4.74318297154541e-06, + "loss": 0.2949, + "step": 18303 + }, + { + "epoch": 0.53, + "grad_norm": 1.4724169427710052, + "learning_rate": 4.7427138850286354e-06, + "loss": 0.3194, + "step": 18304 + }, + { + "epoch": 0.53, + "grad_norm": 1.2710642434428363, + "learning_rate": 4.742244800782412e-06, + "loss": 0.3301, + "step": 18305 + }, + { + "epoch": 0.53, + "grad_norm": 1.3893793855583445, + "learning_rate": 4.741775718810882e-06, + "loss": 0.291, + "step": 18306 + }, + { + "epoch": 0.53, + "grad_norm": 1.3802678424843395, + "learning_rate": 4.741306639118182e-06, + "loss": 0.3163, + "step": 18307 + }, + { + "epoch": 0.53, + "grad_norm": 1.4909926206176607, + "learning_rate": 4.7408375617084564e-06, + "loss": 0.2926, + "step": 18308 + }, + { + "epoch": 0.53, + "grad_norm": 1.7443327778695468, + "learning_rate": 4.74036848658584e-06, + "loss": 0.2865, + "step": 18309 + }, + { + "epoch": 0.53, + "grad_norm": 1.2370833914086536, + "learning_rate": 4.7398994137544756e-06, + "loss": 0.2984, + "step": 18310 + }, + { + "epoch": 0.53, + "grad_norm": 1.2421588870639346, + "learning_rate": 4.739430343218501e-06, + "loss": 0.2794, + "step": 18311 + }, + { + "epoch": 0.53, + "grad_norm": 1.2936670626354967, + "learning_rate": 4.738961274982058e-06, + "loss": 0.3204, + "step": 18312 + }, + { + "epoch": 0.53, + "grad_norm": 3.274949510629876, + "learning_rate": 4.738492209049283e-06, + "loss": 0.3053, + "step": 18313 + }, + { + "epoch": 0.53, + "grad_norm": 1.4752119014671656, + "learning_rate": 4.738023145424318e-06, + "loss": 0.3072, + "step": 18314 + }, + { + "epoch": 0.53, + "grad_norm": 1.4877286923081503, + "learning_rate": 4.737554084111302e-06, + "loss": 0.3138, + "step": 18315 + }, + { + "epoch": 0.53, + "grad_norm": 1.4871706651740126, + "learning_rate": 4.737085025114374e-06, + "loss": 0.3231, + "step": 18316 + }, + { + "epoch": 0.53, + "grad_norm": 1.7269361498208693, + "learning_rate": 4.736615968437673e-06, + "loss": 0.2745, + "step": 18317 + }, + { + "epoch": 0.53, + "grad_norm": 1.6818781728908196, + "learning_rate": 4.736146914085339e-06, + "loss": 0.3078, + "step": 18318 + }, + { + "epoch": 0.53, + "grad_norm": 3.325706240315257, + "learning_rate": 4.735677862061512e-06, + "loss": 0.3092, + "step": 18319 + }, + { + "epoch": 0.53, + "grad_norm": 1.370183737795175, + "learning_rate": 4.73520881237033e-06, + "loss": 0.326, + "step": 18320 + }, + { + "epoch": 0.53, + "grad_norm": 1.4804313485088716, + "learning_rate": 4.734739765015934e-06, + "loss": 0.3121, + "step": 18321 + }, + { + "epoch": 0.53, + "grad_norm": 1.8027013654755095, + "learning_rate": 4.7342707200024626e-06, + "loss": 0.3006, + "step": 18322 + }, + { + "epoch": 0.53, + "grad_norm": 1.4996535804864073, + "learning_rate": 4.733801677334054e-06, + "loss": 0.3273, + "step": 18323 + }, + { + "epoch": 0.53, + "grad_norm": 1.29148718451932, + "learning_rate": 4.733332637014849e-06, + "loss": 0.3133, + "step": 18324 + }, + { + "epoch": 0.53, + "grad_norm": 1.417573631954129, + "learning_rate": 4.732863599048987e-06, + "loss": 0.2814, + "step": 18325 + }, + { + "epoch": 0.53, + "grad_norm": 1.3657971961816142, + "learning_rate": 4.732394563440607e-06, + "loss": 0.3079, + "step": 18326 + }, + { + "epoch": 0.53, + "grad_norm": 1.3975635095019947, + "learning_rate": 4.731925530193846e-06, + "loss": 0.3146, + "step": 18327 + }, + { + "epoch": 0.53, + "grad_norm": 1.3021782628336922, + "learning_rate": 4.731456499312847e-06, + "loss": 0.2917, + "step": 18328 + }, + { + "epoch": 0.53, + "grad_norm": 1.3913300364121033, + "learning_rate": 4.730987470801747e-06, + "loss": 0.2905, + "step": 18329 + }, + { + "epoch": 0.53, + "grad_norm": 1.4045592392415343, + "learning_rate": 4.730518444664685e-06, + "loss": 0.3214, + "step": 18330 + }, + { + "epoch": 0.53, + "grad_norm": 1.4073447825520344, + "learning_rate": 4.730049420905801e-06, + "loss": 0.3354, + "step": 18331 + }, + { + "epoch": 0.53, + "grad_norm": 1.4335055570058726, + "learning_rate": 4.729580399529233e-06, + "loss": 0.324, + "step": 18332 + }, + { + "epoch": 0.53, + "grad_norm": 2.25690265322608, + "learning_rate": 4.729111380539124e-06, + "loss": 0.3088, + "step": 18333 + }, + { + "epoch": 0.53, + "grad_norm": 1.2614252561290658, + "learning_rate": 4.728642363939608e-06, + "loss": 0.3084, + "step": 18334 + }, + { + "epoch": 0.53, + "grad_norm": 1.4095809132412822, + "learning_rate": 4.728173349734827e-06, + "loss": 0.3008, + "step": 18335 + }, + { + "epoch": 0.53, + "grad_norm": 1.771029202663463, + "learning_rate": 4.727704337928918e-06, + "loss": 0.2996, + "step": 18336 + }, + { + "epoch": 0.53, + "grad_norm": 1.2541903187714563, + "learning_rate": 4.7272353285260215e-06, + "loss": 0.3019, + "step": 18337 + }, + { + "epoch": 0.53, + "grad_norm": 1.2933525847048473, + "learning_rate": 4.726766321530276e-06, + "loss": 0.3193, + "step": 18338 + }, + { + "epoch": 0.53, + "grad_norm": 2.0345862475069727, + "learning_rate": 4.7262973169458215e-06, + "loss": 0.3102, + "step": 18339 + }, + { + "epoch": 0.53, + "grad_norm": 1.258413736704106, + "learning_rate": 4.725828314776796e-06, + "loss": 0.3268, + "step": 18340 + }, + { + "epoch": 0.53, + "grad_norm": 1.3346218809005943, + "learning_rate": 4.7253593150273384e-06, + "loss": 0.3423, + "step": 18341 + }, + { + "epoch": 0.53, + "grad_norm": 1.9235944795825803, + "learning_rate": 4.724890317701587e-06, + "loss": 0.3047, + "step": 18342 + }, + { + "epoch": 0.53, + "grad_norm": 1.4120628602588183, + "learning_rate": 4.724421322803682e-06, + "loss": 0.3054, + "step": 18343 + }, + { + "epoch": 0.53, + "grad_norm": 1.5036887784968849, + "learning_rate": 4.723952330337763e-06, + "loss": 0.293, + "step": 18344 + }, + { + "epoch": 0.53, + "grad_norm": 1.53679275104093, + "learning_rate": 4.7234833403079675e-06, + "loss": 0.3338, + "step": 18345 + }, + { + "epoch": 0.53, + "grad_norm": 1.4142709452219628, + "learning_rate": 4.723014352718434e-06, + "loss": 0.2871, + "step": 18346 + }, + { + "epoch": 0.53, + "grad_norm": 1.4156539401499268, + "learning_rate": 4.7225453675733016e-06, + "loss": 0.3056, + "step": 18347 + }, + { + "epoch": 0.53, + "grad_norm": 1.2534100033286217, + "learning_rate": 4.722076384876711e-06, + "loss": 0.2858, + "step": 18348 + }, + { + "epoch": 0.53, + "grad_norm": 1.343180850566279, + "learning_rate": 4.721607404632798e-06, + "loss": 0.3061, + "step": 18349 + }, + { + "epoch": 0.53, + "grad_norm": 1.5394241813462046, + "learning_rate": 4.721138426845703e-06, + "loss": 0.2946, + "step": 18350 + }, + { + "epoch": 0.53, + "grad_norm": 1.4296046107572753, + "learning_rate": 4.720669451519564e-06, + "loss": 0.3035, + "step": 18351 + }, + { + "epoch": 0.53, + "grad_norm": 1.2337832945055645, + "learning_rate": 4.720200478658521e-06, + "loss": 0.2868, + "step": 18352 + }, + { + "epoch": 0.53, + "grad_norm": 2.0157371182295005, + "learning_rate": 4.7197315082667115e-06, + "loss": 0.3066, + "step": 18353 + }, + { + "epoch": 0.53, + "grad_norm": 1.425101070600088, + "learning_rate": 4.719262540348275e-06, + "loss": 0.3174, + "step": 18354 + }, + { + "epoch": 0.53, + "grad_norm": 1.4402947716850105, + "learning_rate": 4.718793574907349e-06, + "loss": 0.3175, + "step": 18355 + }, + { + "epoch": 0.53, + "grad_norm": 1.3101953659315297, + "learning_rate": 4.718324611948073e-06, + "loss": 0.2885, + "step": 18356 + }, + { + "epoch": 0.53, + "grad_norm": 1.3801399993490142, + "learning_rate": 4.717855651474587e-06, + "loss": 0.3204, + "step": 18357 + }, + { + "epoch": 0.53, + "grad_norm": 2.2220461122829134, + "learning_rate": 4.717386693491026e-06, + "loss": 0.3092, + "step": 18358 + }, + { + "epoch": 0.53, + "grad_norm": 1.2811242539981507, + "learning_rate": 4.716917738001531e-06, + "loss": 0.2975, + "step": 18359 + }, + { + "epoch": 0.53, + "grad_norm": 1.3947599207740482, + "learning_rate": 4.7164487850102395e-06, + "loss": 0.2997, + "step": 18360 + }, + { + "epoch": 0.53, + "grad_norm": 1.5433304487429655, + "learning_rate": 4.7159798345212935e-06, + "loss": 0.2961, + "step": 18361 + }, + { + "epoch": 0.53, + "grad_norm": 1.3411749983873849, + "learning_rate": 4.715510886538826e-06, + "loss": 0.3048, + "step": 18362 + }, + { + "epoch": 0.53, + "grad_norm": 1.2763387382483202, + "learning_rate": 4.715041941066979e-06, + "loss": 0.3286, + "step": 18363 + }, + { + "epoch": 0.53, + "grad_norm": 1.2711137443564777, + "learning_rate": 4.71457299810989e-06, + "loss": 0.3136, + "step": 18364 + }, + { + "epoch": 0.53, + "grad_norm": 1.329019091013765, + "learning_rate": 4.714104057671696e-06, + "loss": 0.3255, + "step": 18365 + }, + { + "epoch": 0.53, + "grad_norm": 1.393978928675329, + "learning_rate": 4.713635119756539e-06, + "loss": 0.3019, + "step": 18366 + }, + { + "epoch": 0.53, + "grad_norm": 1.325549450388779, + "learning_rate": 4.713166184368554e-06, + "loss": 0.2928, + "step": 18367 + }, + { + "epoch": 0.53, + "grad_norm": 1.3535792353108154, + "learning_rate": 4.712697251511881e-06, + "loss": 0.3042, + "step": 18368 + }, + { + "epoch": 0.53, + "grad_norm": 0.9830853097149057, + "learning_rate": 4.712228321190657e-06, + "loss": 0.591, + "step": 18369 + }, + { + "epoch": 0.53, + "grad_norm": 0.9943295944947357, + "learning_rate": 4.711759393409022e-06, + "loss": 0.6105, + "step": 18370 + }, + { + "epoch": 0.53, + "grad_norm": 1.5113984663429445, + "learning_rate": 4.711290468171115e-06, + "loss": 0.347, + "step": 18371 + }, + { + "epoch": 0.53, + "grad_norm": 1.2460391206111547, + "learning_rate": 4.710821545481071e-06, + "loss": 0.3136, + "step": 18372 + }, + { + "epoch": 0.53, + "grad_norm": 1.3399853728226259, + "learning_rate": 4.71035262534303e-06, + "loss": 0.33, + "step": 18373 + }, + { + "epoch": 0.53, + "grad_norm": 1.774368436518293, + "learning_rate": 4.70988370776113e-06, + "loss": 0.2995, + "step": 18374 + }, + { + "epoch": 0.53, + "grad_norm": 1.263275194322236, + "learning_rate": 4.709414792739511e-06, + "loss": 0.2859, + "step": 18375 + }, + { + "epoch": 0.53, + "grad_norm": 2.3924349565964764, + "learning_rate": 4.708945880282309e-06, + "loss": 0.3059, + "step": 18376 + }, + { + "epoch": 0.53, + "grad_norm": 1.3703107190385047, + "learning_rate": 4.708476970393662e-06, + "loss": 0.3254, + "step": 18377 + }, + { + "epoch": 0.53, + "grad_norm": 1.462454919391074, + "learning_rate": 4.70800806307771e-06, + "loss": 0.3086, + "step": 18378 + }, + { + "epoch": 0.53, + "grad_norm": 1.4583628000128193, + "learning_rate": 4.707539158338589e-06, + "loss": 0.3034, + "step": 18379 + }, + { + "epoch": 0.53, + "grad_norm": 1.673595828010167, + "learning_rate": 4.707070256180441e-06, + "loss": 0.2965, + "step": 18380 + }, + { + "epoch": 0.53, + "grad_norm": 1.395361889464565, + "learning_rate": 4.706601356607398e-06, + "loss": 0.3305, + "step": 18381 + }, + { + "epoch": 0.53, + "grad_norm": 1.438388331900116, + "learning_rate": 4.7061324596236024e-06, + "loss": 0.2965, + "step": 18382 + }, + { + "epoch": 0.53, + "grad_norm": 2.6152021362226425, + "learning_rate": 4.705663565233191e-06, + "loss": 0.3051, + "step": 18383 + }, + { + "epoch": 0.53, + "grad_norm": 1.1445005453941166, + "learning_rate": 4.705194673440303e-06, + "loss": 0.2828, + "step": 18384 + }, + { + "epoch": 0.53, + "grad_norm": 1.3366518280007162, + "learning_rate": 4.704725784249074e-06, + "loss": 0.3328, + "step": 18385 + }, + { + "epoch": 0.53, + "grad_norm": 1.3782977580859423, + "learning_rate": 4.704256897663643e-06, + "loss": 0.2875, + "step": 18386 + }, + { + "epoch": 0.53, + "grad_norm": 1.2636539437041376, + "learning_rate": 4.703788013688148e-06, + "loss": 0.2965, + "step": 18387 + }, + { + "epoch": 0.53, + "grad_norm": 1.9946374821627073, + "learning_rate": 4.703319132326729e-06, + "loss": 0.2907, + "step": 18388 + }, + { + "epoch": 0.53, + "grad_norm": 1.5024383154138425, + "learning_rate": 4.702850253583522e-06, + "loss": 0.324, + "step": 18389 + }, + { + "epoch": 0.53, + "grad_norm": 1.2136955470690087, + "learning_rate": 4.702381377462664e-06, + "loss": 0.2934, + "step": 18390 + }, + { + "epoch": 0.53, + "grad_norm": 1.3974962725552103, + "learning_rate": 4.701912503968293e-06, + "loss": 0.3287, + "step": 18391 + }, + { + "epoch": 0.53, + "grad_norm": 1.6747004888610046, + "learning_rate": 4.701443633104547e-06, + "loss": 0.3012, + "step": 18392 + }, + { + "epoch": 0.53, + "grad_norm": 1.3390448728175408, + "learning_rate": 4.700974764875565e-06, + "loss": 0.3179, + "step": 18393 + }, + { + "epoch": 0.53, + "grad_norm": 1.321924466132189, + "learning_rate": 4.700505899285485e-06, + "loss": 0.3006, + "step": 18394 + }, + { + "epoch": 0.53, + "grad_norm": 1.2873951963219246, + "learning_rate": 4.700037036338443e-06, + "loss": 0.2967, + "step": 18395 + }, + { + "epoch": 0.53, + "grad_norm": 1.29973163380616, + "learning_rate": 4.699568176038576e-06, + "loss": 0.311, + "step": 18396 + }, + { + "epoch": 0.53, + "grad_norm": 1.3208158854584542, + "learning_rate": 4.699099318390025e-06, + "loss": 0.3036, + "step": 18397 + }, + { + "epoch": 0.53, + "grad_norm": 1.5911160941101077, + "learning_rate": 4.698630463396925e-06, + "loss": 0.323, + "step": 18398 + }, + { + "epoch": 0.53, + "grad_norm": 1.2636492268779012, + "learning_rate": 4.698161611063416e-06, + "loss": 0.3148, + "step": 18399 + }, + { + "epoch": 0.53, + "grad_norm": 1.2033442678331088, + "learning_rate": 4.697692761393632e-06, + "loss": 0.2596, + "step": 18400 + }, + { + "epoch": 0.53, + "grad_norm": 1.0970373408255396, + "learning_rate": 4.697223914391713e-06, + "loss": 0.6241, + "step": 18401 + }, + { + "epoch": 0.53, + "grad_norm": 1.2209449655186484, + "learning_rate": 4.696755070061797e-06, + "loss": 0.2792, + "step": 18402 + }, + { + "epoch": 0.53, + "grad_norm": 1.326126876841069, + "learning_rate": 4.6962862284080215e-06, + "loss": 0.3256, + "step": 18403 + }, + { + "epoch": 0.53, + "grad_norm": 1.3194011252922808, + "learning_rate": 4.695817389434522e-06, + "loss": 0.2992, + "step": 18404 + }, + { + "epoch": 0.53, + "grad_norm": 1.3203176786409736, + "learning_rate": 4.695348553145438e-06, + "loss": 0.2916, + "step": 18405 + }, + { + "epoch": 0.53, + "grad_norm": 1.3381231840938221, + "learning_rate": 4.694879719544906e-06, + "loss": 0.3068, + "step": 18406 + }, + { + "epoch": 0.53, + "grad_norm": 1.3563633956398833, + "learning_rate": 4.694410888637065e-06, + "loss": 0.2967, + "step": 18407 + }, + { + "epoch": 0.53, + "grad_norm": 1.5539411437855695, + "learning_rate": 4.693942060426051e-06, + "loss": 0.333, + "step": 18408 + }, + { + "epoch": 0.53, + "grad_norm": 1.449790885184184, + "learning_rate": 4.693473234916001e-06, + "loss": 0.3141, + "step": 18409 + }, + { + "epoch": 0.53, + "grad_norm": 1.70417019486894, + "learning_rate": 4.693004412111054e-06, + "loss": 0.2856, + "step": 18410 + }, + { + "epoch": 0.53, + "grad_norm": 1.2984201523847327, + "learning_rate": 4.692535592015346e-06, + "loss": 0.3014, + "step": 18411 + }, + { + "epoch": 0.53, + "grad_norm": 1.6450000231149262, + "learning_rate": 4.692066774633014e-06, + "loss": 0.3461, + "step": 18412 + }, + { + "epoch": 0.53, + "grad_norm": 1.3106736521661284, + "learning_rate": 4.691597959968197e-06, + "loss": 0.2998, + "step": 18413 + }, + { + "epoch": 0.53, + "grad_norm": 1.3680055840885763, + "learning_rate": 4.691129148025031e-06, + "loss": 0.3097, + "step": 18414 + }, + { + "epoch": 0.53, + "grad_norm": 1.231123699016, + "learning_rate": 4.690660338807654e-06, + "loss": 0.2927, + "step": 18415 + }, + { + "epoch": 0.53, + "grad_norm": 1.3509699802003268, + "learning_rate": 4.690191532320204e-06, + "loss": 0.3316, + "step": 18416 + }, + { + "epoch": 0.53, + "grad_norm": 1.2395997667625673, + "learning_rate": 4.689722728566817e-06, + "loss": 0.3065, + "step": 18417 + }, + { + "epoch": 0.53, + "grad_norm": 1.234383843797906, + "learning_rate": 4.689253927551631e-06, + "loss": 0.3202, + "step": 18418 + }, + { + "epoch": 0.53, + "grad_norm": 1.337493436469126, + "learning_rate": 4.688785129278779e-06, + "loss": 0.2954, + "step": 18419 + }, + { + "epoch": 0.53, + "grad_norm": 1.5535769303232199, + "learning_rate": 4.688316333752404e-06, + "loss": 0.3022, + "step": 18420 + }, + { + "epoch": 0.53, + "grad_norm": 1.744095241688848, + "learning_rate": 4.68784754097664e-06, + "loss": 0.2939, + "step": 18421 + }, + { + "epoch": 0.53, + "grad_norm": 1.27149968007694, + "learning_rate": 4.687378750955626e-06, + "loss": 0.284, + "step": 18422 + }, + { + "epoch": 0.53, + "grad_norm": 1.3228448446978176, + "learning_rate": 4.686909963693498e-06, + "loss": 0.3075, + "step": 18423 + }, + { + "epoch": 0.53, + "grad_norm": 1.2513027996049904, + "learning_rate": 4.686441179194392e-06, + "loss": 0.344, + "step": 18424 + }, + { + "epoch": 0.53, + "grad_norm": 1.3700790886289844, + "learning_rate": 4.685972397462446e-06, + "loss": 0.2972, + "step": 18425 + }, + { + "epoch": 0.53, + "grad_norm": 1.2120001281069697, + "learning_rate": 4.685503618501799e-06, + "loss": 0.3023, + "step": 18426 + }, + { + "epoch": 0.53, + "grad_norm": 1.2961701944874047, + "learning_rate": 4.6850348423165845e-06, + "loss": 0.2768, + "step": 18427 + }, + { + "epoch": 0.53, + "grad_norm": 1.3284978085940444, + "learning_rate": 4.684566068910941e-06, + "loss": 0.3134, + "step": 18428 + }, + { + "epoch": 0.53, + "grad_norm": 1.2997321649922962, + "learning_rate": 4.684097298289005e-06, + "loss": 0.3154, + "step": 18429 + }, + { + "epoch": 0.53, + "grad_norm": 1.4259937848662947, + "learning_rate": 4.6836285304549154e-06, + "loss": 0.2956, + "step": 18430 + }, + { + "epoch": 0.53, + "grad_norm": 1.2934526466084546, + "learning_rate": 4.683159765412806e-06, + "loss": 0.3185, + "step": 18431 + }, + { + "epoch": 0.53, + "grad_norm": 1.2483498929736148, + "learning_rate": 4.682691003166815e-06, + "loss": 0.2955, + "step": 18432 + }, + { + "epoch": 0.53, + "grad_norm": 1.2918657857153015, + "learning_rate": 4.68222224372108e-06, + "loss": 0.2901, + "step": 18433 + }, + { + "epoch": 0.53, + "grad_norm": 1.2650082515886962, + "learning_rate": 4.681753487079737e-06, + "loss": 0.2882, + "step": 18434 + }, + { + "epoch": 0.53, + "grad_norm": 1.1767695606495927, + "learning_rate": 4.6812847332469245e-06, + "loss": 0.2902, + "step": 18435 + }, + { + "epoch": 0.53, + "grad_norm": 1.2535150435457265, + "learning_rate": 4.680815982226777e-06, + "loss": 0.3061, + "step": 18436 + }, + { + "epoch": 0.53, + "grad_norm": 1.3140391644790013, + "learning_rate": 4.680347234023431e-06, + "loss": 0.3038, + "step": 18437 + }, + { + "epoch": 0.53, + "grad_norm": 1.3005571436592667, + "learning_rate": 4.679878488641025e-06, + "loss": 0.3194, + "step": 18438 + }, + { + "epoch": 0.53, + "grad_norm": 1.2799642665854443, + "learning_rate": 4.679409746083695e-06, + "loss": 0.3046, + "step": 18439 + }, + { + "epoch": 0.53, + "grad_norm": 1.6349140461526404, + "learning_rate": 4.678941006355578e-06, + "loss": 0.3206, + "step": 18440 + }, + { + "epoch": 0.53, + "grad_norm": 1.4461923153806313, + "learning_rate": 4.678472269460809e-06, + "loss": 0.2938, + "step": 18441 + }, + { + "epoch": 0.53, + "grad_norm": 0.918749807517141, + "learning_rate": 4.678003535403526e-06, + "loss": 0.5933, + "step": 18442 + }, + { + "epoch": 0.53, + "grad_norm": 1.255625939720131, + "learning_rate": 4.677534804187866e-06, + "loss": 0.2938, + "step": 18443 + }, + { + "epoch": 0.53, + "grad_norm": 1.2934043970024407, + "learning_rate": 4.677066075817964e-06, + "loss": 0.3122, + "step": 18444 + }, + { + "epoch": 0.53, + "grad_norm": 1.336216004706549, + "learning_rate": 4.676597350297961e-06, + "loss": 0.3113, + "step": 18445 + }, + { + "epoch": 0.54, + "grad_norm": 1.2765441714946457, + "learning_rate": 4.676128627631986e-06, + "loss": 0.3135, + "step": 18446 + }, + { + "epoch": 0.54, + "grad_norm": 1.2250557705416514, + "learning_rate": 4.67565990782418e-06, + "loss": 0.2876, + "step": 18447 + }, + { + "epoch": 0.54, + "grad_norm": 1.3518279375460318, + "learning_rate": 4.67519119087868e-06, + "loss": 0.3004, + "step": 18448 + }, + { + "epoch": 0.54, + "grad_norm": 1.3083728446224925, + "learning_rate": 4.674722476799621e-06, + "loss": 0.3036, + "step": 18449 + }, + { + "epoch": 0.54, + "grad_norm": 1.2936886543211297, + "learning_rate": 4.674253765591139e-06, + "loss": 0.3024, + "step": 18450 + }, + { + "epoch": 0.54, + "grad_norm": 1.2449240289100958, + "learning_rate": 4.673785057257372e-06, + "loss": 0.2908, + "step": 18451 + }, + { + "epoch": 0.54, + "grad_norm": 2.7463052409157713, + "learning_rate": 4.673316351802454e-06, + "loss": 0.3052, + "step": 18452 + }, + { + "epoch": 0.54, + "grad_norm": 1.3862859368720932, + "learning_rate": 4.672847649230524e-06, + "loss": 0.3135, + "step": 18453 + }, + { + "epoch": 0.54, + "grad_norm": 1.4162285431786683, + "learning_rate": 4.672378949545718e-06, + "loss": 0.3243, + "step": 18454 + }, + { + "epoch": 0.54, + "grad_norm": 1.2446630157980723, + "learning_rate": 4.67191025275217e-06, + "loss": 0.2854, + "step": 18455 + }, + { + "epoch": 0.54, + "grad_norm": 1.4404651265940116, + "learning_rate": 4.671441558854018e-06, + "loss": 0.2908, + "step": 18456 + }, + { + "epoch": 0.54, + "grad_norm": 1.5139585807318052, + "learning_rate": 4.670972867855398e-06, + "loss": 0.3241, + "step": 18457 + }, + { + "epoch": 0.54, + "grad_norm": 1.4098355004722192, + "learning_rate": 4.670504179760446e-06, + "loss": 0.3148, + "step": 18458 + }, + { + "epoch": 0.54, + "grad_norm": 1.3617028848608663, + "learning_rate": 4.6700354945732986e-06, + "loss": 0.2893, + "step": 18459 + }, + { + "epoch": 0.54, + "grad_norm": 1.4725119849054868, + "learning_rate": 4.669566812298092e-06, + "loss": 0.3036, + "step": 18460 + }, + { + "epoch": 0.54, + "grad_norm": 1.4010758768995726, + "learning_rate": 4.669098132938961e-06, + "loss": 0.3117, + "step": 18461 + }, + { + "epoch": 0.54, + "grad_norm": 1.3586781104767065, + "learning_rate": 4.668629456500043e-06, + "loss": 0.3094, + "step": 18462 + }, + { + "epoch": 0.54, + "grad_norm": 1.361856112081412, + "learning_rate": 4.668160782985474e-06, + "loss": 0.3018, + "step": 18463 + }, + { + "epoch": 0.54, + "grad_norm": 1.3610009012797555, + "learning_rate": 4.667692112399389e-06, + "loss": 0.3226, + "step": 18464 + }, + { + "epoch": 0.54, + "grad_norm": 1.5409106581237344, + "learning_rate": 4.667223444745925e-06, + "loss": 0.3317, + "step": 18465 + }, + { + "epoch": 0.54, + "grad_norm": 1.342336009898781, + "learning_rate": 4.666754780029219e-06, + "loss": 0.3109, + "step": 18466 + }, + { + "epoch": 0.54, + "grad_norm": 1.3420652358543508, + "learning_rate": 4.666286118253404e-06, + "loss": 0.3034, + "step": 18467 + }, + { + "epoch": 0.54, + "grad_norm": 1.3414385270177887, + "learning_rate": 4.665817459422619e-06, + "loss": 0.3145, + "step": 18468 + }, + { + "epoch": 0.54, + "grad_norm": 1.2870564549780652, + "learning_rate": 4.6653488035409975e-06, + "loss": 0.3025, + "step": 18469 + }, + { + "epoch": 0.54, + "grad_norm": 1.2399927586858415, + "learning_rate": 4.664880150612677e-06, + "loss": 0.2826, + "step": 18470 + }, + { + "epoch": 0.54, + "grad_norm": 1.421609405859295, + "learning_rate": 4.664411500641794e-06, + "loss": 0.3074, + "step": 18471 + }, + { + "epoch": 0.54, + "grad_norm": 1.4227961200604375, + "learning_rate": 4.663942853632482e-06, + "loss": 0.3512, + "step": 18472 + }, + { + "epoch": 0.54, + "grad_norm": 1.2125738953105607, + "learning_rate": 4.6634742095888804e-06, + "loss": 0.3165, + "step": 18473 + }, + { + "epoch": 0.54, + "grad_norm": 1.3860651110156024, + "learning_rate": 4.66300556851512e-06, + "loss": 0.3429, + "step": 18474 + }, + { + "epoch": 0.54, + "grad_norm": 1.2597375161322246, + "learning_rate": 4.66253693041534e-06, + "loss": 0.3063, + "step": 18475 + }, + { + "epoch": 0.54, + "grad_norm": 0.9314316019510585, + "learning_rate": 4.6620682952936745e-06, + "loss": 0.5956, + "step": 18476 + }, + { + "epoch": 0.54, + "grad_norm": 1.585921551351092, + "learning_rate": 4.661599663154261e-06, + "loss": 0.312, + "step": 18477 + }, + { + "epoch": 0.54, + "grad_norm": 1.2905541997607062, + "learning_rate": 4.661131034001234e-06, + "loss": 0.3092, + "step": 18478 + }, + { + "epoch": 0.54, + "grad_norm": 1.3309635683999232, + "learning_rate": 4.6606624078387284e-06, + "loss": 0.3018, + "step": 18479 + }, + { + "epoch": 0.54, + "grad_norm": 1.3561360989185156, + "learning_rate": 4.660193784670882e-06, + "loss": 0.313, + "step": 18480 + }, + { + "epoch": 0.54, + "grad_norm": 1.3183993879822495, + "learning_rate": 4.65972516450183e-06, + "loss": 0.303, + "step": 18481 + }, + { + "epoch": 0.54, + "grad_norm": 1.4122077337298675, + "learning_rate": 4.659256547335705e-06, + "loss": 0.2938, + "step": 18482 + }, + { + "epoch": 0.54, + "grad_norm": 1.4442301896352834, + "learning_rate": 4.6587879331766465e-06, + "loss": 0.2918, + "step": 18483 + }, + { + "epoch": 0.54, + "grad_norm": 1.2319277991758926, + "learning_rate": 4.658319322028787e-06, + "loss": 0.3048, + "step": 18484 + }, + { + "epoch": 0.54, + "grad_norm": 1.3104526749835355, + "learning_rate": 4.657850713896264e-06, + "loss": 0.3101, + "step": 18485 + }, + { + "epoch": 0.54, + "grad_norm": 1.4581766534241039, + "learning_rate": 4.657382108783212e-06, + "loss": 0.3079, + "step": 18486 + }, + { + "epoch": 0.54, + "grad_norm": 1.9674452586643947, + "learning_rate": 4.656913506693767e-06, + "loss": 0.3196, + "step": 18487 + }, + { + "epoch": 0.54, + "grad_norm": 1.3450223397851657, + "learning_rate": 4.656444907632064e-06, + "loss": 0.3054, + "step": 18488 + }, + { + "epoch": 0.54, + "grad_norm": 1.329044357509134, + "learning_rate": 4.655976311602238e-06, + "loss": 0.3147, + "step": 18489 + }, + { + "epoch": 0.54, + "grad_norm": 1.5969142182911253, + "learning_rate": 4.6555077186084254e-06, + "loss": 0.3079, + "step": 18490 + }, + { + "epoch": 0.54, + "grad_norm": 1.1790007549846508, + "learning_rate": 4.6550391286547615e-06, + "loss": 0.3054, + "step": 18491 + }, + { + "epoch": 0.54, + "grad_norm": 1.282089241606671, + "learning_rate": 4.65457054174538e-06, + "loss": 0.2999, + "step": 18492 + }, + { + "epoch": 0.54, + "grad_norm": 1.2375363933501846, + "learning_rate": 4.654101957884418e-06, + "loss": 0.2895, + "step": 18493 + }, + { + "epoch": 0.54, + "grad_norm": 0.9237699575620633, + "learning_rate": 4.65363337707601e-06, + "loss": 0.5833, + "step": 18494 + }, + { + "epoch": 0.54, + "grad_norm": 1.3047828786898297, + "learning_rate": 4.653164799324291e-06, + "loss": 0.3073, + "step": 18495 + }, + { + "epoch": 0.54, + "grad_norm": 1.2518875583998679, + "learning_rate": 4.652696224633396e-06, + "loss": 0.3238, + "step": 18496 + }, + { + "epoch": 0.54, + "grad_norm": 1.241856269136134, + "learning_rate": 4.652227653007462e-06, + "loss": 0.3178, + "step": 18497 + }, + { + "epoch": 0.54, + "grad_norm": 2.262481107216419, + "learning_rate": 4.651759084450623e-06, + "loss": 0.2873, + "step": 18498 + }, + { + "epoch": 0.54, + "grad_norm": 1.3268891837771406, + "learning_rate": 4.651290518967012e-06, + "loss": 0.3002, + "step": 18499 + }, + { + "epoch": 0.54, + "grad_norm": 1.2712128350198446, + "learning_rate": 4.650821956560767e-06, + "loss": 0.3115, + "step": 18500 + }, + { + "epoch": 0.54, + "grad_norm": 1.3573764716927108, + "learning_rate": 4.650353397236025e-06, + "loss": 0.317, + "step": 18501 + }, + { + "epoch": 0.54, + "grad_norm": 1.1481239687800837, + "learning_rate": 4.649884840996915e-06, + "loss": 0.2917, + "step": 18502 + }, + { + "epoch": 0.54, + "grad_norm": 1.5660563052033707, + "learning_rate": 4.649416287847574e-06, + "loss": 0.3038, + "step": 18503 + }, + { + "epoch": 0.54, + "grad_norm": 1.3846250749089999, + "learning_rate": 4.648947737792141e-06, + "loss": 0.3318, + "step": 18504 + }, + { + "epoch": 0.54, + "grad_norm": 1.3640992453856584, + "learning_rate": 4.648479190834747e-06, + "loss": 0.3027, + "step": 18505 + }, + { + "epoch": 0.54, + "grad_norm": 1.2267065181233219, + "learning_rate": 4.648010646979528e-06, + "loss": 0.3066, + "step": 18506 + }, + { + "epoch": 0.54, + "grad_norm": 1.3370455381467132, + "learning_rate": 4.647542106230618e-06, + "loss": 0.3075, + "step": 18507 + }, + { + "epoch": 0.54, + "grad_norm": 1.191044997019952, + "learning_rate": 4.647073568592154e-06, + "loss": 0.3037, + "step": 18508 + }, + { + "epoch": 0.54, + "grad_norm": 1.3233575698477786, + "learning_rate": 4.646605034068269e-06, + "loss": 0.3018, + "step": 18509 + }, + { + "epoch": 0.54, + "grad_norm": 1.6166500990351458, + "learning_rate": 4.646136502663099e-06, + "loss": 0.3157, + "step": 18510 + }, + { + "epoch": 0.54, + "grad_norm": 1.2191128370287925, + "learning_rate": 4.645667974380778e-06, + "loss": 0.3084, + "step": 18511 + }, + { + "epoch": 0.54, + "grad_norm": 1.2574645261116018, + "learning_rate": 4.645199449225441e-06, + "loss": 0.3176, + "step": 18512 + }, + { + "epoch": 0.54, + "grad_norm": 1.4574322225201788, + "learning_rate": 4.6447309272012235e-06, + "loss": 0.3332, + "step": 18513 + }, + { + "epoch": 0.54, + "grad_norm": 1.4449785620243834, + "learning_rate": 4.644262408312259e-06, + "loss": 0.3086, + "step": 18514 + }, + { + "epoch": 0.54, + "grad_norm": 1.5065652562866494, + "learning_rate": 4.643793892562682e-06, + "loss": 0.3051, + "step": 18515 + }, + { + "epoch": 0.54, + "grad_norm": 1.2798528272965508, + "learning_rate": 4.643325379956628e-06, + "loss": 0.3142, + "step": 18516 + }, + { + "epoch": 0.54, + "grad_norm": 1.4114359205154066, + "learning_rate": 4.642856870498233e-06, + "loss": 0.2987, + "step": 18517 + }, + { + "epoch": 0.54, + "grad_norm": 1.6528114584905003, + "learning_rate": 4.642388364191629e-06, + "loss": 0.3059, + "step": 18518 + }, + { + "epoch": 0.54, + "grad_norm": 2.3971395989710995, + "learning_rate": 4.641919861040951e-06, + "loss": 0.3093, + "step": 18519 + }, + { + "epoch": 0.54, + "grad_norm": 1.4293318326066957, + "learning_rate": 4.6414513610503345e-06, + "loss": 0.3083, + "step": 18520 + }, + { + "epoch": 0.54, + "grad_norm": 1.1839289053553383, + "learning_rate": 4.6409828642239155e-06, + "loss": 0.2902, + "step": 18521 + }, + { + "epoch": 0.54, + "grad_norm": 1.3622752840251053, + "learning_rate": 4.640514370565825e-06, + "loss": 0.3017, + "step": 18522 + }, + { + "epoch": 0.54, + "grad_norm": 1.6567985671161056, + "learning_rate": 4.640045880080199e-06, + "loss": 0.3013, + "step": 18523 + }, + { + "epoch": 0.54, + "grad_norm": 1.3784439725381616, + "learning_rate": 4.639577392771172e-06, + "loss": 0.2867, + "step": 18524 + }, + { + "epoch": 0.54, + "grad_norm": 1.9254488781759342, + "learning_rate": 4.639108908642879e-06, + "loss": 0.3238, + "step": 18525 + }, + { + "epoch": 0.54, + "grad_norm": 1.6725011186345404, + "learning_rate": 4.638640427699455e-06, + "loss": 0.3052, + "step": 18526 + }, + { + "epoch": 0.54, + "grad_norm": 1.575356063704051, + "learning_rate": 4.638171949945032e-06, + "loss": 0.3215, + "step": 18527 + }, + { + "epoch": 0.54, + "grad_norm": 1.320710819305454, + "learning_rate": 4.637703475383745e-06, + "loss": 0.2819, + "step": 18528 + }, + { + "epoch": 0.54, + "grad_norm": 1.244791057171178, + "learning_rate": 4.637235004019732e-06, + "loss": 0.3138, + "step": 18529 + }, + { + "epoch": 0.54, + "grad_norm": 1.2043382942065193, + "learning_rate": 4.636766535857122e-06, + "loss": 0.3049, + "step": 18530 + }, + { + "epoch": 0.54, + "grad_norm": 1.3951695756288014, + "learning_rate": 4.63629807090005e-06, + "loss": 0.2869, + "step": 18531 + }, + { + "epoch": 0.54, + "grad_norm": 1.2045643662712555, + "learning_rate": 4.635829609152654e-06, + "loss": 0.316, + "step": 18532 + }, + { + "epoch": 0.54, + "grad_norm": 1.3133215911813747, + "learning_rate": 4.635361150619064e-06, + "loss": 0.337, + "step": 18533 + }, + { + "epoch": 0.54, + "grad_norm": 1.3509381649952044, + "learning_rate": 4.634892695303415e-06, + "loss": 0.3156, + "step": 18534 + }, + { + "epoch": 0.54, + "grad_norm": 1.205074122768992, + "learning_rate": 4.634424243209844e-06, + "loss": 0.294, + "step": 18535 + }, + { + "epoch": 0.54, + "grad_norm": 1.4134531369485832, + "learning_rate": 4.633955794342483e-06, + "loss": 0.3087, + "step": 18536 + }, + { + "epoch": 0.54, + "grad_norm": 1.2368135544899128, + "learning_rate": 4.633487348705466e-06, + "loss": 0.2903, + "step": 18537 + }, + { + "epoch": 0.54, + "grad_norm": 1.2266603434004286, + "learning_rate": 4.633018906302925e-06, + "loss": 0.309, + "step": 18538 + }, + { + "epoch": 0.54, + "grad_norm": 1.2908025528761713, + "learning_rate": 4.6325504671389984e-06, + "loss": 0.2912, + "step": 18539 + }, + { + "epoch": 0.54, + "grad_norm": 1.2631019968924408, + "learning_rate": 4.6320820312178186e-06, + "loss": 0.3007, + "step": 18540 + }, + { + "epoch": 0.54, + "grad_norm": 1.3940100915755753, + "learning_rate": 4.6316135985435176e-06, + "loss": 0.3145, + "step": 18541 + }, + { + "epoch": 0.54, + "grad_norm": 1.244761742495712, + "learning_rate": 4.631145169120231e-06, + "loss": 0.2991, + "step": 18542 + }, + { + "epoch": 0.54, + "grad_norm": 1.3274784139058575, + "learning_rate": 4.630676742952092e-06, + "loss": 0.3084, + "step": 18543 + }, + { + "epoch": 0.54, + "grad_norm": 1.3678263001566882, + "learning_rate": 4.630208320043235e-06, + "loss": 0.3012, + "step": 18544 + }, + { + "epoch": 0.54, + "grad_norm": 1.3775740906303284, + "learning_rate": 4.629739900397795e-06, + "loss": 0.2899, + "step": 18545 + }, + { + "epoch": 0.54, + "grad_norm": 1.3469280894524744, + "learning_rate": 4.629271484019903e-06, + "loss": 0.3087, + "step": 18546 + }, + { + "epoch": 0.54, + "grad_norm": 0.9035299575451594, + "learning_rate": 4.628803070913694e-06, + "loss": 0.5977, + "step": 18547 + }, + { + "epoch": 0.54, + "grad_norm": 1.348150042191662, + "learning_rate": 4.628334661083303e-06, + "loss": 0.3016, + "step": 18548 + }, + { + "epoch": 0.54, + "grad_norm": 1.2212085348905326, + "learning_rate": 4.627866254532863e-06, + "loss": 0.3082, + "step": 18549 + }, + { + "epoch": 0.54, + "grad_norm": 1.9140309686186503, + "learning_rate": 4.627397851266507e-06, + "loss": 0.2969, + "step": 18550 + }, + { + "epoch": 0.54, + "grad_norm": 1.281090646755649, + "learning_rate": 4.626929451288368e-06, + "loss": 0.3024, + "step": 18551 + }, + { + "epoch": 0.54, + "grad_norm": 1.32124433289458, + "learning_rate": 4.626461054602582e-06, + "loss": 0.2976, + "step": 18552 + }, + { + "epoch": 0.54, + "grad_norm": 1.4630372997459897, + "learning_rate": 4.625992661213282e-06, + "loss": 0.3345, + "step": 18553 + }, + { + "epoch": 0.54, + "grad_norm": 1.4973833895720168, + "learning_rate": 4.6255242711246e-06, + "loss": 0.2995, + "step": 18554 + }, + { + "epoch": 0.54, + "grad_norm": 1.595245629654473, + "learning_rate": 4.6250558843406695e-06, + "loss": 0.3066, + "step": 18555 + }, + { + "epoch": 0.54, + "grad_norm": 1.1779172003532743, + "learning_rate": 4.624587500865626e-06, + "loss": 0.2874, + "step": 18556 + }, + { + "epoch": 0.54, + "grad_norm": 1.4939036197706508, + "learning_rate": 4.624119120703603e-06, + "loss": 0.2927, + "step": 18557 + }, + { + "epoch": 0.54, + "grad_norm": 1.3374898488680997, + "learning_rate": 4.623650743858732e-06, + "loss": 0.2901, + "step": 18558 + }, + { + "epoch": 0.54, + "grad_norm": 1.6438808852515283, + "learning_rate": 4.623182370335148e-06, + "loss": 0.3129, + "step": 18559 + }, + { + "epoch": 0.54, + "grad_norm": 1.4353225514827266, + "learning_rate": 4.622714000136983e-06, + "loss": 0.271, + "step": 18560 + }, + { + "epoch": 0.54, + "grad_norm": 1.3689582633060247, + "learning_rate": 4.622245633268371e-06, + "loss": 0.3019, + "step": 18561 + }, + { + "epoch": 0.54, + "grad_norm": 1.5338027761115258, + "learning_rate": 4.621777269733445e-06, + "loss": 0.2869, + "step": 18562 + }, + { + "epoch": 0.54, + "grad_norm": 1.3413501670062669, + "learning_rate": 4.62130890953634e-06, + "loss": 0.2808, + "step": 18563 + }, + { + "epoch": 0.54, + "grad_norm": 2.367931735902063, + "learning_rate": 4.620840552681187e-06, + "loss": 0.3058, + "step": 18564 + }, + { + "epoch": 0.54, + "grad_norm": 1.2475706072271613, + "learning_rate": 4.620372199172121e-06, + "loss": 0.2904, + "step": 18565 + }, + { + "epoch": 0.54, + "grad_norm": 1.4179033765124518, + "learning_rate": 4.619903849013273e-06, + "loss": 0.316, + "step": 18566 + }, + { + "epoch": 0.54, + "grad_norm": 1.270111685061208, + "learning_rate": 4.61943550220878e-06, + "loss": 0.2981, + "step": 18567 + }, + { + "epoch": 0.54, + "grad_norm": 1.3009619909296701, + "learning_rate": 4.6189671587627725e-06, + "loss": 0.3145, + "step": 18568 + }, + { + "epoch": 0.54, + "grad_norm": 1.2387997105885458, + "learning_rate": 4.618498818679383e-06, + "loss": 0.2956, + "step": 18569 + }, + { + "epoch": 0.54, + "grad_norm": 1.4906141703562, + "learning_rate": 4.618030481962746e-06, + "loss": 0.3235, + "step": 18570 + }, + { + "epoch": 0.54, + "grad_norm": 1.5535919013689297, + "learning_rate": 4.617562148616994e-06, + "loss": 0.3002, + "step": 18571 + }, + { + "epoch": 0.54, + "grad_norm": 1.27917574784289, + "learning_rate": 4.617093818646261e-06, + "loss": 0.3073, + "step": 18572 + }, + { + "epoch": 0.54, + "grad_norm": 1.5570855097968952, + "learning_rate": 4.616625492054679e-06, + "loss": 0.2984, + "step": 18573 + }, + { + "epoch": 0.54, + "grad_norm": 1.444998881860174, + "learning_rate": 4.616157168846381e-06, + "loss": 0.3264, + "step": 18574 + }, + { + "epoch": 0.54, + "grad_norm": 1.2546257532576899, + "learning_rate": 4.6156888490255e-06, + "loss": 0.3064, + "step": 18575 + }, + { + "epoch": 0.54, + "grad_norm": 0.962004411717875, + "learning_rate": 4.61522053259617e-06, + "loss": 0.6081, + "step": 18576 + }, + { + "epoch": 0.54, + "grad_norm": 1.883252825504413, + "learning_rate": 4.614752219562523e-06, + "loss": 0.2928, + "step": 18577 + }, + { + "epoch": 0.54, + "grad_norm": 2.059919173896435, + "learning_rate": 4.614283909928692e-06, + "loss": 0.291, + "step": 18578 + }, + { + "epoch": 0.54, + "grad_norm": 1.428217062353896, + "learning_rate": 4.613815603698809e-06, + "loss": 0.3193, + "step": 18579 + }, + { + "epoch": 0.54, + "grad_norm": 1.4137025266068552, + "learning_rate": 4.613347300877008e-06, + "loss": 0.3028, + "step": 18580 + }, + { + "epoch": 0.54, + "grad_norm": 1.3866734759285404, + "learning_rate": 4.6128790014674214e-06, + "loss": 0.305, + "step": 18581 + }, + { + "epoch": 0.54, + "grad_norm": 1.2732856561402714, + "learning_rate": 4.612410705474182e-06, + "loss": 0.2926, + "step": 18582 + }, + { + "epoch": 0.54, + "grad_norm": 1.2452586354510533, + "learning_rate": 4.611942412901422e-06, + "loss": 0.2842, + "step": 18583 + }, + { + "epoch": 0.54, + "grad_norm": 3.070492713436706, + "learning_rate": 4.611474123753275e-06, + "loss": 0.3083, + "step": 18584 + }, + { + "epoch": 0.54, + "grad_norm": 1.310509432537982, + "learning_rate": 4.6110058380338735e-06, + "loss": 0.3195, + "step": 18585 + }, + { + "epoch": 0.54, + "grad_norm": 1.2714146875401693, + "learning_rate": 4.61053755574735e-06, + "loss": 0.2976, + "step": 18586 + }, + { + "epoch": 0.54, + "grad_norm": 1.4628779791187458, + "learning_rate": 4.610069276897837e-06, + "loss": 0.3051, + "step": 18587 + }, + { + "epoch": 0.54, + "grad_norm": 1.559498642761116, + "learning_rate": 4.609601001489466e-06, + "loss": 0.3364, + "step": 18588 + }, + { + "epoch": 0.54, + "grad_norm": 1.3986915292996926, + "learning_rate": 4.60913272952637e-06, + "loss": 0.3056, + "step": 18589 + }, + { + "epoch": 0.54, + "grad_norm": 1.3414255128206038, + "learning_rate": 4.608664461012683e-06, + "loss": 0.315, + "step": 18590 + }, + { + "epoch": 0.54, + "grad_norm": 1.4696826639741658, + "learning_rate": 4.608196195952537e-06, + "loss": 0.3185, + "step": 18591 + }, + { + "epoch": 0.54, + "grad_norm": 1.270375235106843, + "learning_rate": 4.6077279343500625e-06, + "loss": 0.2865, + "step": 18592 + }, + { + "epoch": 0.54, + "grad_norm": 1.722411022428727, + "learning_rate": 4.607259676209394e-06, + "loss": 0.3025, + "step": 18593 + }, + { + "epoch": 0.54, + "grad_norm": 1.238357924681669, + "learning_rate": 4.606791421534662e-06, + "loss": 0.2877, + "step": 18594 + }, + { + "epoch": 0.54, + "grad_norm": 1.3182661865094747, + "learning_rate": 4.606323170330002e-06, + "loss": 0.2949, + "step": 18595 + }, + { + "epoch": 0.54, + "grad_norm": 1.2364716951470787, + "learning_rate": 4.6058549225995435e-06, + "loss": 0.3077, + "step": 18596 + }, + { + "epoch": 0.54, + "grad_norm": 1.461827010970445, + "learning_rate": 4.605386678347419e-06, + "loss": 0.3053, + "step": 18597 + }, + { + "epoch": 0.54, + "grad_norm": 1.3834645264738288, + "learning_rate": 4.6049184375777625e-06, + "loss": 0.2989, + "step": 18598 + }, + { + "epoch": 0.54, + "grad_norm": 1.2179623952622844, + "learning_rate": 4.604450200294705e-06, + "loss": 0.293, + "step": 18599 + }, + { + "epoch": 0.54, + "grad_norm": 1.4769822683181817, + "learning_rate": 4.603981966502379e-06, + "loss": 0.3021, + "step": 18600 + }, + { + "epoch": 0.54, + "grad_norm": 1.7501147476244374, + "learning_rate": 4.603513736204915e-06, + "loss": 0.2927, + "step": 18601 + }, + { + "epoch": 0.54, + "grad_norm": 1.402174461089122, + "learning_rate": 4.603045509406448e-06, + "loss": 0.3355, + "step": 18602 + }, + { + "epoch": 0.54, + "grad_norm": 1.345808688918519, + "learning_rate": 4.6025772861111085e-06, + "loss": 0.2937, + "step": 18603 + }, + { + "epoch": 0.54, + "grad_norm": 1.4244285697991774, + "learning_rate": 4.6021090663230304e-06, + "loss": 0.3344, + "step": 18604 + }, + { + "epoch": 0.54, + "grad_norm": 2.003072305848626, + "learning_rate": 4.601640850046342e-06, + "loss": 0.2978, + "step": 18605 + }, + { + "epoch": 0.54, + "grad_norm": 1.255316163151888, + "learning_rate": 4.601172637285178e-06, + "loss": 0.2986, + "step": 18606 + }, + { + "epoch": 0.54, + "grad_norm": 1.477146710479774, + "learning_rate": 4.60070442804367e-06, + "loss": 0.2943, + "step": 18607 + }, + { + "epoch": 0.54, + "grad_norm": 1.2609064176888904, + "learning_rate": 4.600236222325951e-06, + "loss": 0.3031, + "step": 18608 + }, + { + "epoch": 0.54, + "grad_norm": 1.9150533300644763, + "learning_rate": 4.59976802013615e-06, + "loss": 0.3263, + "step": 18609 + }, + { + "epoch": 0.54, + "grad_norm": 1.2308936798154901, + "learning_rate": 4.599299821478402e-06, + "loss": 0.2992, + "step": 18610 + }, + { + "epoch": 0.54, + "grad_norm": 1.5706826038953334, + "learning_rate": 4.598831626356836e-06, + "loss": 0.3195, + "step": 18611 + }, + { + "epoch": 0.54, + "grad_norm": 1.5330960167138896, + "learning_rate": 4.598363434775588e-06, + "loss": 0.3354, + "step": 18612 + }, + { + "epoch": 0.54, + "grad_norm": 2.6446729541831133, + "learning_rate": 4.597895246738786e-06, + "loss": 0.3052, + "step": 18613 + }, + { + "epoch": 0.54, + "grad_norm": 1.8519995637722204, + "learning_rate": 4.597427062250564e-06, + "loss": 0.2771, + "step": 18614 + }, + { + "epoch": 0.54, + "grad_norm": 1.310830565560897, + "learning_rate": 4.596958881315051e-06, + "loss": 0.2953, + "step": 18615 + }, + { + "epoch": 0.54, + "grad_norm": 1.3593223390614175, + "learning_rate": 4.5964907039363795e-06, + "loss": 0.291, + "step": 18616 + }, + { + "epoch": 0.54, + "grad_norm": 1.3123821149260926, + "learning_rate": 4.596022530118683e-06, + "loss": 0.3203, + "step": 18617 + }, + { + "epoch": 0.54, + "grad_norm": 1.1733539297429139, + "learning_rate": 4.595554359866093e-06, + "loss": 0.2819, + "step": 18618 + }, + { + "epoch": 0.54, + "grad_norm": 1.3822213483653571, + "learning_rate": 4.595086193182739e-06, + "loss": 0.3066, + "step": 18619 + }, + { + "epoch": 0.54, + "grad_norm": 1.4557034014443317, + "learning_rate": 4.594618030072754e-06, + "loss": 0.2878, + "step": 18620 + }, + { + "epoch": 0.54, + "grad_norm": 1.3464553355343256, + "learning_rate": 4.594149870540269e-06, + "loss": 0.3087, + "step": 18621 + }, + { + "epoch": 0.54, + "grad_norm": 1.362422056502827, + "learning_rate": 4.5936817145894165e-06, + "loss": 0.3028, + "step": 18622 + }, + { + "epoch": 0.54, + "grad_norm": 1.2660783773664037, + "learning_rate": 4.593213562224328e-06, + "loss": 0.2867, + "step": 18623 + }, + { + "epoch": 0.54, + "grad_norm": 1.281626257092474, + "learning_rate": 4.592745413449134e-06, + "loss": 0.3235, + "step": 18624 + }, + { + "epoch": 0.54, + "grad_norm": 1.3016612275439021, + "learning_rate": 4.592277268267965e-06, + "loss": 0.288, + "step": 18625 + }, + { + "epoch": 0.54, + "grad_norm": 1.2404440950478648, + "learning_rate": 4.591809126684955e-06, + "loss": 0.3023, + "step": 18626 + }, + { + "epoch": 0.54, + "grad_norm": 1.3194748150686757, + "learning_rate": 4.591340988704234e-06, + "loss": 0.2813, + "step": 18627 + }, + { + "epoch": 0.54, + "grad_norm": 1.4045940365226224, + "learning_rate": 4.590872854329932e-06, + "loss": 0.3022, + "step": 18628 + }, + { + "epoch": 0.54, + "grad_norm": 1.301389077158994, + "learning_rate": 4.590404723566183e-06, + "loss": 0.3196, + "step": 18629 + }, + { + "epoch": 0.54, + "grad_norm": 1.4337517955067287, + "learning_rate": 4.589936596417116e-06, + "loss": 0.2748, + "step": 18630 + }, + { + "epoch": 0.54, + "grad_norm": 1.3879512242187642, + "learning_rate": 4.5894684728868645e-06, + "loss": 0.2932, + "step": 18631 + }, + { + "epoch": 0.54, + "grad_norm": 1.4580510110241915, + "learning_rate": 4.589000352979556e-06, + "loss": 0.2852, + "step": 18632 + }, + { + "epoch": 0.54, + "grad_norm": 1.09223573125889, + "learning_rate": 4.5885322366993255e-06, + "loss": 0.2804, + "step": 18633 + }, + { + "epoch": 0.54, + "grad_norm": 1.3560463736924153, + "learning_rate": 4.588064124050302e-06, + "loss": 0.3034, + "step": 18634 + }, + { + "epoch": 0.54, + "grad_norm": 2.0259822905062435, + "learning_rate": 4.587596015036618e-06, + "loss": 0.3407, + "step": 18635 + }, + { + "epoch": 0.54, + "grad_norm": 1.9941820368907914, + "learning_rate": 4.587127909662403e-06, + "loss": 0.3053, + "step": 18636 + }, + { + "epoch": 0.54, + "grad_norm": 0.9535235268345298, + "learning_rate": 4.586659807931789e-06, + "loss": 0.5872, + "step": 18637 + }, + { + "epoch": 0.54, + "grad_norm": 1.3580688353457997, + "learning_rate": 4.586191709848906e-06, + "loss": 0.304, + "step": 18638 + }, + { + "epoch": 0.54, + "grad_norm": 1.3696329621236458, + "learning_rate": 4.585723615417886e-06, + "loss": 0.319, + "step": 18639 + }, + { + "epoch": 0.54, + "grad_norm": 1.305351289946847, + "learning_rate": 4.585255524642861e-06, + "loss": 0.2998, + "step": 18640 + }, + { + "epoch": 0.54, + "grad_norm": 2.2340578231966526, + "learning_rate": 4.58478743752796e-06, + "loss": 0.2988, + "step": 18641 + }, + { + "epoch": 0.54, + "grad_norm": 1.7120000655491594, + "learning_rate": 4.5843193540773155e-06, + "loss": 0.2949, + "step": 18642 + }, + { + "epoch": 0.54, + "grad_norm": 1.164157656628605, + "learning_rate": 4.583851274295056e-06, + "loss": 0.2745, + "step": 18643 + }, + { + "epoch": 0.54, + "grad_norm": 1.4139102871566551, + "learning_rate": 4.583383198185313e-06, + "loss": 0.2922, + "step": 18644 + }, + { + "epoch": 0.54, + "grad_norm": 1.7294854466780636, + "learning_rate": 4.582915125752219e-06, + "loss": 0.3098, + "step": 18645 + }, + { + "epoch": 0.54, + "grad_norm": 1.329991896607331, + "learning_rate": 4.582447056999904e-06, + "loss": 0.3036, + "step": 18646 + }, + { + "epoch": 0.54, + "grad_norm": 1.4740055811468193, + "learning_rate": 4.581978991932496e-06, + "loss": 0.3051, + "step": 18647 + }, + { + "epoch": 0.54, + "grad_norm": 1.4272098175684553, + "learning_rate": 4.58151093055413e-06, + "loss": 0.3395, + "step": 18648 + }, + { + "epoch": 0.54, + "grad_norm": 2.6015352651258503, + "learning_rate": 4.581042872868934e-06, + "loss": 0.3016, + "step": 18649 + }, + { + "epoch": 0.54, + "grad_norm": 0.939541823847607, + "learning_rate": 4.58057481888104e-06, + "loss": 0.5963, + "step": 18650 + }, + { + "epoch": 0.54, + "grad_norm": 1.385010219082476, + "learning_rate": 4.580106768594577e-06, + "loss": 0.3013, + "step": 18651 + }, + { + "epoch": 0.54, + "grad_norm": 1.35537529848143, + "learning_rate": 4.579638722013677e-06, + "loss": 0.3101, + "step": 18652 + }, + { + "epoch": 0.54, + "grad_norm": 1.3639004435442594, + "learning_rate": 4.5791706791424694e-06, + "loss": 0.3101, + "step": 18653 + }, + { + "epoch": 0.54, + "grad_norm": 1.4148080741719855, + "learning_rate": 4.5787026399850864e-06, + "loss": 0.3188, + "step": 18654 + }, + { + "epoch": 0.54, + "grad_norm": 1.3496428893368464, + "learning_rate": 4.578234604545656e-06, + "loss": 0.3093, + "step": 18655 + }, + { + "epoch": 0.54, + "grad_norm": 1.5560360150647696, + "learning_rate": 4.577766572828311e-06, + "loss": 0.3099, + "step": 18656 + }, + { + "epoch": 0.54, + "grad_norm": 1.286338557565484, + "learning_rate": 4.577298544837179e-06, + "loss": 0.2933, + "step": 18657 + }, + { + "epoch": 0.54, + "grad_norm": 1.4960367531219447, + "learning_rate": 4.576830520576393e-06, + "loss": 0.3033, + "step": 18658 + }, + { + "epoch": 0.54, + "grad_norm": 1.331247554678048, + "learning_rate": 4.576362500050083e-06, + "loss": 0.3136, + "step": 18659 + }, + { + "epoch": 0.54, + "grad_norm": 1.4380015141005085, + "learning_rate": 4.575894483262378e-06, + "loss": 0.3028, + "step": 18660 + }, + { + "epoch": 0.54, + "grad_norm": 1.2385749171071214, + "learning_rate": 4.575426470217409e-06, + "loss": 0.2989, + "step": 18661 + }, + { + "epoch": 0.54, + "grad_norm": 1.4380992163571114, + "learning_rate": 4.574958460919305e-06, + "loss": 0.3587, + "step": 18662 + }, + { + "epoch": 0.54, + "grad_norm": 1.3035754388065244, + "learning_rate": 4.5744904553722e-06, + "loss": 0.2859, + "step": 18663 + }, + { + "epoch": 0.54, + "grad_norm": 1.33958051919625, + "learning_rate": 4.574022453580219e-06, + "loss": 0.3023, + "step": 18664 + }, + { + "epoch": 0.54, + "grad_norm": 1.2633636472712464, + "learning_rate": 4.573554455547495e-06, + "loss": 0.2962, + "step": 18665 + }, + { + "epoch": 0.54, + "grad_norm": 1.258669692229523, + "learning_rate": 4.573086461278158e-06, + "loss": 0.2867, + "step": 18666 + }, + { + "epoch": 0.54, + "grad_norm": 1.2398884232987992, + "learning_rate": 4.572618470776339e-06, + "loss": 0.3027, + "step": 18667 + }, + { + "epoch": 0.54, + "grad_norm": 1.2786735151314306, + "learning_rate": 4.572150484046165e-06, + "loss": 0.3108, + "step": 18668 + }, + { + "epoch": 0.54, + "grad_norm": 1.4752778641744613, + "learning_rate": 4.571682501091767e-06, + "loss": 0.2863, + "step": 18669 + }, + { + "epoch": 0.54, + "grad_norm": 1.190623760176326, + "learning_rate": 4.57121452191728e-06, + "loss": 0.2715, + "step": 18670 + }, + { + "epoch": 0.54, + "grad_norm": 1.2372593178612168, + "learning_rate": 4.570746546526826e-06, + "loss": 0.2762, + "step": 18671 + }, + { + "epoch": 0.54, + "grad_norm": 1.491075032798748, + "learning_rate": 4.570278574924539e-06, + "loss": 0.322, + "step": 18672 + }, + { + "epoch": 0.54, + "grad_norm": 1.556461092018117, + "learning_rate": 4.569810607114548e-06, + "loss": 0.3158, + "step": 18673 + }, + { + "epoch": 0.54, + "grad_norm": 1.6518041167494588, + "learning_rate": 4.569342643100984e-06, + "loss": 0.3133, + "step": 18674 + }, + { + "epoch": 0.54, + "grad_norm": 1.289109908886277, + "learning_rate": 4.568874682887975e-06, + "loss": 0.2824, + "step": 18675 + }, + { + "epoch": 0.54, + "grad_norm": 1.7558130268397216, + "learning_rate": 4.568406726479651e-06, + "loss": 0.3014, + "step": 18676 + }, + { + "epoch": 0.54, + "grad_norm": 1.3234336565284752, + "learning_rate": 4.567938773880144e-06, + "loss": 0.3208, + "step": 18677 + }, + { + "epoch": 0.54, + "grad_norm": 1.3058527062916856, + "learning_rate": 4.567470825093583e-06, + "loss": 0.3103, + "step": 18678 + }, + { + "epoch": 0.54, + "grad_norm": 1.299113639571696, + "learning_rate": 4.567002880124095e-06, + "loss": 0.3083, + "step": 18679 + }, + { + "epoch": 0.54, + "grad_norm": 1.4478989642599698, + "learning_rate": 4.566534938975812e-06, + "loss": 0.3049, + "step": 18680 + }, + { + "epoch": 0.54, + "grad_norm": 1.268708840948423, + "learning_rate": 4.566067001652862e-06, + "loss": 0.2912, + "step": 18681 + }, + { + "epoch": 0.54, + "grad_norm": 1.3547585813619598, + "learning_rate": 4.565599068159377e-06, + "loss": 0.2924, + "step": 18682 + }, + { + "epoch": 0.54, + "grad_norm": 1.2129245310360708, + "learning_rate": 4.565131138499484e-06, + "loss": 0.2886, + "step": 18683 + }, + { + "epoch": 0.54, + "grad_norm": 1.5187579059691458, + "learning_rate": 4.5646632126773134e-06, + "loss": 0.2858, + "step": 18684 + }, + { + "epoch": 0.54, + "grad_norm": 1.3210449206602177, + "learning_rate": 4.564195290696995e-06, + "loss": 0.3154, + "step": 18685 + }, + { + "epoch": 0.54, + "grad_norm": 1.5652296549006965, + "learning_rate": 4.563727372562659e-06, + "loss": 0.2868, + "step": 18686 + }, + { + "epoch": 0.54, + "grad_norm": 1.4202497997377603, + "learning_rate": 4.563259458278433e-06, + "loss": 0.2869, + "step": 18687 + }, + { + "epoch": 0.54, + "grad_norm": 2.1646964743992156, + "learning_rate": 4.562791547848447e-06, + "loss": 0.3, + "step": 18688 + }, + { + "epoch": 0.54, + "grad_norm": 1.2490730635485843, + "learning_rate": 4.562323641276831e-06, + "loss": 0.3024, + "step": 18689 + }, + { + "epoch": 0.54, + "grad_norm": 1.3515825563818669, + "learning_rate": 4.561855738567714e-06, + "loss": 0.3255, + "step": 18690 + }, + { + "epoch": 0.54, + "grad_norm": 1.4583205367113676, + "learning_rate": 4.561387839725224e-06, + "loss": 0.2857, + "step": 18691 + }, + { + "epoch": 0.54, + "grad_norm": 1.2082384611986394, + "learning_rate": 4.560919944753491e-06, + "loss": 0.2922, + "step": 18692 + }, + { + "epoch": 0.54, + "grad_norm": 1.3254294084335592, + "learning_rate": 4.560452053656645e-06, + "loss": 0.3113, + "step": 18693 + }, + { + "epoch": 0.54, + "grad_norm": 3.3020945707772733, + "learning_rate": 4.559984166438814e-06, + "loss": 0.3102, + "step": 18694 + }, + { + "epoch": 0.54, + "grad_norm": 1.412177630845879, + "learning_rate": 4.559516283104128e-06, + "loss": 0.2991, + "step": 18695 + }, + { + "epoch": 0.54, + "grad_norm": 1.2797480664189804, + "learning_rate": 4.559048403656716e-06, + "loss": 0.2874, + "step": 18696 + }, + { + "epoch": 0.54, + "grad_norm": 1.406052635119005, + "learning_rate": 4.558580528100706e-06, + "loss": 0.31, + "step": 18697 + }, + { + "epoch": 0.54, + "grad_norm": 1.1970421558718902, + "learning_rate": 4.55811265644023e-06, + "loss": 0.2953, + "step": 18698 + }, + { + "epoch": 0.54, + "grad_norm": 1.6710047125927299, + "learning_rate": 4.557644788679413e-06, + "loss": 0.2981, + "step": 18699 + }, + { + "epoch": 0.54, + "grad_norm": 1.3335823357564316, + "learning_rate": 4.557176924822385e-06, + "loss": 0.3084, + "step": 18700 + }, + { + "epoch": 0.54, + "grad_norm": 2.9821312965617386, + "learning_rate": 4.556709064873277e-06, + "loss": 0.2953, + "step": 18701 + }, + { + "epoch": 0.54, + "grad_norm": 1.2878870136990108, + "learning_rate": 4.556241208836215e-06, + "loss": 0.3202, + "step": 18702 + }, + { + "epoch": 0.54, + "grad_norm": 1.3555072580276537, + "learning_rate": 4.5557733567153295e-06, + "loss": 0.2923, + "step": 18703 + }, + { + "epoch": 0.54, + "grad_norm": 1.274437469501668, + "learning_rate": 4.555305508514749e-06, + "loss": 0.2985, + "step": 18704 + }, + { + "epoch": 0.54, + "grad_norm": 1.4536984665975012, + "learning_rate": 4.5548376642386035e-06, + "loss": 0.3205, + "step": 18705 + }, + { + "epoch": 0.54, + "grad_norm": 1.483012771811091, + "learning_rate": 4.55436982389102e-06, + "loss": 0.2935, + "step": 18706 + }, + { + "epoch": 0.54, + "grad_norm": 1.121836061782105, + "learning_rate": 4.553901987476127e-06, + "loss": 0.2675, + "step": 18707 + }, + { + "epoch": 0.54, + "grad_norm": 1.563700173259177, + "learning_rate": 4.553434154998053e-06, + "loss": 0.2919, + "step": 18708 + }, + { + "epoch": 0.54, + "grad_norm": 1.2554987521867451, + "learning_rate": 4.55296632646093e-06, + "loss": 0.3167, + "step": 18709 + }, + { + "epoch": 0.54, + "grad_norm": 1.336767569678062, + "learning_rate": 4.552498501868883e-06, + "loss": 0.2906, + "step": 18710 + }, + { + "epoch": 0.54, + "grad_norm": 1.2937599232523285, + "learning_rate": 4.552030681226041e-06, + "loss": 0.2976, + "step": 18711 + }, + { + "epoch": 0.54, + "grad_norm": 1.3328689186197764, + "learning_rate": 4.551562864536534e-06, + "loss": 0.2996, + "step": 18712 + }, + { + "epoch": 0.54, + "grad_norm": 1.318783699032526, + "learning_rate": 4.5510950518044885e-06, + "loss": 0.3015, + "step": 18713 + }, + { + "epoch": 0.54, + "grad_norm": 1.2487021816162935, + "learning_rate": 4.550627243034036e-06, + "loss": 0.2925, + "step": 18714 + }, + { + "epoch": 0.54, + "grad_norm": 1.5039813793627905, + "learning_rate": 4.5501594382293025e-06, + "loss": 0.305, + "step": 18715 + }, + { + "epoch": 0.54, + "grad_norm": 1.4153997070298407, + "learning_rate": 4.5496916373944165e-06, + "loss": 0.2895, + "step": 18716 + }, + { + "epoch": 0.54, + "grad_norm": 1.401666368899605, + "learning_rate": 4.549223840533507e-06, + "loss": 0.3064, + "step": 18717 + }, + { + "epoch": 0.54, + "grad_norm": 1.254386282348764, + "learning_rate": 4.5487560476507024e-06, + "loss": 0.2996, + "step": 18718 + }, + { + "epoch": 0.54, + "grad_norm": 1.3206991995470665, + "learning_rate": 4.54828825875013e-06, + "loss": 0.3114, + "step": 18719 + }, + { + "epoch": 0.54, + "grad_norm": 1.257051431438859, + "learning_rate": 4.547820473835919e-06, + "loss": 0.2885, + "step": 18720 + }, + { + "epoch": 0.54, + "grad_norm": 1.4054523614639005, + "learning_rate": 4.5473526929121975e-06, + "loss": 0.3019, + "step": 18721 + }, + { + "epoch": 0.54, + "grad_norm": 1.3318296183734015, + "learning_rate": 4.5468849159830944e-06, + "loss": 0.2971, + "step": 18722 + }, + { + "epoch": 0.54, + "grad_norm": 1.5134027876227882, + "learning_rate": 4.546417143052736e-06, + "loss": 0.3092, + "step": 18723 + }, + { + "epoch": 0.54, + "grad_norm": 1.3045305288010363, + "learning_rate": 4.5459493741252505e-06, + "loss": 0.2967, + "step": 18724 + }, + { + "epoch": 0.54, + "grad_norm": 1.5559355731668874, + "learning_rate": 4.545481609204767e-06, + "loss": 0.3061, + "step": 18725 + }, + { + "epoch": 0.54, + "grad_norm": 1.6250505451851216, + "learning_rate": 4.545013848295416e-06, + "loss": 0.2952, + "step": 18726 + }, + { + "epoch": 0.54, + "grad_norm": 1.4418431397684228, + "learning_rate": 4.544546091401321e-06, + "loss": 0.2915, + "step": 18727 + }, + { + "epoch": 0.54, + "grad_norm": 1.7109753757682165, + "learning_rate": 4.5440783385266125e-06, + "loss": 0.2849, + "step": 18728 + }, + { + "epoch": 0.54, + "grad_norm": 1.3856409714632458, + "learning_rate": 4.543610589675417e-06, + "loss": 0.2967, + "step": 18729 + }, + { + "epoch": 0.54, + "grad_norm": 2.909889979929419, + "learning_rate": 4.5431428448518625e-06, + "loss": 0.3071, + "step": 18730 + }, + { + "epoch": 0.54, + "grad_norm": 3.860046658144206, + "learning_rate": 4.542675104060077e-06, + "loss": 0.297, + "step": 18731 + }, + { + "epoch": 0.54, + "grad_norm": 2.044250274259123, + "learning_rate": 4.54220736730419e-06, + "loss": 0.307, + "step": 18732 + }, + { + "epoch": 0.54, + "grad_norm": 1.4954792755035289, + "learning_rate": 4.541739634588328e-06, + "loss": 0.2901, + "step": 18733 + }, + { + "epoch": 0.54, + "grad_norm": 1.2706041072063987, + "learning_rate": 4.5412719059166185e-06, + "loss": 0.2922, + "step": 18734 + }, + { + "epoch": 0.54, + "grad_norm": 1.195782635529111, + "learning_rate": 4.540804181293189e-06, + "loss": 0.2966, + "step": 18735 + }, + { + "epoch": 0.54, + "grad_norm": 1.4115942349536834, + "learning_rate": 4.5403364607221685e-06, + "loss": 0.2774, + "step": 18736 + }, + { + "epoch": 0.54, + "grad_norm": 1.3570988148626137, + "learning_rate": 4.5398687442076845e-06, + "loss": 0.3002, + "step": 18737 + }, + { + "epoch": 0.54, + "grad_norm": 1.531524313770289, + "learning_rate": 4.539401031753863e-06, + "loss": 0.3082, + "step": 18738 + }, + { + "epoch": 0.54, + "grad_norm": 0.9382064584103723, + "learning_rate": 4.538933323364832e-06, + "loss": 0.6051, + "step": 18739 + }, + { + "epoch": 0.54, + "grad_norm": 1.3871627620217997, + "learning_rate": 4.53846561904472e-06, + "loss": 0.2982, + "step": 18740 + }, + { + "epoch": 0.54, + "grad_norm": 1.670825145096086, + "learning_rate": 4.537997918797655e-06, + "loss": 0.2953, + "step": 18741 + }, + { + "epoch": 0.54, + "grad_norm": 1.1608321263392227, + "learning_rate": 4.5375302226277614e-06, + "loss": 0.2925, + "step": 18742 + }, + { + "epoch": 0.54, + "grad_norm": 1.5764457662991649, + "learning_rate": 4.53706253053917e-06, + "loss": 0.2945, + "step": 18743 + }, + { + "epoch": 0.54, + "grad_norm": 3.1168460238058975, + "learning_rate": 4.536594842536007e-06, + "loss": 0.2855, + "step": 18744 + }, + { + "epoch": 0.54, + "grad_norm": 1.5155849598389108, + "learning_rate": 4.536127158622401e-06, + "loss": 0.3034, + "step": 18745 + }, + { + "epoch": 0.54, + "grad_norm": 1.2143731871845471, + "learning_rate": 4.535659478802476e-06, + "loss": 0.3046, + "step": 18746 + }, + { + "epoch": 0.54, + "grad_norm": 1.4139760624875712, + "learning_rate": 4.535191803080361e-06, + "loss": 0.2843, + "step": 18747 + }, + { + "epoch": 0.54, + "grad_norm": 1.521908993316083, + "learning_rate": 4.534724131460184e-06, + "loss": 0.2907, + "step": 18748 + }, + { + "epoch": 0.54, + "grad_norm": 1.8620645190235514, + "learning_rate": 4.5342564639460715e-06, + "loss": 0.3023, + "step": 18749 + }, + { + "epoch": 0.54, + "grad_norm": 1.3016682826804666, + "learning_rate": 4.5337888005421525e-06, + "loss": 0.3278, + "step": 18750 + }, + { + "epoch": 0.54, + "grad_norm": 1.3700522995009583, + "learning_rate": 4.533321141252551e-06, + "loss": 0.3017, + "step": 18751 + }, + { + "epoch": 0.54, + "grad_norm": 1.4666149847365795, + "learning_rate": 4.532853486081395e-06, + "loss": 0.2926, + "step": 18752 + }, + { + "epoch": 0.54, + "grad_norm": 1.4586364021231872, + "learning_rate": 4.532385835032813e-06, + "loss": 0.3089, + "step": 18753 + }, + { + "epoch": 0.54, + "grad_norm": 1.2427289617560764, + "learning_rate": 4.5319181881109335e-06, + "loss": 0.2915, + "step": 18754 + }, + { + "epoch": 0.54, + "grad_norm": 1.6328320627926178, + "learning_rate": 4.531450545319878e-06, + "loss": 0.3064, + "step": 18755 + }, + { + "epoch": 0.54, + "grad_norm": 1.3398346360600595, + "learning_rate": 4.530982906663779e-06, + "loss": 0.3039, + "step": 18756 + }, + { + "epoch": 0.54, + "grad_norm": 1.3092461499905022, + "learning_rate": 4.530515272146759e-06, + "loss": 0.3, + "step": 18757 + }, + { + "epoch": 0.54, + "grad_norm": 1.3838433174159308, + "learning_rate": 4.530047641772947e-06, + "loss": 0.3092, + "step": 18758 + }, + { + "epoch": 0.54, + "grad_norm": 1.249318020437569, + "learning_rate": 4.52958001554647e-06, + "loss": 0.3022, + "step": 18759 + }, + { + "epoch": 0.54, + "grad_norm": 1.2623886192690652, + "learning_rate": 4.529112393471457e-06, + "loss": 0.2798, + "step": 18760 + }, + { + "epoch": 0.54, + "grad_norm": 1.275864847925649, + "learning_rate": 4.528644775552029e-06, + "loss": 0.2918, + "step": 18761 + }, + { + "epoch": 0.54, + "grad_norm": 3.338498144807577, + "learning_rate": 4.528177161792317e-06, + "loss": 0.3264, + "step": 18762 + }, + { + "epoch": 0.54, + "grad_norm": 1.49234813836029, + "learning_rate": 4.527709552196447e-06, + "loss": 0.3262, + "step": 18763 + }, + { + "epoch": 0.54, + "grad_norm": 1.3695622536121885, + "learning_rate": 4.527241946768546e-06, + "loss": 0.2968, + "step": 18764 + }, + { + "epoch": 0.54, + "grad_norm": 1.4082269098079057, + "learning_rate": 4.52677434551274e-06, + "loss": 0.3035, + "step": 18765 + }, + { + "epoch": 0.54, + "grad_norm": 1.2820355342896277, + "learning_rate": 4.5263067484331545e-06, + "loss": 0.287, + "step": 18766 + }, + { + "epoch": 0.54, + "grad_norm": 1.2445198441817709, + "learning_rate": 4.5258391555339185e-06, + "loss": 0.292, + "step": 18767 + }, + { + "epoch": 0.54, + "grad_norm": 1.3850970336940467, + "learning_rate": 4.525371566819157e-06, + "loss": 0.2966, + "step": 18768 + }, + { + "epoch": 0.54, + "grad_norm": 2.0454167709835467, + "learning_rate": 4.524903982292996e-06, + "loss": 0.3127, + "step": 18769 + }, + { + "epoch": 0.54, + "grad_norm": 1.555253139352583, + "learning_rate": 4.5244364019595634e-06, + "loss": 0.2969, + "step": 18770 + }, + { + "epoch": 0.54, + "grad_norm": 1.9221857114987269, + "learning_rate": 4.523968825822983e-06, + "loss": 0.2948, + "step": 18771 + }, + { + "epoch": 0.54, + "grad_norm": 1.3568840346589626, + "learning_rate": 4.523501253887385e-06, + "loss": 0.3109, + "step": 18772 + }, + { + "epoch": 0.54, + "grad_norm": 1.3834826423126017, + "learning_rate": 4.523033686156894e-06, + "loss": 0.299, + "step": 18773 + }, + { + "epoch": 0.54, + "grad_norm": 1.2954545396533574, + "learning_rate": 4.522566122635635e-06, + "loss": 0.2978, + "step": 18774 + }, + { + "epoch": 0.54, + "grad_norm": 1.2799299861509161, + "learning_rate": 4.522098563327735e-06, + "loss": 0.3008, + "step": 18775 + }, + { + "epoch": 0.54, + "grad_norm": 1.3439547562893848, + "learning_rate": 4.521631008237321e-06, + "loss": 0.3486, + "step": 18776 + }, + { + "epoch": 0.54, + "grad_norm": 1.6928406937578808, + "learning_rate": 4.521163457368519e-06, + "loss": 0.3015, + "step": 18777 + }, + { + "epoch": 0.54, + "grad_norm": 1.3073442353582994, + "learning_rate": 4.5206959107254545e-06, + "loss": 0.2892, + "step": 18778 + }, + { + "epoch": 0.54, + "grad_norm": 0.9336348942907061, + "learning_rate": 4.5202283683122535e-06, + "loss": 0.5382, + "step": 18779 + }, + { + "epoch": 0.54, + "grad_norm": 1.282216480571478, + "learning_rate": 4.519760830133042e-06, + "loss": 0.3099, + "step": 18780 + }, + { + "epoch": 0.54, + "grad_norm": 3.0418261492163245, + "learning_rate": 4.519293296191948e-06, + "loss": 0.3117, + "step": 18781 + }, + { + "epoch": 0.54, + "grad_norm": 1.2585618606091564, + "learning_rate": 4.518825766493096e-06, + "loss": 0.3095, + "step": 18782 + }, + { + "epoch": 0.54, + "grad_norm": 1.2471223715319275, + "learning_rate": 4.518358241040612e-06, + "loss": 0.2869, + "step": 18783 + }, + { + "epoch": 0.54, + "grad_norm": 1.3398249667414104, + "learning_rate": 4.517890719838621e-06, + "loss": 0.3045, + "step": 18784 + }, + { + "epoch": 0.54, + "grad_norm": 1.280240973954041, + "learning_rate": 4.517423202891249e-06, + "loss": 0.3352, + "step": 18785 + }, + { + "epoch": 0.54, + "grad_norm": 1.2852849195648564, + "learning_rate": 4.516955690202623e-06, + "loss": 0.3043, + "step": 18786 + }, + { + "epoch": 0.54, + "grad_norm": 1.348019485877154, + "learning_rate": 4.51648818177687e-06, + "loss": 0.3348, + "step": 18787 + }, + { + "epoch": 0.54, + "grad_norm": 1.225455761417586, + "learning_rate": 4.516020677618113e-06, + "loss": 0.2869, + "step": 18788 + }, + { + "epoch": 0.54, + "grad_norm": 1.5279931960890167, + "learning_rate": 4.515553177730478e-06, + "loss": 0.294, + "step": 18789 + }, + { + "epoch": 0.55, + "grad_norm": 1.3430863560602375, + "learning_rate": 4.515085682118093e-06, + "loss": 0.318, + "step": 18790 + }, + { + "epoch": 0.55, + "grad_norm": 1.3314063816978288, + "learning_rate": 4.514618190785081e-06, + "loss": 0.3174, + "step": 18791 + }, + { + "epoch": 0.55, + "grad_norm": 1.3688267076658798, + "learning_rate": 4.5141507037355705e-06, + "loss": 0.2961, + "step": 18792 + }, + { + "epoch": 0.55, + "grad_norm": 1.3533150270118763, + "learning_rate": 4.513683220973684e-06, + "loss": 0.3016, + "step": 18793 + }, + { + "epoch": 0.55, + "grad_norm": 1.2406329342357156, + "learning_rate": 4.513215742503549e-06, + "loss": 0.2961, + "step": 18794 + }, + { + "epoch": 0.55, + "grad_norm": 3.582397084062072, + "learning_rate": 4.51274826832929e-06, + "loss": 0.3129, + "step": 18795 + }, + { + "epoch": 0.55, + "grad_norm": 1.2603026599089964, + "learning_rate": 4.512280798455034e-06, + "loss": 0.3051, + "step": 18796 + }, + { + "epoch": 0.55, + "grad_norm": 1.3088905412250786, + "learning_rate": 4.511813332884905e-06, + "loss": 0.2893, + "step": 18797 + }, + { + "epoch": 0.55, + "grad_norm": 1.3939500647446257, + "learning_rate": 4.511345871623027e-06, + "loss": 0.2998, + "step": 18798 + }, + { + "epoch": 0.55, + "grad_norm": 1.3130226392919178, + "learning_rate": 4.5108784146735285e-06, + "loss": 0.3179, + "step": 18799 + }, + { + "epoch": 0.55, + "grad_norm": 1.3783122130541965, + "learning_rate": 4.5104109620405346e-06, + "loss": 0.3042, + "step": 18800 + }, + { + "epoch": 0.55, + "grad_norm": 1.6005128996915643, + "learning_rate": 4.5099435137281675e-06, + "loss": 0.3169, + "step": 18801 + }, + { + "epoch": 0.55, + "grad_norm": 1.242835046677522, + "learning_rate": 4.509476069740555e-06, + "loss": 0.2884, + "step": 18802 + }, + { + "epoch": 0.55, + "grad_norm": 1.215419913611672, + "learning_rate": 4.509008630081821e-06, + "loss": 0.2799, + "step": 18803 + }, + { + "epoch": 0.55, + "grad_norm": 1.3199529591848496, + "learning_rate": 4.508541194756094e-06, + "loss": 0.3038, + "step": 18804 + }, + { + "epoch": 0.55, + "grad_norm": 1.4523633140319385, + "learning_rate": 4.508073763767494e-06, + "loss": 0.2962, + "step": 18805 + }, + { + "epoch": 0.55, + "grad_norm": 1.320945011230539, + "learning_rate": 4.5076063371201485e-06, + "loss": 0.3092, + "step": 18806 + }, + { + "epoch": 0.55, + "grad_norm": 1.3383327896691584, + "learning_rate": 4.507138914818182e-06, + "loss": 0.3148, + "step": 18807 + }, + { + "epoch": 0.55, + "grad_norm": 1.3953986413202617, + "learning_rate": 4.506671496865721e-06, + "loss": 0.3101, + "step": 18808 + }, + { + "epoch": 0.55, + "grad_norm": 1.4284911662717634, + "learning_rate": 4.506204083266891e-06, + "loss": 0.3278, + "step": 18809 + }, + { + "epoch": 0.55, + "grad_norm": 1.5042006849149827, + "learning_rate": 4.505736674025814e-06, + "loss": 0.338, + "step": 18810 + }, + { + "epoch": 0.55, + "grad_norm": 1.625064346715542, + "learning_rate": 4.505269269146617e-06, + "loss": 0.2993, + "step": 18811 + }, + { + "epoch": 0.55, + "grad_norm": 1.336400607846495, + "learning_rate": 4.504801868633423e-06, + "loss": 0.293, + "step": 18812 + }, + { + "epoch": 0.55, + "grad_norm": 1.2484641554825782, + "learning_rate": 4.504334472490358e-06, + "loss": 0.3097, + "step": 18813 + }, + { + "epoch": 0.55, + "grad_norm": 0.8952937560404834, + "learning_rate": 4.503867080721547e-06, + "loss": 0.6342, + "step": 18814 + }, + { + "epoch": 0.55, + "grad_norm": 1.3302530300510833, + "learning_rate": 4.503399693331116e-06, + "loss": 0.2948, + "step": 18815 + }, + { + "epoch": 0.55, + "grad_norm": 1.2886841641679256, + "learning_rate": 4.5029323103231865e-06, + "loss": 0.3063, + "step": 18816 + }, + { + "epoch": 0.55, + "grad_norm": 2.1092472022545006, + "learning_rate": 4.502464931701885e-06, + "loss": 0.2973, + "step": 18817 + }, + { + "epoch": 0.55, + "grad_norm": 1.2486277260733578, + "learning_rate": 4.501997557471336e-06, + "loss": 0.302, + "step": 18818 + }, + { + "epoch": 0.55, + "grad_norm": 1.3257245120854337, + "learning_rate": 4.5015301876356655e-06, + "loss": 0.2998, + "step": 18819 + }, + { + "epoch": 0.55, + "grad_norm": 1.2194562396805333, + "learning_rate": 4.501062822198995e-06, + "loss": 0.3183, + "step": 18820 + }, + { + "epoch": 0.55, + "grad_norm": 1.2373303347400586, + "learning_rate": 4.500595461165451e-06, + "loss": 0.2856, + "step": 18821 + }, + { + "epoch": 0.55, + "grad_norm": 1.2013689634127003, + "learning_rate": 4.5001281045391574e-06, + "loss": 0.2863, + "step": 18822 + }, + { + "epoch": 0.55, + "grad_norm": 1.3290984325018633, + "learning_rate": 4.49966075232424e-06, + "loss": 0.3119, + "step": 18823 + }, + { + "epoch": 0.55, + "grad_norm": 1.2956095055624073, + "learning_rate": 4.499193404524821e-06, + "loss": 0.3053, + "step": 18824 + }, + { + "epoch": 0.55, + "grad_norm": 1.4171690347007362, + "learning_rate": 4.498726061145026e-06, + "loss": 0.2866, + "step": 18825 + }, + { + "epoch": 0.55, + "grad_norm": 0.911448446008203, + "learning_rate": 4.498258722188979e-06, + "loss": 0.5647, + "step": 18826 + }, + { + "epoch": 0.55, + "grad_norm": 1.3573531758135968, + "learning_rate": 4.497791387660804e-06, + "loss": 0.2873, + "step": 18827 + }, + { + "epoch": 0.55, + "grad_norm": 1.3332573739742657, + "learning_rate": 4.497324057564627e-06, + "loss": 0.3086, + "step": 18828 + }, + { + "epoch": 0.55, + "grad_norm": 1.4482824089183592, + "learning_rate": 4.4968567319045684e-06, + "loss": 0.2981, + "step": 18829 + }, + { + "epoch": 0.55, + "grad_norm": 1.247593066316648, + "learning_rate": 4.496389410684756e-06, + "loss": 0.2912, + "step": 18830 + }, + { + "epoch": 0.55, + "grad_norm": 1.390669684406801, + "learning_rate": 4.495922093909313e-06, + "loss": 0.3034, + "step": 18831 + }, + { + "epoch": 0.55, + "grad_norm": 1.2458993524342892, + "learning_rate": 4.495454781582364e-06, + "loss": 0.3074, + "step": 18832 + }, + { + "epoch": 0.55, + "grad_norm": 1.227000646737875, + "learning_rate": 4.4949874737080305e-06, + "loss": 0.2876, + "step": 18833 + }, + { + "epoch": 0.55, + "grad_norm": 1.2568758860604157, + "learning_rate": 4.4945201702904375e-06, + "loss": 0.2994, + "step": 18834 + }, + { + "epoch": 0.55, + "grad_norm": 1.2995998478290822, + "learning_rate": 4.49405287133371e-06, + "loss": 0.3151, + "step": 18835 + }, + { + "epoch": 0.55, + "grad_norm": 2.251056801425021, + "learning_rate": 4.4935855768419734e-06, + "loss": 0.2965, + "step": 18836 + }, + { + "epoch": 0.55, + "grad_norm": 1.7458407753816292, + "learning_rate": 4.493118286819348e-06, + "loss": 0.3016, + "step": 18837 + }, + { + "epoch": 0.55, + "grad_norm": 1.45353683434752, + "learning_rate": 4.492651001269962e-06, + "loss": 0.3057, + "step": 18838 + }, + { + "epoch": 0.55, + "grad_norm": 1.203476849932655, + "learning_rate": 4.492183720197935e-06, + "loss": 0.3137, + "step": 18839 + }, + { + "epoch": 0.55, + "grad_norm": 1.3949985867926016, + "learning_rate": 4.4917164436073905e-06, + "loss": 0.3022, + "step": 18840 + }, + { + "epoch": 0.55, + "grad_norm": 1.3635834952577797, + "learning_rate": 4.491249171502455e-06, + "loss": 0.3132, + "step": 18841 + }, + { + "epoch": 0.55, + "grad_norm": 1.2779792536946308, + "learning_rate": 4.490781903887252e-06, + "loss": 0.3035, + "step": 18842 + }, + { + "epoch": 0.55, + "grad_norm": 1.2459416675492445, + "learning_rate": 4.490314640765904e-06, + "loss": 0.3032, + "step": 18843 + }, + { + "epoch": 0.55, + "grad_norm": 1.2619155354464295, + "learning_rate": 4.489847382142534e-06, + "loss": 0.3037, + "step": 18844 + }, + { + "epoch": 0.55, + "grad_norm": 1.231372910000492, + "learning_rate": 4.489380128021267e-06, + "loss": 0.3163, + "step": 18845 + }, + { + "epoch": 0.55, + "grad_norm": 1.207489930482698, + "learning_rate": 4.488912878406227e-06, + "loss": 0.302, + "step": 18846 + }, + { + "epoch": 0.55, + "grad_norm": 1.1309440285662329, + "learning_rate": 4.488445633301536e-06, + "loss": 0.2903, + "step": 18847 + }, + { + "epoch": 0.55, + "grad_norm": 1.1771428537997515, + "learning_rate": 4.4879783927113175e-06, + "loss": 0.2783, + "step": 18848 + }, + { + "epoch": 0.55, + "grad_norm": 1.589217711154223, + "learning_rate": 4.487511156639695e-06, + "loss": 0.3082, + "step": 18849 + }, + { + "epoch": 0.55, + "grad_norm": 1.362505199284918, + "learning_rate": 4.487043925090793e-06, + "loss": 0.2945, + "step": 18850 + }, + { + "epoch": 0.55, + "grad_norm": 1.276323307535428, + "learning_rate": 4.4865766980687345e-06, + "loss": 0.3082, + "step": 18851 + }, + { + "epoch": 0.55, + "grad_norm": 1.5638550577729369, + "learning_rate": 4.486109475577642e-06, + "loss": 0.2947, + "step": 18852 + }, + { + "epoch": 0.55, + "grad_norm": 0.9580203562684652, + "learning_rate": 4.485642257621638e-06, + "loss": 0.6376, + "step": 18853 + }, + { + "epoch": 0.55, + "grad_norm": 1.8157327856558816, + "learning_rate": 4.485175044204848e-06, + "loss": 0.3072, + "step": 18854 + }, + { + "epoch": 0.55, + "grad_norm": 1.3852715444746557, + "learning_rate": 4.484707835331394e-06, + "loss": 0.3268, + "step": 18855 + }, + { + "epoch": 0.55, + "grad_norm": 1.2694074372978135, + "learning_rate": 4.484240631005398e-06, + "loss": 0.2954, + "step": 18856 + }, + { + "epoch": 0.55, + "grad_norm": 1.316737606433453, + "learning_rate": 4.483773431230984e-06, + "loss": 0.2992, + "step": 18857 + }, + { + "epoch": 0.55, + "grad_norm": 1.4308309009582865, + "learning_rate": 4.483306236012276e-06, + "loss": 0.3657, + "step": 18858 + }, + { + "epoch": 0.55, + "grad_norm": 1.7514008784023967, + "learning_rate": 4.482839045353397e-06, + "loss": 0.3196, + "step": 18859 + }, + { + "epoch": 0.55, + "grad_norm": 1.2307739995606906, + "learning_rate": 4.482371859258468e-06, + "loss": 0.3097, + "step": 18860 + }, + { + "epoch": 0.55, + "grad_norm": 1.3091620879815722, + "learning_rate": 4.481904677731613e-06, + "loss": 0.3165, + "step": 18861 + }, + { + "epoch": 0.55, + "grad_norm": 1.4364086961236753, + "learning_rate": 4.4814375007769554e-06, + "loss": 0.294, + "step": 18862 + }, + { + "epoch": 0.55, + "grad_norm": 1.1822764799082104, + "learning_rate": 4.480970328398617e-06, + "loss": 0.2951, + "step": 18863 + }, + { + "epoch": 0.55, + "grad_norm": 1.4820022601265228, + "learning_rate": 4.480503160600723e-06, + "loss": 0.2923, + "step": 18864 + }, + { + "epoch": 0.55, + "grad_norm": 1.3569924898325523, + "learning_rate": 4.480035997387393e-06, + "loss": 0.2828, + "step": 18865 + }, + { + "epoch": 0.55, + "grad_norm": 0.9513973138855957, + "learning_rate": 4.479568838762751e-06, + "loss": 0.6165, + "step": 18866 + }, + { + "epoch": 0.55, + "grad_norm": 1.3553680288232965, + "learning_rate": 4.4791016847309215e-06, + "loss": 0.294, + "step": 18867 + }, + { + "epoch": 0.55, + "grad_norm": 1.2615708196026396, + "learning_rate": 4.478634535296024e-06, + "loss": 0.3083, + "step": 18868 + }, + { + "epoch": 0.55, + "grad_norm": 0.9563145361248273, + "learning_rate": 4.478167390462182e-06, + "loss": 0.6555, + "step": 18869 + }, + { + "epoch": 0.55, + "grad_norm": 3.6079650466182756, + "learning_rate": 4.4777002502335196e-06, + "loss": 0.2908, + "step": 18870 + }, + { + "epoch": 0.55, + "grad_norm": 1.210976000689342, + "learning_rate": 4.477233114614158e-06, + "loss": 0.3069, + "step": 18871 + }, + { + "epoch": 0.55, + "grad_norm": 0.9727438954603721, + "learning_rate": 4.476765983608219e-06, + "loss": 0.6312, + "step": 18872 + }, + { + "epoch": 0.55, + "grad_norm": 1.1892615489230491, + "learning_rate": 4.4762988572198264e-06, + "loss": 0.314, + "step": 18873 + }, + { + "epoch": 0.55, + "grad_norm": 1.191945679039023, + "learning_rate": 4.475831735453103e-06, + "loss": 0.2836, + "step": 18874 + }, + { + "epoch": 0.55, + "grad_norm": 0.9741654561352762, + "learning_rate": 4.4753646183121695e-06, + "loss": 0.612, + "step": 18875 + }, + { + "epoch": 0.55, + "grad_norm": 1.3128564409221193, + "learning_rate": 4.474897505801149e-06, + "loss": 0.3076, + "step": 18876 + }, + { + "epoch": 0.55, + "grad_norm": 1.5677126878863594, + "learning_rate": 4.4744303979241645e-06, + "loss": 0.3023, + "step": 18877 + }, + { + "epoch": 0.55, + "grad_norm": 1.3334866322210361, + "learning_rate": 4.473963294685338e-06, + "loss": 0.3253, + "step": 18878 + }, + { + "epoch": 0.55, + "grad_norm": 1.2359926036835631, + "learning_rate": 4.47349619608879e-06, + "loss": 0.2988, + "step": 18879 + }, + { + "epoch": 0.55, + "grad_norm": 1.1382141238015155, + "learning_rate": 4.473029102138644e-06, + "loss": 0.2948, + "step": 18880 + }, + { + "epoch": 0.55, + "grad_norm": 1.290749581138434, + "learning_rate": 4.4725620128390226e-06, + "loss": 0.2949, + "step": 18881 + }, + { + "epoch": 0.55, + "grad_norm": 1.2119614760929351, + "learning_rate": 4.472094928194047e-06, + "loss": 0.2898, + "step": 18882 + }, + { + "epoch": 0.55, + "grad_norm": 1.5273126642403425, + "learning_rate": 4.47162784820784e-06, + "loss": 0.3115, + "step": 18883 + }, + { + "epoch": 0.55, + "grad_norm": 1.7151554170505485, + "learning_rate": 4.471160772884523e-06, + "loss": 0.3171, + "step": 18884 + }, + { + "epoch": 0.55, + "grad_norm": 1.6580077159945883, + "learning_rate": 4.470693702228217e-06, + "loss": 0.2757, + "step": 18885 + }, + { + "epoch": 0.55, + "grad_norm": 1.243539427036686, + "learning_rate": 4.4702266362430456e-06, + "loss": 0.3062, + "step": 18886 + }, + { + "epoch": 0.55, + "grad_norm": 1.6645865649319458, + "learning_rate": 4.469759574933131e-06, + "loss": 0.304, + "step": 18887 + }, + { + "epoch": 0.55, + "grad_norm": 1.351332562148327, + "learning_rate": 4.469292518302593e-06, + "loss": 0.296, + "step": 18888 + }, + { + "epoch": 0.55, + "grad_norm": 1.27092576436687, + "learning_rate": 4.468825466355554e-06, + "loss": 0.317, + "step": 18889 + }, + { + "epoch": 0.55, + "grad_norm": 1.2496323600044348, + "learning_rate": 4.468358419096137e-06, + "loss": 0.2886, + "step": 18890 + }, + { + "epoch": 0.55, + "grad_norm": 1.2440875450058133, + "learning_rate": 4.467891376528463e-06, + "loss": 0.2885, + "step": 18891 + }, + { + "epoch": 0.55, + "grad_norm": 1.4579594349075364, + "learning_rate": 4.467424338656653e-06, + "loss": 0.3167, + "step": 18892 + }, + { + "epoch": 0.55, + "grad_norm": 1.280291803409429, + "learning_rate": 4.466957305484829e-06, + "loss": 0.3183, + "step": 18893 + }, + { + "epoch": 0.55, + "grad_norm": 1.2881953882938622, + "learning_rate": 4.466490277017112e-06, + "loss": 0.2869, + "step": 18894 + }, + { + "epoch": 0.55, + "grad_norm": 1.2042927868866977, + "learning_rate": 4.466023253257627e-06, + "loss": 0.3088, + "step": 18895 + }, + { + "epoch": 0.55, + "grad_norm": 1.2246220500156075, + "learning_rate": 4.465556234210491e-06, + "loss": 0.2768, + "step": 18896 + }, + { + "epoch": 0.55, + "grad_norm": 1.5529121349882264, + "learning_rate": 4.465089219879828e-06, + "loss": 0.3007, + "step": 18897 + }, + { + "epoch": 0.55, + "grad_norm": 1.4210037643952662, + "learning_rate": 4.4646222102697574e-06, + "loss": 0.3124, + "step": 18898 + }, + { + "epoch": 0.55, + "grad_norm": 1.3307300206874855, + "learning_rate": 4.464155205384402e-06, + "loss": 0.311, + "step": 18899 + }, + { + "epoch": 0.55, + "grad_norm": 1.1510542270984736, + "learning_rate": 4.463688205227882e-06, + "loss": 0.3022, + "step": 18900 + }, + { + "epoch": 0.55, + "grad_norm": 1.2873769008007434, + "learning_rate": 4.463221209804321e-06, + "loss": 0.3271, + "step": 18901 + }, + { + "epoch": 0.55, + "grad_norm": 1.2798268562117983, + "learning_rate": 4.462754219117838e-06, + "loss": 0.3254, + "step": 18902 + }, + { + "epoch": 0.55, + "grad_norm": 1.2518121733045615, + "learning_rate": 4.462287233172555e-06, + "loss": 0.2801, + "step": 18903 + }, + { + "epoch": 0.55, + "grad_norm": 1.2005166295328689, + "learning_rate": 4.461820251972593e-06, + "loss": 0.3048, + "step": 18904 + }, + { + "epoch": 0.55, + "grad_norm": 1.708520799959801, + "learning_rate": 4.461353275522073e-06, + "loss": 0.2905, + "step": 18905 + }, + { + "epoch": 0.55, + "grad_norm": 1.2893225348614046, + "learning_rate": 4.4608863038251176e-06, + "loss": 0.3139, + "step": 18906 + }, + { + "epoch": 0.55, + "grad_norm": 1.2481444628821912, + "learning_rate": 4.460419336885845e-06, + "loss": 0.2989, + "step": 18907 + }, + { + "epoch": 0.55, + "grad_norm": 0.9528058246422078, + "learning_rate": 4.459952374708379e-06, + "loss": 0.6074, + "step": 18908 + }, + { + "epoch": 0.55, + "grad_norm": 1.324208395824368, + "learning_rate": 4.459485417296839e-06, + "loss": 0.2901, + "step": 18909 + }, + { + "epoch": 0.55, + "grad_norm": 1.2426344548994699, + "learning_rate": 4.459018464655346e-06, + "loss": 0.2749, + "step": 18910 + }, + { + "epoch": 0.55, + "grad_norm": 1.3478515716305592, + "learning_rate": 4.458551516788021e-06, + "loss": 0.3377, + "step": 18911 + }, + { + "epoch": 0.55, + "grad_norm": 1.2012187407609936, + "learning_rate": 4.458084573698984e-06, + "loss": 0.3133, + "step": 18912 + }, + { + "epoch": 0.55, + "grad_norm": 0.9420646807262313, + "learning_rate": 4.4576176353923575e-06, + "loss": 0.5799, + "step": 18913 + }, + { + "epoch": 0.55, + "grad_norm": 1.3602533486184225, + "learning_rate": 4.457150701872263e-06, + "loss": 0.2863, + "step": 18914 + }, + { + "epoch": 0.55, + "grad_norm": 1.5347645588644896, + "learning_rate": 4.456683773142817e-06, + "loss": 0.3213, + "step": 18915 + }, + { + "epoch": 0.55, + "grad_norm": 1.4794563382225172, + "learning_rate": 4.456216849208143e-06, + "loss": 0.3015, + "step": 18916 + }, + { + "epoch": 0.55, + "grad_norm": 1.298560899418626, + "learning_rate": 4.455749930072362e-06, + "loss": 0.2959, + "step": 18917 + }, + { + "epoch": 0.55, + "grad_norm": 1.5336189451109574, + "learning_rate": 4.455283015739594e-06, + "loss": 0.3127, + "step": 18918 + }, + { + "epoch": 0.55, + "grad_norm": 1.1472126663433466, + "learning_rate": 4.45481610621396e-06, + "loss": 0.3133, + "step": 18919 + }, + { + "epoch": 0.55, + "grad_norm": 1.3776908707937225, + "learning_rate": 4.454349201499579e-06, + "loss": 0.3268, + "step": 18920 + }, + { + "epoch": 0.55, + "grad_norm": 1.4841598807320184, + "learning_rate": 4.4538823016005725e-06, + "loss": 0.3114, + "step": 18921 + }, + { + "epoch": 0.55, + "grad_norm": 1.2721076552126422, + "learning_rate": 4.4534154065210606e-06, + "loss": 0.2906, + "step": 18922 + }, + { + "epoch": 0.55, + "grad_norm": 1.2647793664567304, + "learning_rate": 4.452948516265166e-06, + "loss": 0.3095, + "step": 18923 + }, + { + "epoch": 0.55, + "grad_norm": 1.2870278883324817, + "learning_rate": 4.452481630837005e-06, + "loss": 0.2951, + "step": 18924 + }, + { + "epoch": 0.55, + "grad_norm": 0.9462596476061366, + "learning_rate": 4.4520147502407e-06, + "loss": 0.5783, + "step": 18925 + }, + { + "epoch": 0.55, + "grad_norm": 1.38245894203914, + "learning_rate": 4.4515478744803705e-06, + "loss": 0.3066, + "step": 18926 + }, + { + "epoch": 0.55, + "grad_norm": 1.60970273382512, + "learning_rate": 4.451081003560136e-06, + "loss": 0.2782, + "step": 18927 + }, + { + "epoch": 0.55, + "grad_norm": 1.368461901780212, + "learning_rate": 4.4506141374841184e-06, + "loss": 0.2784, + "step": 18928 + }, + { + "epoch": 0.55, + "grad_norm": 1.2552379408078256, + "learning_rate": 4.450147276256439e-06, + "loss": 0.303, + "step": 18929 + }, + { + "epoch": 0.55, + "grad_norm": 1.8225763321888278, + "learning_rate": 4.449680419881214e-06, + "loss": 0.2897, + "step": 18930 + }, + { + "epoch": 0.55, + "grad_norm": 1.394104495043688, + "learning_rate": 4.449213568362566e-06, + "loss": 0.314, + "step": 18931 + }, + { + "epoch": 0.55, + "grad_norm": 1.1730110582957942, + "learning_rate": 4.4487467217046135e-06, + "loss": 0.285, + "step": 18932 + }, + { + "epoch": 0.55, + "grad_norm": 1.307087777684439, + "learning_rate": 4.448279879911479e-06, + "loss": 0.3303, + "step": 18933 + }, + { + "epoch": 0.55, + "grad_norm": 1.3234211822763493, + "learning_rate": 4.447813042987279e-06, + "loss": 0.3027, + "step": 18934 + }, + { + "epoch": 0.55, + "grad_norm": 1.400126859131544, + "learning_rate": 4.447346210936137e-06, + "loss": 0.306, + "step": 18935 + }, + { + "epoch": 0.55, + "grad_norm": 1.2436639605812552, + "learning_rate": 4.446879383762169e-06, + "loss": 0.2886, + "step": 18936 + }, + { + "epoch": 0.55, + "grad_norm": 1.2909058752350917, + "learning_rate": 4.446412561469497e-06, + "loss": 0.2872, + "step": 18937 + }, + { + "epoch": 0.55, + "grad_norm": 1.3535104592827074, + "learning_rate": 4.445945744062241e-06, + "loss": 0.314, + "step": 18938 + }, + { + "epoch": 0.55, + "grad_norm": 1.3590806180610573, + "learning_rate": 4.445478931544519e-06, + "loss": 0.2979, + "step": 18939 + }, + { + "epoch": 0.55, + "grad_norm": 1.9701394618407226, + "learning_rate": 4.445012123920452e-06, + "loss": 0.304, + "step": 18940 + }, + { + "epoch": 0.55, + "grad_norm": 1.6085270570067849, + "learning_rate": 4.4445453211941585e-06, + "loss": 0.2919, + "step": 18941 + }, + { + "epoch": 0.55, + "grad_norm": 1.2552302736365633, + "learning_rate": 4.44407852336976e-06, + "loss": 0.3198, + "step": 18942 + }, + { + "epoch": 0.55, + "grad_norm": 1.3083702136040052, + "learning_rate": 4.443611730451375e-06, + "loss": 0.3043, + "step": 18943 + }, + { + "epoch": 0.55, + "grad_norm": 1.4769986360543657, + "learning_rate": 4.443144942443121e-06, + "loss": 0.2856, + "step": 18944 + }, + { + "epoch": 0.55, + "grad_norm": 1.3153290463055451, + "learning_rate": 4.442678159349119e-06, + "loss": 0.2913, + "step": 18945 + }, + { + "epoch": 0.55, + "grad_norm": 1.5167326528548577, + "learning_rate": 4.44221138117349e-06, + "loss": 0.32, + "step": 18946 + }, + { + "epoch": 0.55, + "grad_norm": 1.4016127195959052, + "learning_rate": 4.441744607920351e-06, + "loss": 0.2969, + "step": 18947 + }, + { + "epoch": 0.55, + "grad_norm": 1.3088010569185184, + "learning_rate": 4.441277839593822e-06, + "loss": 0.3161, + "step": 18948 + }, + { + "epoch": 0.55, + "grad_norm": 1.5183244787523171, + "learning_rate": 4.440811076198022e-06, + "loss": 0.3003, + "step": 18949 + }, + { + "epoch": 0.55, + "grad_norm": 1.2095343283858426, + "learning_rate": 4.4403443177370705e-06, + "loss": 0.304, + "step": 18950 + }, + { + "epoch": 0.55, + "grad_norm": 1.2532364997394838, + "learning_rate": 4.439877564215088e-06, + "loss": 0.2907, + "step": 18951 + }, + { + "epoch": 0.55, + "grad_norm": 1.95149714652068, + "learning_rate": 4.439410815636192e-06, + "loss": 0.2968, + "step": 18952 + }, + { + "epoch": 0.55, + "grad_norm": 1.2423943922294567, + "learning_rate": 4.438944072004501e-06, + "loss": 0.2977, + "step": 18953 + }, + { + "epoch": 0.55, + "grad_norm": 1.7311182703459345, + "learning_rate": 4.438477333324134e-06, + "loss": 0.3087, + "step": 18954 + }, + { + "epoch": 0.55, + "grad_norm": 1.327486082196171, + "learning_rate": 4.43801059959921e-06, + "loss": 0.305, + "step": 18955 + }, + { + "epoch": 0.55, + "grad_norm": 1.5672488281233066, + "learning_rate": 4.437543870833852e-06, + "loss": 0.3043, + "step": 18956 + }, + { + "epoch": 0.55, + "grad_norm": 1.3000247529457478, + "learning_rate": 4.437077147032173e-06, + "loss": 0.3117, + "step": 18957 + }, + { + "epoch": 0.55, + "grad_norm": 1.583170504256178, + "learning_rate": 4.436610428198295e-06, + "loss": 0.3228, + "step": 18958 + }, + { + "epoch": 0.55, + "grad_norm": 1.2719765949039845, + "learning_rate": 4.436143714336335e-06, + "loss": 0.2925, + "step": 18959 + }, + { + "epoch": 0.55, + "grad_norm": 1.5351063460530965, + "learning_rate": 4.435677005450415e-06, + "loss": 0.2905, + "step": 18960 + }, + { + "epoch": 0.55, + "grad_norm": 1.1917618437243425, + "learning_rate": 4.435210301544651e-06, + "loss": 0.2801, + "step": 18961 + }, + { + "epoch": 0.55, + "grad_norm": 1.464725323695733, + "learning_rate": 4.434743602623163e-06, + "loss": 0.3099, + "step": 18962 + }, + { + "epoch": 0.55, + "grad_norm": 1.2660983209668946, + "learning_rate": 4.434276908690067e-06, + "loss": 0.2889, + "step": 18963 + }, + { + "epoch": 0.55, + "grad_norm": 1.4074424778832288, + "learning_rate": 4.433810219749485e-06, + "loss": 0.2876, + "step": 18964 + }, + { + "epoch": 0.55, + "grad_norm": 1.8767696842064947, + "learning_rate": 4.433343535805535e-06, + "loss": 0.3069, + "step": 18965 + }, + { + "epoch": 0.55, + "grad_norm": 1.3311083354799307, + "learning_rate": 4.432876856862333e-06, + "loss": 0.3076, + "step": 18966 + }, + { + "epoch": 0.55, + "grad_norm": 1.6838555625184695, + "learning_rate": 4.4324101829239994e-06, + "loss": 0.2946, + "step": 18967 + }, + { + "epoch": 0.55, + "grad_norm": 1.7681920976470848, + "learning_rate": 4.431943513994652e-06, + "loss": 0.3047, + "step": 18968 + }, + { + "epoch": 0.55, + "grad_norm": 1.3386865514995643, + "learning_rate": 4.43147685007841e-06, + "loss": 0.3006, + "step": 18969 + }, + { + "epoch": 0.55, + "grad_norm": 1.2879997541998884, + "learning_rate": 4.431010191179391e-06, + "loss": 0.3035, + "step": 18970 + }, + { + "epoch": 0.55, + "grad_norm": 1.2179179445550077, + "learning_rate": 4.430543537301713e-06, + "loss": 0.3041, + "step": 18971 + }, + { + "epoch": 0.55, + "grad_norm": 1.3112982972505753, + "learning_rate": 4.430076888449494e-06, + "loss": 0.3012, + "step": 18972 + }, + { + "epoch": 0.55, + "grad_norm": 1.1680524348403358, + "learning_rate": 4.4296102446268554e-06, + "loss": 0.2967, + "step": 18973 + }, + { + "epoch": 0.55, + "grad_norm": 1.4659100837464507, + "learning_rate": 4.42914360583791e-06, + "loss": 0.3035, + "step": 18974 + }, + { + "epoch": 0.55, + "grad_norm": 1.4671883305870044, + "learning_rate": 4.42867697208678e-06, + "loss": 0.2935, + "step": 18975 + }, + { + "epoch": 0.55, + "grad_norm": 1.646188927599788, + "learning_rate": 4.428210343377581e-06, + "loss": 0.2985, + "step": 18976 + }, + { + "epoch": 0.55, + "grad_norm": 1.2748868879910884, + "learning_rate": 4.4277437197144315e-06, + "loss": 0.307, + "step": 18977 + }, + { + "epoch": 0.55, + "grad_norm": 1.4175889515423374, + "learning_rate": 4.427277101101452e-06, + "loss": 0.304, + "step": 18978 + }, + { + "epoch": 0.55, + "grad_norm": 1.291556181571716, + "learning_rate": 4.426810487542759e-06, + "loss": 0.2861, + "step": 18979 + }, + { + "epoch": 0.55, + "grad_norm": 1.1815385257089732, + "learning_rate": 4.426343879042469e-06, + "loss": 0.2922, + "step": 18980 + }, + { + "epoch": 0.55, + "grad_norm": 1.3291592847476839, + "learning_rate": 4.4258772756047e-06, + "loss": 0.3038, + "step": 18981 + }, + { + "epoch": 0.55, + "grad_norm": 1.413026455537833, + "learning_rate": 4.425410677233571e-06, + "loss": 0.312, + "step": 18982 + }, + { + "epoch": 0.55, + "grad_norm": 1.3601691305283063, + "learning_rate": 4.424944083933198e-06, + "loss": 0.2912, + "step": 18983 + }, + { + "epoch": 0.55, + "grad_norm": 1.2357827099459915, + "learning_rate": 4.424477495707701e-06, + "loss": 0.2978, + "step": 18984 + }, + { + "epoch": 0.55, + "grad_norm": 1.276586521673891, + "learning_rate": 4.424010912561195e-06, + "loss": 0.2956, + "step": 18985 + }, + { + "epoch": 0.55, + "grad_norm": 1.2384969905873318, + "learning_rate": 4.423544334497801e-06, + "loss": 0.3029, + "step": 18986 + }, + { + "epoch": 0.55, + "grad_norm": 1.2416474666576616, + "learning_rate": 4.423077761521633e-06, + "loss": 0.302, + "step": 18987 + }, + { + "epoch": 0.55, + "grad_norm": 1.3283377668945249, + "learning_rate": 4.422611193636811e-06, + "loss": 0.306, + "step": 18988 + }, + { + "epoch": 0.55, + "grad_norm": 1.1916912673134183, + "learning_rate": 4.422144630847451e-06, + "loss": 0.2995, + "step": 18989 + }, + { + "epoch": 0.55, + "grad_norm": 1.6722649122737225, + "learning_rate": 4.421678073157672e-06, + "loss": 0.3086, + "step": 18990 + }, + { + "epoch": 0.55, + "grad_norm": 1.4694548514318804, + "learning_rate": 4.421211520571589e-06, + "loss": 0.3277, + "step": 18991 + }, + { + "epoch": 0.55, + "grad_norm": 1.3445640722715877, + "learning_rate": 4.420744973093323e-06, + "loss": 0.2987, + "step": 18992 + }, + { + "epoch": 0.55, + "grad_norm": 1.484742415730375, + "learning_rate": 4.420278430726987e-06, + "loss": 0.301, + "step": 18993 + }, + { + "epoch": 0.55, + "grad_norm": 1.2404465166882426, + "learning_rate": 4.419811893476701e-06, + "loss": 0.2924, + "step": 18994 + }, + { + "epoch": 0.55, + "grad_norm": 1.2148454531817077, + "learning_rate": 4.419345361346581e-06, + "loss": 0.2936, + "step": 18995 + }, + { + "epoch": 0.55, + "grad_norm": 1.3288636459176653, + "learning_rate": 4.418878834340745e-06, + "loss": 0.3063, + "step": 18996 + }, + { + "epoch": 0.55, + "grad_norm": 1.2000594215236817, + "learning_rate": 4.418412312463311e-06, + "loss": 0.2738, + "step": 18997 + }, + { + "epoch": 0.55, + "grad_norm": 1.4855169643624062, + "learning_rate": 4.417945795718392e-06, + "loss": 0.3018, + "step": 18998 + }, + { + "epoch": 0.55, + "grad_norm": 1.3720417270768193, + "learning_rate": 4.41747928411011e-06, + "loss": 0.3112, + "step": 18999 + }, + { + "epoch": 0.55, + "grad_norm": 0.9347865831290496, + "learning_rate": 4.417012777642579e-06, + "loss": 0.5841, + "step": 19000 + }, + { + "epoch": 0.55, + "grad_norm": 1.70728545150072, + "learning_rate": 4.416546276319917e-06, + "loss": 0.3031, + "step": 19001 + }, + { + "epoch": 0.55, + "grad_norm": 1.3983698242066753, + "learning_rate": 4.416079780146241e-06, + "loss": 0.3065, + "step": 19002 + }, + { + "epoch": 0.55, + "grad_norm": 1.2200715602835002, + "learning_rate": 4.415613289125667e-06, + "loss": 0.3087, + "step": 19003 + }, + { + "epoch": 0.55, + "grad_norm": 1.2703605737625339, + "learning_rate": 4.415146803262312e-06, + "loss": 0.3113, + "step": 19004 + }, + { + "epoch": 0.55, + "grad_norm": 1.2707160258472792, + "learning_rate": 4.414680322560293e-06, + "loss": 0.297, + "step": 19005 + }, + { + "epoch": 0.55, + "grad_norm": 1.4353033362099505, + "learning_rate": 4.414213847023726e-06, + "loss": 0.3011, + "step": 19006 + }, + { + "epoch": 0.55, + "grad_norm": 1.4383732720580242, + "learning_rate": 4.4137473766567314e-06, + "loss": 0.3086, + "step": 19007 + }, + { + "epoch": 0.55, + "grad_norm": 1.2427632374716542, + "learning_rate": 4.41328091146342e-06, + "loss": 0.2964, + "step": 19008 + }, + { + "epoch": 0.55, + "grad_norm": 1.1251475818065795, + "learning_rate": 4.412814451447911e-06, + "loss": 0.2923, + "step": 19009 + }, + { + "epoch": 0.55, + "grad_norm": 1.2150354735846052, + "learning_rate": 4.412347996614321e-06, + "loss": 0.2965, + "step": 19010 + }, + { + "epoch": 0.55, + "grad_norm": 1.1989864442291087, + "learning_rate": 4.411881546966768e-06, + "loss": 0.3031, + "step": 19011 + }, + { + "epoch": 0.55, + "grad_norm": 1.477661941175661, + "learning_rate": 4.411415102509365e-06, + "loss": 0.3278, + "step": 19012 + }, + { + "epoch": 0.55, + "grad_norm": 1.3591279650898636, + "learning_rate": 4.41094866324623e-06, + "loss": 0.2921, + "step": 19013 + }, + { + "epoch": 0.55, + "grad_norm": 1.5157210187428947, + "learning_rate": 4.410482229181481e-06, + "loss": 0.3007, + "step": 19014 + }, + { + "epoch": 0.55, + "grad_norm": 1.3946029722931166, + "learning_rate": 4.410015800319232e-06, + "loss": 0.3189, + "step": 19015 + }, + { + "epoch": 0.55, + "grad_norm": 1.38940547239325, + "learning_rate": 4.4095493766636015e-06, + "loss": 0.3002, + "step": 19016 + }, + { + "epoch": 0.55, + "grad_norm": 1.3100090346387225, + "learning_rate": 4.409082958218703e-06, + "loss": 0.3037, + "step": 19017 + }, + { + "epoch": 0.55, + "grad_norm": 1.4148783118108517, + "learning_rate": 4.408616544988654e-06, + "loss": 0.319, + "step": 19018 + }, + { + "epoch": 0.55, + "grad_norm": 1.4058040585821328, + "learning_rate": 4.40815013697757e-06, + "loss": 0.3255, + "step": 19019 + }, + { + "epoch": 0.55, + "grad_norm": 1.4876685276550856, + "learning_rate": 4.407683734189569e-06, + "loss": 0.3353, + "step": 19020 + }, + { + "epoch": 0.55, + "grad_norm": 1.2867289314010757, + "learning_rate": 4.407217336628765e-06, + "loss": 0.3039, + "step": 19021 + }, + { + "epoch": 0.55, + "grad_norm": 1.4128710549114813, + "learning_rate": 4.406750944299274e-06, + "loss": 0.2978, + "step": 19022 + }, + { + "epoch": 0.55, + "grad_norm": 1.2781871416470576, + "learning_rate": 4.4062845572052135e-06, + "loss": 0.3045, + "step": 19023 + }, + { + "epoch": 0.55, + "grad_norm": 1.3606988307050387, + "learning_rate": 4.4058181753506985e-06, + "loss": 0.283, + "step": 19024 + }, + { + "epoch": 0.55, + "grad_norm": 1.355463381463179, + "learning_rate": 4.405351798739844e-06, + "loss": 0.3086, + "step": 19025 + }, + { + "epoch": 0.55, + "grad_norm": 1.4785911004849754, + "learning_rate": 4.404885427376766e-06, + "loss": 0.3045, + "step": 19026 + }, + { + "epoch": 0.55, + "grad_norm": 1.27262903796852, + "learning_rate": 4.404419061265581e-06, + "loss": 0.2977, + "step": 19027 + }, + { + "epoch": 0.55, + "grad_norm": 1.2224197432385846, + "learning_rate": 4.403952700410406e-06, + "loss": 0.2853, + "step": 19028 + }, + { + "epoch": 0.55, + "grad_norm": 1.2453053849317535, + "learning_rate": 4.403486344815353e-06, + "loss": 0.2934, + "step": 19029 + }, + { + "epoch": 0.55, + "grad_norm": 1.319714626017261, + "learning_rate": 4.4030199944845405e-06, + "loss": 0.2939, + "step": 19030 + }, + { + "epoch": 0.55, + "grad_norm": 1.3065895337494189, + "learning_rate": 4.402553649422082e-06, + "loss": 0.2932, + "step": 19031 + }, + { + "epoch": 0.55, + "grad_norm": 1.576077033934409, + "learning_rate": 4.402087309632095e-06, + "loss": 0.3223, + "step": 19032 + }, + { + "epoch": 0.55, + "grad_norm": 1.2976776490367103, + "learning_rate": 4.4016209751186955e-06, + "loss": 0.3023, + "step": 19033 + }, + { + "epoch": 0.55, + "grad_norm": 1.5910460753901592, + "learning_rate": 4.4011546458859965e-06, + "loss": 0.3418, + "step": 19034 + }, + { + "epoch": 0.55, + "grad_norm": 1.5830013280070745, + "learning_rate": 4.400688321938116e-06, + "loss": 0.3091, + "step": 19035 + }, + { + "epoch": 0.55, + "grad_norm": 1.4924348217433265, + "learning_rate": 4.400222003279166e-06, + "loss": 0.3008, + "step": 19036 + }, + { + "epoch": 0.55, + "grad_norm": 2.105053844962565, + "learning_rate": 4.399755689913264e-06, + "loss": 0.3068, + "step": 19037 + }, + { + "epoch": 0.55, + "grad_norm": 1.2379614006282647, + "learning_rate": 4.399289381844522e-06, + "loss": 0.3078, + "step": 19038 + }, + { + "epoch": 0.55, + "grad_norm": 1.3218612173098037, + "learning_rate": 4.398823079077061e-06, + "loss": 0.2895, + "step": 19039 + }, + { + "epoch": 0.55, + "grad_norm": 1.2165166247461892, + "learning_rate": 4.3983567816149924e-06, + "loss": 0.3004, + "step": 19040 + }, + { + "epoch": 0.55, + "grad_norm": 1.3182492656221765, + "learning_rate": 4.39789048946243e-06, + "loss": 0.2906, + "step": 19041 + }, + { + "epoch": 0.55, + "grad_norm": 1.7333220009482126, + "learning_rate": 4.397424202623493e-06, + "loss": 0.3037, + "step": 19042 + }, + { + "epoch": 0.55, + "grad_norm": 1.252393374608936, + "learning_rate": 4.396957921102294e-06, + "loss": 0.3076, + "step": 19043 + }, + { + "epoch": 0.55, + "grad_norm": 1.2092614156024954, + "learning_rate": 4.396491644902947e-06, + "loss": 0.2947, + "step": 19044 + }, + { + "epoch": 0.55, + "grad_norm": 1.2362715055326603, + "learning_rate": 4.396025374029568e-06, + "loss": 0.2903, + "step": 19045 + }, + { + "epoch": 0.55, + "grad_norm": 1.2708947160455635, + "learning_rate": 4.395559108486272e-06, + "loss": 0.3241, + "step": 19046 + }, + { + "epoch": 0.55, + "grad_norm": 1.3010236965077036, + "learning_rate": 4.3950928482771746e-06, + "loss": 0.2993, + "step": 19047 + }, + { + "epoch": 0.55, + "grad_norm": 1.3144242702130546, + "learning_rate": 4.394626593406388e-06, + "loss": 0.2926, + "step": 19048 + }, + { + "epoch": 0.55, + "grad_norm": 1.396159494488647, + "learning_rate": 4.3941603438780285e-06, + "loss": 0.2948, + "step": 19049 + }, + { + "epoch": 0.55, + "grad_norm": 1.241825036294462, + "learning_rate": 4.393694099696211e-06, + "loss": 0.291, + "step": 19050 + }, + { + "epoch": 0.55, + "grad_norm": 1.2885972384802404, + "learning_rate": 4.39322786086505e-06, + "loss": 0.2997, + "step": 19051 + }, + { + "epoch": 0.55, + "grad_norm": 1.776407760805257, + "learning_rate": 4.3927616273886606e-06, + "loss": 0.3061, + "step": 19052 + }, + { + "epoch": 0.55, + "grad_norm": 4.67859753309842, + "learning_rate": 4.392295399271155e-06, + "loss": 0.3383, + "step": 19053 + }, + { + "epoch": 0.55, + "grad_norm": 1.279627008353553, + "learning_rate": 4.39182917651665e-06, + "loss": 0.2969, + "step": 19054 + }, + { + "epoch": 0.55, + "grad_norm": 1.3839554871358666, + "learning_rate": 4.391362959129259e-06, + "loss": 0.2874, + "step": 19055 + }, + { + "epoch": 0.55, + "grad_norm": 1.4352792790312354, + "learning_rate": 4.3908967471130974e-06, + "loss": 0.3116, + "step": 19056 + }, + { + "epoch": 0.55, + "grad_norm": 1.5263707343536699, + "learning_rate": 4.390430540472278e-06, + "loss": 0.3132, + "step": 19057 + }, + { + "epoch": 0.55, + "grad_norm": 1.2338898762727035, + "learning_rate": 4.389964339210916e-06, + "loss": 0.3099, + "step": 19058 + }, + { + "epoch": 0.55, + "grad_norm": 1.451247844420182, + "learning_rate": 4.389498143333125e-06, + "loss": 0.3103, + "step": 19059 + }, + { + "epoch": 0.55, + "grad_norm": 1.4925767362076925, + "learning_rate": 4.389031952843021e-06, + "loss": 0.3032, + "step": 19060 + }, + { + "epoch": 0.55, + "grad_norm": 1.3772673389441525, + "learning_rate": 4.388565767744716e-06, + "loss": 0.2996, + "step": 19061 + }, + { + "epoch": 0.55, + "grad_norm": 1.2727178895460132, + "learning_rate": 4.388099588042325e-06, + "loss": 0.2979, + "step": 19062 + }, + { + "epoch": 0.55, + "grad_norm": 1.3849650350748692, + "learning_rate": 4.387633413739963e-06, + "loss": 0.309, + "step": 19063 + }, + { + "epoch": 0.55, + "grad_norm": 1.3232658951413532, + "learning_rate": 4.387167244841742e-06, + "loss": 0.3215, + "step": 19064 + }, + { + "epoch": 0.55, + "grad_norm": 1.3563060508776072, + "learning_rate": 4.386701081351776e-06, + "loss": 0.2856, + "step": 19065 + }, + { + "epoch": 0.55, + "grad_norm": 1.2706909287651451, + "learning_rate": 4.386234923274182e-06, + "loss": 0.2937, + "step": 19066 + }, + { + "epoch": 0.55, + "grad_norm": 1.3159076830670418, + "learning_rate": 4.385768770613069e-06, + "loss": 0.305, + "step": 19067 + }, + { + "epoch": 0.55, + "grad_norm": 1.4275340550224134, + "learning_rate": 4.385302623372554e-06, + "loss": 0.2844, + "step": 19068 + }, + { + "epoch": 0.55, + "grad_norm": 1.2432836731644297, + "learning_rate": 4.384836481556751e-06, + "loss": 0.3039, + "step": 19069 + }, + { + "epoch": 0.55, + "grad_norm": 1.24618390513621, + "learning_rate": 4.384370345169773e-06, + "loss": 0.2842, + "step": 19070 + }, + { + "epoch": 0.55, + "grad_norm": 1.2483556385364774, + "learning_rate": 4.383904214215733e-06, + "loss": 0.3107, + "step": 19071 + }, + { + "epoch": 0.55, + "grad_norm": 1.614091981013293, + "learning_rate": 4.383438088698745e-06, + "loss": 0.3152, + "step": 19072 + }, + { + "epoch": 0.55, + "grad_norm": 1.6566487970751882, + "learning_rate": 4.382971968622922e-06, + "loss": 0.2802, + "step": 19073 + }, + { + "epoch": 0.55, + "grad_norm": 1.691684383424667, + "learning_rate": 4.3825058539923795e-06, + "loss": 0.2881, + "step": 19074 + }, + { + "epoch": 0.55, + "grad_norm": 1.219758694272485, + "learning_rate": 4.38203974481123e-06, + "loss": 0.2788, + "step": 19075 + }, + { + "epoch": 0.55, + "grad_norm": 1.3287224053220525, + "learning_rate": 4.381573641083585e-06, + "loss": 0.3152, + "step": 19076 + }, + { + "epoch": 0.55, + "grad_norm": 2.506140822100025, + "learning_rate": 4.38110754281356e-06, + "loss": 0.2854, + "step": 19077 + }, + { + "epoch": 0.55, + "grad_norm": 1.1962206054861109, + "learning_rate": 4.380641450005268e-06, + "loss": 0.2853, + "step": 19078 + }, + { + "epoch": 0.55, + "grad_norm": 1.194548558582478, + "learning_rate": 4.380175362662823e-06, + "loss": 0.2916, + "step": 19079 + }, + { + "epoch": 0.55, + "grad_norm": 1.238311807573213, + "learning_rate": 4.379709280790336e-06, + "loss": 0.2955, + "step": 19080 + }, + { + "epoch": 0.55, + "grad_norm": 1.3921327936324601, + "learning_rate": 4.379243204391922e-06, + "loss": 0.2897, + "step": 19081 + }, + { + "epoch": 0.55, + "grad_norm": 1.2083753864342839, + "learning_rate": 4.3787771334716934e-06, + "loss": 0.299, + "step": 19082 + }, + { + "epoch": 0.55, + "grad_norm": 1.2968551031468605, + "learning_rate": 4.378311068033764e-06, + "loss": 0.357, + "step": 19083 + }, + { + "epoch": 0.55, + "grad_norm": 1.3203818256498074, + "learning_rate": 4.377845008082246e-06, + "loss": 0.294, + "step": 19084 + }, + { + "epoch": 0.55, + "grad_norm": 2.1283359362192305, + "learning_rate": 4.377378953621252e-06, + "loss": 0.2925, + "step": 19085 + }, + { + "epoch": 0.55, + "grad_norm": 1.61164778916555, + "learning_rate": 4.376912904654896e-06, + "loss": 0.2817, + "step": 19086 + }, + { + "epoch": 0.55, + "grad_norm": 1.298129702097069, + "learning_rate": 4.3764468611872915e-06, + "loss": 0.2955, + "step": 19087 + }, + { + "epoch": 0.55, + "grad_norm": 1.3633873491764044, + "learning_rate": 4.37598082322255e-06, + "loss": 0.3126, + "step": 19088 + }, + { + "epoch": 0.55, + "grad_norm": 0.9142802103528134, + "learning_rate": 4.375514790764785e-06, + "loss": 0.5601, + "step": 19089 + }, + { + "epoch": 0.55, + "grad_norm": 1.3522695665611328, + "learning_rate": 4.375048763818108e-06, + "loss": 0.308, + "step": 19090 + }, + { + "epoch": 0.55, + "grad_norm": 1.436954489183651, + "learning_rate": 4.374582742386634e-06, + "loss": 0.2758, + "step": 19091 + }, + { + "epoch": 0.55, + "grad_norm": 1.4519887656225667, + "learning_rate": 4.374116726474473e-06, + "loss": 0.3066, + "step": 19092 + }, + { + "epoch": 0.55, + "grad_norm": 1.3069641575139748, + "learning_rate": 4.373650716085739e-06, + "loss": 0.2901, + "step": 19093 + }, + { + "epoch": 0.55, + "grad_norm": 1.1968488491397153, + "learning_rate": 4.373184711224545e-06, + "loss": 0.2789, + "step": 19094 + }, + { + "epoch": 0.55, + "grad_norm": 1.7795011356793593, + "learning_rate": 4.372718711895002e-06, + "loss": 0.3088, + "step": 19095 + }, + { + "epoch": 0.55, + "grad_norm": 1.28478790411837, + "learning_rate": 4.372252718101223e-06, + "loss": 0.2941, + "step": 19096 + }, + { + "epoch": 0.55, + "grad_norm": 1.285638567588655, + "learning_rate": 4.371786729847322e-06, + "loss": 0.2979, + "step": 19097 + }, + { + "epoch": 0.55, + "grad_norm": 1.5797014984092346, + "learning_rate": 4.37132074713741e-06, + "loss": 0.3079, + "step": 19098 + }, + { + "epoch": 0.55, + "grad_norm": 1.406991106393308, + "learning_rate": 4.370854769975598e-06, + "loss": 0.3056, + "step": 19099 + }, + { + "epoch": 0.55, + "grad_norm": 1.2234283001982038, + "learning_rate": 4.3703887983659995e-06, + "loss": 0.2916, + "step": 19100 + }, + { + "epoch": 0.55, + "grad_norm": 1.3284136477627029, + "learning_rate": 4.369922832312728e-06, + "loss": 0.2914, + "step": 19101 + }, + { + "epoch": 0.55, + "grad_norm": 1.481543817836026, + "learning_rate": 4.369456871819895e-06, + "loss": 0.2979, + "step": 19102 + }, + { + "epoch": 0.55, + "grad_norm": 1.3317134082689417, + "learning_rate": 4.368990916891611e-06, + "loss": 0.2887, + "step": 19103 + }, + { + "epoch": 0.55, + "grad_norm": 1.2486065993551736, + "learning_rate": 4.3685249675319894e-06, + "loss": 0.3001, + "step": 19104 + }, + { + "epoch": 0.55, + "grad_norm": 1.4486372184539875, + "learning_rate": 4.3680590237451415e-06, + "loss": 0.2981, + "step": 19105 + }, + { + "epoch": 0.55, + "grad_norm": 1.3040828932740873, + "learning_rate": 4.367593085535181e-06, + "loss": 0.2941, + "step": 19106 + }, + { + "epoch": 0.55, + "grad_norm": 1.5016703594805934, + "learning_rate": 4.367127152906218e-06, + "loss": 0.3161, + "step": 19107 + }, + { + "epoch": 0.55, + "grad_norm": 1.2431383654561312, + "learning_rate": 4.3666612258623646e-06, + "loss": 0.2885, + "step": 19108 + }, + { + "epoch": 0.55, + "grad_norm": 1.321744251257569, + "learning_rate": 4.366195304407732e-06, + "loss": 0.3039, + "step": 19109 + }, + { + "epoch": 0.55, + "grad_norm": 1.4231691528064916, + "learning_rate": 4.3657293885464335e-06, + "loss": 0.3211, + "step": 19110 + }, + { + "epoch": 0.55, + "grad_norm": 1.2133200582261412, + "learning_rate": 4.3652634782825814e-06, + "loss": 0.2846, + "step": 19111 + }, + { + "epoch": 0.55, + "grad_norm": 1.3597337069902995, + "learning_rate": 4.364797573620285e-06, + "loss": 0.308, + "step": 19112 + }, + { + "epoch": 0.55, + "grad_norm": 1.286531133501408, + "learning_rate": 4.3643316745636574e-06, + "loss": 0.295, + "step": 19113 + }, + { + "epoch": 0.55, + "grad_norm": 1.3111939319004393, + "learning_rate": 4.363865781116809e-06, + "loss": 0.2977, + "step": 19114 + }, + { + "epoch": 0.55, + "grad_norm": 1.170534481384878, + "learning_rate": 4.363399893283854e-06, + "loss": 0.2753, + "step": 19115 + }, + { + "epoch": 0.55, + "grad_norm": 1.2135234906407912, + "learning_rate": 4.3629340110689e-06, + "loss": 0.2884, + "step": 19116 + }, + { + "epoch": 0.55, + "grad_norm": 1.2657812662104089, + "learning_rate": 4.362468134476061e-06, + "loss": 0.3367, + "step": 19117 + }, + { + "epoch": 0.55, + "grad_norm": 1.4004487526757838, + "learning_rate": 4.362002263509448e-06, + "loss": 0.3008, + "step": 19118 + }, + { + "epoch": 0.55, + "grad_norm": 1.2158638181252057, + "learning_rate": 4.361536398173175e-06, + "loss": 0.2873, + "step": 19119 + }, + { + "epoch": 0.55, + "grad_norm": 1.3192274021375516, + "learning_rate": 4.361070538471347e-06, + "loss": 0.2823, + "step": 19120 + }, + { + "epoch": 0.55, + "grad_norm": 1.2872624512817508, + "learning_rate": 4.36060468440808e-06, + "loss": 0.2981, + "step": 19121 + }, + { + "epoch": 0.55, + "grad_norm": 1.2412222206211734, + "learning_rate": 4.360138835987483e-06, + "loss": 0.3038, + "step": 19122 + }, + { + "epoch": 0.55, + "grad_norm": 1.3025672202073946, + "learning_rate": 4.359672993213667e-06, + "loss": 0.2832, + "step": 19123 + }, + { + "epoch": 0.55, + "grad_norm": 1.6500117761002182, + "learning_rate": 4.359207156090744e-06, + "loss": 0.326, + "step": 19124 + }, + { + "epoch": 0.55, + "grad_norm": 1.7536413928786088, + "learning_rate": 4.358741324622827e-06, + "loss": 0.3006, + "step": 19125 + }, + { + "epoch": 0.55, + "grad_norm": 1.4586742179658905, + "learning_rate": 4.358275498814023e-06, + "loss": 0.3104, + "step": 19126 + }, + { + "epoch": 0.55, + "grad_norm": 1.349300023602422, + "learning_rate": 4.357809678668446e-06, + "loss": 0.2834, + "step": 19127 + }, + { + "epoch": 0.55, + "grad_norm": 1.3872340216698051, + "learning_rate": 4.357343864190204e-06, + "loss": 0.3195, + "step": 19128 + }, + { + "epoch": 0.55, + "grad_norm": 1.492245794499011, + "learning_rate": 4.3568780553834114e-06, + "loss": 0.2896, + "step": 19129 + }, + { + "epoch": 0.55, + "grad_norm": 1.216497321987597, + "learning_rate": 4.356412252252176e-06, + "loss": 0.2875, + "step": 19130 + }, + { + "epoch": 0.55, + "grad_norm": 1.2409030531486238, + "learning_rate": 4.355946454800611e-06, + "loss": 0.2838, + "step": 19131 + }, + { + "epoch": 0.55, + "grad_norm": 1.4258876412752315, + "learning_rate": 4.355480663032824e-06, + "loss": 0.3534, + "step": 19132 + }, + { + "epoch": 0.55, + "grad_norm": 1.1839569668159744, + "learning_rate": 4.355014876952927e-06, + "loss": 0.3016, + "step": 19133 + }, + { + "epoch": 0.55, + "grad_norm": 1.2610046455582755, + "learning_rate": 4.3545490965650334e-06, + "loss": 0.3025, + "step": 19134 + }, + { + "epoch": 0.56, + "grad_norm": 1.2455715166867938, + "learning_rate": 4.354083321873249e-06, + "loss": 0.3071, + "step": 19135 + }, + { + "epoch": 0.56, + "grad_norm": 1.2820007671118223, + "learning_rate": 4.353617552881687e-06, + "loss": 0.2666, + "step": 19136 + }, + { + "epoch": 0.56, + "grad_norm": 1.3297831112165763, + "learning_rate": 4.3531517895944575e-06, + "loss": 0.2962, + "step": 19137 + }, + { + "epoch": 0.56, + "grad_norm": 1.2025816957226054, + "learning_rate": 4.352686032015671e-06, + "loss": 0.2871, + "step": 19138 + }, + { + "epoch": 0.56, + "grad_norm": 1.2262716201961827, + "learning_rate": 4.352220280149436e-06, + "loss": 0.294, + "step": 19139 + }, + { + "epoch": 0.56, + "grad_norm": 1.1424223418478126, + "learning_rate": 4.351754533999865e-06, + "loss": 0.2893, + "step": 19140 + }, + { + "epoch": 0.56, + "grad_norm": 1.268078000727318, + "learning_rate": 4.3512887935710676e-06, + "loss": 0.2824, + "step": 19141 + }, + { + "epoch": 0.56, + "grad_norm": 1.2279234012349116, + "learning_rate": 4.350823058867155e-06, + "loss": 0.3004, + "step": 19142 + }, + { + "epoch": 0.56, + "grad_norm": 1.2625507911183516, + "learning_rate": 4.3503573298922344e-06, + "loss": 0.2954, + "step": 19143 + }, + { + "epoch": 0.56, + "grad_norm": 1.2499776947933643, + "learning_rate": 4.349891606650418e-06, + "loss": 0.3018, + "step": 19144 + }, + { + "epoch": 0.56, + "grad_norm": 1.3227806770684762, + "learning_rate": 4.3494258891458155e-06, + "loss": 0.3088, + "step": 19145 + }, + { + "epoch": 0.56, + "grad_norm": 1.252023061169692, + "learning_rate": 4.348960177382536e-06, + "loss": 0.3051, + "step": 19146 + }, + { + "epoch": 0.56, + "grad_norm": 1.4206631122187596, + "learning_rate": 4.348494471364692e-06, + "loss": 0.3134, + "step": 19147 + }, + { + "epoch": 0.56, + "grad_norm": 0.9224512292168835, + "learning_rate": 4.348028771096391e-06, + "loss": 0.5517, + "step": 19148 + }, + { + "epoch": 0.56, + "grad_norm": 1.3570587226036934, + "learning_rate": 4.347563076581743e-06, + "loss": 0.2817, + "step": 19149 + }, + { + "epoch": 0.56, + "grad_norm": 1.3526129609397322, + "learning_rate": 4.347097387824858e-06, + "loss": 0.3113, + "step": 19150 + }, + { + "epoch": 0.56, + "grad_norm": 1.2682423899449373, + "learning_rate": 4.346631704829845e-06, + "loss": 0.2933, + "step": 19151 + }, + { + "epoch": 0.56, + "grad_norm": 1.6443279860402353, + "learning_rate": 4.3461660276008146e-06, + "loss": 0.3023, + "step": 19152 + }, + { + "epoch": 0.56, + "grad_norm": 1.231092795602596, + "learning_rate": 4.345700356141878e-06, + "loss": 0.3055, + "step": 19153 + }, + { + "epoch": 0.56, + "grad_norm": 1.4894757496119264, + "learning_rate": 4.345234690457141e-06, + "loss": 0.3131, + "step": 19154 + }, + { + "epoch": 0.56, + "grad_norm": 1.248741603218482, + "learning_rate": 4.344769030550715e-06, + "loss": 0.2957, + "step": 19155 + }, + { + "epoch": 0.56, + "grad_norm": 1.3041951524350202, + "learning_rate": 4.34430337642671e-06, + "loss": 0.3096, + "step": 19156 + }, + { + "epoch": 0.56, + "grad_norm": 1.6857848467444925, + "learning_rate": 4.343837728089237e-06, + "loss": 0.3178, + "step": 19157 + }, + { + "epoch": 0.56, + "grad_norm": 1.3159734000520835, + "learning_rate": 4.343372085542401e-06, + "loss": 0.3077, + "step": 19158 + }, + { + "epoch": 0.56, + "grad_norm": 1.3448708908179758, + "learning_rate": 4.342906448790315e-06, + "loss": 0.3131, + "step": 19159 + }, + { + "epoch": 0.56, + "grad_norm": 0.9150190851177717, + "learning_rate": 4.342440817837086e-06, + "loss": 0.5773, + "step": 19160 + }, + { + "epoch": 0.56, + "grad_norm": 1.2140558063417446, + "learning_rate": 4.341975192686824e-06, + "loss": 0.3038, + "step": 19161 + }, + { + "epoch": 0.56, + "grad_norm": 1.2123506223033984, + "learning_rate": 4.3415095733436385e-06, + "loss": 0.2978, + "step": 19162 + }, + { + "epoch": 0.56, + "grad_norm": 1.2667377950612215, + "learning_rate": 4.341043959811638e-06, + "loss": 0.3118, + "step": 19163 + }, + { + "epoch": 0.56, + "grad_norm": 2.4412946072592234, + "learning_rate": 4.340578352094932e-06, + "loss": 0.3175, + "step": 19164 + }, + { + "epoch": 0.56, + "grad_norm": 1.2912582360398264, + "learning_rate": 4.340112750197629e-06, + "loss": 0.3015, + "step": 19165 + }, + { + "epoch": 0.56, + "grad_norm": 1.1476146195304697, + "learning_rate": 4.33964715412384e-06, + "loss": 0.2853, + "step": 19166 + }, + { + "epoch": 0.56, + "grad_norm": 1.3410592830895907, + "learning_rate": 4.339181563877671e-06, + "loss": 0.3183, + "step": 19167 + }, + { + "epoch": 0.56, + "grad_norm": 1.2203170117952247, + "learning_rate": 4.338715979463231e-06, + "loss": 0.2871, + "step": 19168 + }, + { + "epoch": 0.56, + "grad_norm": 1.2560495199846384, + "learning_rate": 4.338250400884631e-06, + "loss": 0.2813, + "step": 19169 + }, + { + "epoch": 0.56, + "grad_norm": 1.2026280387576815, + "learning_rate": 4.337784828145979e-06, + "loss": 0.2999, + "step": 19170 + }, + { + "epoch": 0.56, + "grad_norm": 1.189905523270696, + "learning_rate": 4.337319261251383e-06, + "loss": 0.3145, + "step": 19171 + }, + { + "epoch": 0.56, + "grad_norm": 2.1042150452082913, + "learning_rate": 4.336853700204951e-06, + "loss": 0.2908, + "step": 19172 + }, + { + "epoch": 0.56, + "grad_norm": 1.3276778348871892, + "learning_rate": 4.336388145010792e-06, + "loss": 0.2895, + "step": 19173 + }, + { + "epoch": 0.56, + "grad_norm": 1.4528037268745198, + "learning_rate": 4.335922595673018e-06, + "loss": 0.2893, + "step": 19174 + }, + { + "epoch": 0.56, + "grad_norm": 1.1856907070855098, + "learning_rate": 4.335457052195731e-06, + "loss": 0.2858, + "step": 19175 + }, + { + "epoch": 0.56, + "grad_norm": 1.6298426547456009, + "learning_rate": 4.334991514583047e-06, + "loss": 0.3276, + "step": 19176 + }, + { + "epoch": 0.56, + "grad_norm": 1.4365148740474245, + "learning_rate": 4.334525982839068e-06, + "loss": 0.3116, + "step": 19177 + }, + { + "epoch": 0.56, + "grad_norm": 1.4432943651835508, + "learning_rate": 4.334060456967903e-06, + "loss": 0.2934, + "step": 19178 + }, + { + "epoch": 0.56, + "grad_norm": 1.3587397617214774, + "learning_rate": 4.333594936973663e-06, + "loss": 0.3089, + "step": 19179 + }, + { + "epoch": 0.56, + "grad_norm": 1.422482485396901, + "learning_rate": 4.333129422860456e-06, + "loss": 0.2969, + "step": 19180 + }, + { + "epoch": 0.56, + "grad_norm": 1.3040064946221623, + "learning_rate": 4.332663914632388e-06, + "loss": 0.3119, + "step": 19181 + }, + { + "epoch": 0.56, + "grad_norm": 1.636285484043392, + "learning_rate": 4.3321984122935684e-06, + "loss": 0.2863, + "step": 19182 + }, + { + "epoch": 0.56, + "grad_norm": 1.3104151537479976, + "learning_rate": 4.331732915848105e-06, + "loss": 0.3085, + "step": 19183 + }, + { + "epoch": 0.56, + "grad_norm": 1.4089457434211166, + "learning_rate": 4.3312674253001066e-06, + "loss": 0.3019, + "step": 19184 + }, + { + "epoch": 0.56, + "grad_norm": 1.2806782382529756, + "learning_rate": 4.330801940653681e-06, + "loss": 0.2928, + "step": 19185 + }, + { + "epoch": 0.56, + "grad_norm": 1.3685820374259707, + "learning_rate": 4.330336461912935e-06, + "loss": 0.2941, + "step": 19186 + }, + { + "epoch": 0.56, + "grad_norm": 1.3251515585783513, + "learning_rate": 4.329870989081977e-06, + "loss": 0.2913, + "step": 19187 + }, + { + "epoch": 0.56, + "grad_norm": 1.977366147715757, + "learning_rate": 4.3294055221649155e-06, + "loss": 0.2874, + "step": 19188 + }, + { + "epoch": 0.56, + "grad_norm": 1.2380277690703143, + "learning_rate": 4.328940061165858e-06, + "loss": 0.3261, + "step": 19189 + }, + { + "epoch": 0.56, + "grad_norm": 1.4325355611588153, + "learning_rate": 4.328474606088912e-06, + "loss": 0.29, + "step": 19190 + }, + { + "epoch": 0.56, + "grad_norm": 1.4694829013397952, + "learning_rate": 4.3280091569381835e-06, + "loss": 0.2952, + "step": 19191 + }, + { + "epoch": 0.56, + "grad_norm": 1.3162421398928388, + "learning_rate": 4.327543713717783e-06, + "loss": 0.2928, + "step": 19192 + }, + { + "epoch": 0.56, + "grad_norm": 1.2295983518040476, + "learning_rate": 4.3270782764318166e-06, + "loss": 0.275, + "step": 19193 + }, + { + "epoch": 0.56, + "grad_norm": 2.168997011290336, + "learning_rate": 4.326612845084392e-06, + "loss": 0.3208, + "step": 19194 + }, + { + "epoch": 0.56, + "grad_norm": 1.4322289377060655, + "learning_rate": 4.326147419679615e-06, + "loss": 0.2918, + "step": 19195 + }, + { + "epoch": 0.56, + "grad_norm": 1.2357102788599674, + "learning_rate": 4.3256820002215955e-06, + "loss": 0.2909, + "step": 19196 + }, + { + "epoch": 0.56, + "grad_norm": 1.4579254132967685, + "learning_rate": 4.3252165867144405e-06, + "loss": 0.2967, + "step": 19197 + }, + { + "epoch": 0.56, + "grad_norm": 1.3995907984263602, + "learning_rate": 4.324751179162256e-06, + "loss": 0.301, + "step": 19198 + }, + { + "epoch": 0.56, + "grad_norm": 1.7955298695125872, + "learning_rate": 4.324285777569149e-06, + "loss": 0.3251, + "step": 19199 + }, + { + "epoch": 0.56, + "grad_norm": 1.2513570399200429, + "learning_rate": 4.323820381939229e-06, + "loss": 0.3132, + "step": 19200 + }, + { + "epoch": 0.56, + "grad_norm": 1.2964913831465195, + "learning_rate": 4.3233549922766005e-06, + "loss": 0.313, + "step": 19201 + }, + { + "epoch": 0.56, + "grad_norm": 1.3384913736512931, + "learning_rate": 4.322889608585374e-06, + "loss": 0.2947, + "step": 19202 + }, + { + "epoch": 0.56, + "grad_norm": 1.2431578173353584, + "learning_rate": 4.322424230869652e-06, + "loss": 0.302, + "step": 19203 + }, + { + "epoch": 0.56, + "grad_norm": 2.400851486333448, + "learning_rate": 4.321958859133545e-06, + "loss": 0.2918, + "step": 19204 + }, + { + "epoch": 0.56, + "grad_norm": 1.1948291390026724, + "learning_rate": 4.321493493381157e-06, + "loss": 0.3072, + "step": 19205 + }, + { + "epoch": 0.56, + "grad_norm": 1.3736599707108401, + "learning_rate": 4.321028133616598e-06, + "loss": 0.3007, + "step": 19206 + }, + { + "epoch": 0.56, + "grad_norm": 1.4808698819883857, + "learning_rate": 4.320562779843972e-06, + "loss": 0.2781, + "step": 19207 + }, + { + "epoch": 0.56, + "grad_norm": 1.7538560337022775, + "learning_rate": 4.320097432067389e-06, + "loss": 0.2888, + "step": 19208 + }, + { + "epoch": 0.56, + "grad_norm": 1.311121123954803, + "learning_rate": 4.31963209029095e-06, + "loss": 0.2762, + "step": 19209 + }, + { + "epoch": 0.56, + "grad_norm": 1.2759726619163545, + "learning_rate": 4.319166754518768e-06, + "loss": 0.319, + "step": 19210 + }, + { + "epoch": 0.56, + "grad_norm": 1.184782485282998, + "learning_rate": 4.318701424754946e-06, + "loss": 0.3082, + "step": 19211 + }, + { + "epoch": 0.56, + "grad_norm": 1.4906168149041796, + "learning_rate": 4.318236101003593e-06, + "loss": 0.3293, + "step": 19212 + }, + { + "epoch": 0.56, + "grad_norm": 1.2655790798664754, + "learning_rate": 4.317770783268813e-06, + "loss": 0.2945, + "step": 19213 + }, + { + "epoch": 0.56, + "grad_norm": 1.2866063918304889, + "learning_rate": 4.317305471554713e-06, + "loss": 0.3105, + "step": 19214 + }, + { + "epoch": 0.56, + "grad_norm": 1.195747829554051, + "learning_rate": 4.316840165865399e-06, + "loss": 0.2722, + "step": 19215 + }, + { + "epoch": 0.56, + "grad_norm": 1.4524440752109928, + "learning_rate": 4.31637486620498e-06, + "loss": 0.3092, + "step": 19216 + }, + { + "epoch": 0.56, + "grad_norm": 1.3023272278283233, + "learning_rate": 4.31590957257756e-06, + "loss": 0.3207, + "step": 19217 + }, + { + "epoch": 0.56, + "grad_norm": 1.4631594082866064, + "learning_rate": 4.315444284987244e-06, + "loss": 0.3103, + "step": 19218 + }, + { + "epoch": 0.56, + "grad_norm": 1.2667057387973588, + "learning_rate": 4.314979003438141e-06, + "loss": 0.3243, + "step": 19219 + }, + { + "epoch": 0.56, + "grad_norm": 1.3149124931306162, + "learning_rate": 4.3145137279343556e-06, + "loss": 0.3015, + "step": 19220 + }, + { + "epoch": 0.56, + "grad_norm": 1.2582635179170554, + "learning_rate": 4.314048458479995e-06, + "loss": 0.2985, + "step": 19221 + }, + { + "epoch": 0.56, + "grad_norm": 2.9461196269575285, + "learning_rate": 4.313583195079164e-06, + "loss": 0.3511, + "step": 19222 + }, + { + "epoch": 0.56, + "grad_norm": 2.1803225362524117, + "learning_rate": 4.313117937735969e-06, + "loss": 0.3061, + "step": 19223 + }, + { + "epoch": 0.56, + "grad_norm": 1.3501089153078671, + "learning_rate": 4.312652686454515e-06, + "loss": 0.3115, + "step": 19224 + }, + { + "epoch": 0.56, + "grad_norm": 0.9431672523045571, + "learning_rate": 4.3121874412389095e-06, + "loss": 0.5491, + "step": 19225 + }, + { + "epoch": 0.56, + "grad_norm": 1.3058122148186342, + "learning_rate": 4.311722202093257e-06, + "loss": 0.2908, + "step": 19226 + }, + { + "epoch": 0.56, + "grad_norm": 1.3938647398644182, + "learning_rate": 4.3112569690216635e-06, + "loss": 0.3074, + "step": 19227 + }, + { + "epoch": 0.56, + "grad_norm": 1.2223975519311574, + "learning_rate": 4.310791742028234e-06, + "loss": 0.2977, + "step": 19228 + }, + { + "epoch": 0.56, + "grad_norm": 1.3001875202434712, + "learning_rate": 4.310326521117078e-06, + "loss": 0.3302, + "step": 19229 + }, + { + "epoch": 0.56, + "grad_norm": 1.333430880643345, + "learning_rate": 4.309861306292296e-06, + "loss": 0.2895, + "step": 19230 + }, + { + "epoch": 0.56, + "grad_norm": 1.3909408670398973, + "learning_rate": 4.309396097557996e-06, + "loss": 0.3206, + "step": 19231 + }, + { + "epoch": 0.56, + "grad_norm": 1.4387268198195977, + "learning_rate": 4.308930894918285e-06, + "loss": 0.3372, + "step": 19232 + }, + { + "epoch": 0.56, + "grad_norm": 2.230596494557063, + "learning_rate": 4.3084656983772644e-06, + "loss": 0.3009, + "step": 19233 + }, + { + "epoch": 0.56, + "grad_norm": 1.850475432685325, + "learning_rate": 4.308000507939041e-06, + "loss": 0.3151, + "step": 19234 + }, + { + "epoch": 0.56, + "grad_norm": 1.436239156634683, + "learning_rate": 4.307535323607723e-06, + "loss": 0.3085, + "step": 19235 + }, + { + "epoch": 0.56, + "grad_norm": 2.9667998150637973, + "learning_rate": 4.307070145387412e-06, + "loss": 0.3131, + "step": 19236 + }, + { + "epoch": 0.56, + "grad_norm": 1.5709012691886488, + "learning_rate": 4.306604973282214e-06, + "loss": 0.3237, + "step": 19237 + }, + { + "epoch": 0.56, + "grad_norm": 1.433832525979478, + "learning_rate": 4.306139807296236e-06, + "loss": 0.2917, + "step": 19238 + }, + { + "epoch": 0.56, + "grad_norm": 1.275784272909622, + "learning_rate": 4.305674647433582e-06, + "loss": 0.2917, + "step": 19239 + }, + { + "epoch": 0.56, + "grad_norm": 1.045762643421143, + "learning_rate": 4.305209493698356e-06, + "loss": 0.6346, + "step": 19240 + }, + { + "epoch": 0.56, + "grad_norm": 1.243831571519463, + "learning_rate": 4.304744346094664e-06, + "loss": 0.2987, + "step": 19241 + }, + { + "epoch": 0.56, + "grad_norm": 1.5434497151865174, + "learning_rate": 4.30427920462661e-06, + "loss": 0.3328, + "step": 19242 + }, + { + "epoch": 0.56, + "grad_norm": 1.3623978061010638, + "learning_rate": 4.303814069298302e-06, + "loss": 0.3068, + "step": 19243 + }, + { + "epoch": 0.56, + "grad_norm": 1.255893688409705, + "learning_rate": 4.303348940113842e-06, + "loss": 0.2926, + "step": 19244 + }, + { + "epoch": 0.56, + "grad_norm": 1.3712588906879852, + "learning_rate": 4.302883817077335e-06, + "loss": 0.2813, + "step": 19245 + }, + { + "epoch": 0.56, + "grad_norm": 1.2760349017641877, + "learning_rate": 4.302418700192885e-06, + "loss": 0.2929, + "step": 19246 + }, + { + "epoch": 0.56, + "grad_norm": 1.3045662600014947, + "learning_rate": 4.301953589464598e-06, + "loss": 0.2959, + "step": 19247 + }, + { + "epoch": 0.56, + "grad_norm": 2.7656929736588127, + "learning_rate": 4.30148848489658e-06, + "loss": 0.3199, + "step": 19248 + }, + { + "epoch": 0.56, + "grad_norm": 1.3899013686090178, + "learning_rate": 4.301023386492933e-06, + "loss": 0.3169, + "step": 19249 + }, + { + "epoch": 0.56, + "grad_norm": 1.366692699280232, + "learning_rate": 4.300558294257762e-06, + "loss": 0.3023, + "step": 19250 + }, + { + "epoch": 0.56, + "grad_norm": 1.32054549824683, + "learning_rate": 4.300093208195171e-06, + "loss": 0.3087, + "step": 19251 + }, + { + "epoch": 0.56, + "grad_norm": 1.4479284252659592, + "learning_rate": 4.299628128309267e-06, + "loss": 0.3125, + "step": 19252 + }, + { + "epoch": 0.56, + "grad_norm": 1.2321630470837879, + "learning_rate": 4.299163054604152e-06, + "loss": 0.2924, + "step": 19253 + }, + { + "epoch": 0.56, + "grad_norm": 1.414176697353369, + "learning_rate": 4.2986979870839296e-06, + "loss": 0.3476, + "step": 19254 + }, + { + "epoch": 0.56, + "grad_norm": 1.34687842188314, + "learning_rate": 4.298232925752706e-06, + "loss": 0.2982, + "step": 19255 + }, + { + "epoch": 0.56, + "grad_norm": 1.2939833163023124, + "learning_rate": 4.2977678706145845e-06, + "loss": 0.2956, + "step": 19256 + }, + { + "epoch": 0.56, + "grad_norm": 1.2375375534846564, + "learning_rate": 4.297302821673671e-06, + "loss": 0.3175, + "step": 19257 + }, + { + "epoch": 0.56, + "grad_norm": 1.38765005396352, + "learning_rate": 4.296837778934066e-06, + "loss": 0.3155, + "step": 19258 + }, + { + "epoch": 0.56, + "grad_norm": 1.360714113931477, + "learning_rate": 4.2963727423998754e-06, + "loss": 0.3106, + "step": 19259 + }, + { + "epoch": 0.56, + "grad_norm": 1.2710889853814467, + "learning_rate": 4.295907712075205e-06, + "loss": 0.3169, + "step": 19260 + }, + { + "epoch": 0.56, + "grad_norm": 1.4921792294873844, + "learning_rate": 4.2954426879641555e-06, + "loss": 0.2802, + "step": 19261 + }, + { + "epoch": 0.56, + "grad_norm": 1.3468665099971038, + "learning_rate": 4.294977670070832e-06, + "loss": 0.29, + "step": 19262 + }, + { + "epoch": 0.56, + "grad_norm": 1.3401822562752779, + "learning_rate": 4.29451265839934e-06, + "loss": 0.2869, + "step": 19263 + }, + { + "epoch": 0.56, + "grad_norm": 1.2830530700297174, + "learning_rate": 4.29404765295378e-06, + "loss": 0.3127, + "step": 19264 + }, + { + "epoch": 0.56, + "grad_norm": 1.3331600871041303, + "learning_rate": 4.2935826537382565e-06, + "loss": 0.2954, + "step": 19265 + }, + { + "epoch": 0.56, + "grad_norm": 1.3016330801745752, + "learning_rate": 4.2931176607568745e-06, + "loss": 0.3233, + "step": 19266 + }, + { + "epoch": 0.56, + "grad_norm": 4.584956188221566, + "learning_rate": 4.292652674013738e-06, + "loss": 0.3124, + "step": 19267 + }, + { + "epoch": 0.56, + "grad_norm": 1.219606659033393, + "learning_rate": 4.292187693512948e-06, + "loss": 0.3086, + "step": 19268 + }, + { + "epoch": 0.56, + "grad_norm": 1.3263120369569372, + "learning_rate": 4.291722719258611e-06, + "loss": 0.3146, + "step": 19269 + }, + { + "epoch": 0.56, + "grad_norm": 1.4801665825109969, + "learning_rate": 4.291257751254828e-06, + "loss": 0.306, + "step": 19270 + }, + { + "epoch": 0.56, + "grad_norm": 1.338860059064285, + "learning_rate": 4.290792789505704e-06, + "loss": 0.3113, + "step": 19271 + }, + { + "epoch": 0.56, + "grad_norm": 1.2789399304904758, + "learning_rate": 4.29032783401534e-06, + "loss": 0.3187, + "step": 19272 + }, + { + "epoch": 0.56, + "grad_norm": 2.1159109844136577, + "learning_rate": 4.289862884787841e-06, + "loss": 0.3061, + "step": 19273 + }, + { + "epoch": 0.56, + "grad_norm": 1.2861631321124318, + "learning_rate": 4.289397941827311e-06, + "loss": 0.3196, + "step": 19274 + }, + { + "epoch": 0.56, + "grad_norm": 1.4927791235772874, + "learning_rate": 4.288933005137852e-06, + "loss": 0.3174, + "step": 19275 + }, + { + "epoch": 0.56, + "grad_norm": 1.5044037707703453, + "learning_rate": 4.288468074723567e-06, + "loss": 0.3309, + "step": 19276 + }, + { + "epoch": 0.56, + "grad_norm": 1.5831706662407763, + "learning_rate": 4.2880031505885575e-06, + "loss": 0.3019, + "step": 19277 + }, + { + "epoch": 0.56, + "grad_norm": 1.3534682248921794, + "learning_rate": 4.2875382327369295e-06, + "loss": 0.2918, + "step": 19278 + }, + { + "epoch": 0.56, + "grad_norm": 1.4482366333821335, + "learning_rate": 4.287073321172784e-06, + "loss": 0.2975, + "step": 19279 + }, + { + "epoch": 0.56, + "grad_norm": 1.3210909698544773, + "learning_rate": 4.286608415900225e-06, + "loss": 0.2894, + "step": 19280 + }, + { + "epoch": 0.56, + "grad_norm": 0.9517363184127702, + "learning_rate": 4.286143516923354e-06, + "loss": 0.5723, + "step": 19281 + }, + { + "epoch": 0.56, + "grad_norm": 1.3737208032541284, + "learning_rate": 4.285678624246275e-06, + "loss": 0.3158, + "step": 19282 + }, + { + "epoch": 0.56, + "grad_norm": 1.362404852049993, + "learning_rate": 4.28521373787309e-06, + "loss": 0.2985, + "step": 19283 + }, + { + "epoch": 0.56, + "grad_norm": 1.177570337471973, + "learning_rate": 4.284748857807902e-06, + "loss": 0.3023, + "step": 19284 + }, + { + "epoch": 0.56, + "grad_norm": 1.4450348944778304, + "learning_rate": 4.2842839840548125e-06, + "loss": 0.2929, + "step": 19285 + }, + { + "epoch": 0.56, + "grad_norm": 1.3030431098808448, + "learning_rate": 4.283819116617925e-06, + "loss": 0.2775, + "step": 19286 + }, + { + "epoch": 0.56, + "grad_norm": 1.288662663213297, + "learning_rate": 4.283354255501341e-06, + "loss": 0.2996, + "step": 19287 + }, + { + "epoch": 0.56, + "grad_norm": 1.2199519678249584, + "learning_rate": 4.282889400709166e-06, + "loss": 0.3028, + "step": 19288 + }, + { + "epoch": 0.56, + "grad_norm": 1.2198181100653718, + "learning_rate": 4.282424552245499e-06, + "loss": 0.2772, + "step": 19289 + }, + { + "epoch": 0.56, + "grad_norm": 1.2938273560276008, + "learning_rate": 4.281959710114443e-06, + "loss": 0.2953, + "step": 19290 + }, + { + "epoch": 0.56, + "grad_norm": 1.4632941004514513, + "learning_rate": 4.2814948743201e-06, + "loss": 0.2909, + "step": 19291 + }, + { + "epoch": 0.56, + "grad_norm": 1.2607520472083826, + "learning_rate": 4.2810300448665735e-06, + "loss": 0.2911, + "step": 19292 + }, + { + "epoch": 0.56, + "grad_norm": 1.4540831865098494, + "learning_rate": 4.280565221757963e-06, + "loss": 0.3084, + "step": 19293 + }, + { + "epoch": 0.56, + "grad_norm": 1.8212825338325827, + "learning_rate": 4.280100404998375e-06, + "loss": 0.3038, + "step": 19294 + }, + { + "epoch": 0.56, + "grad_norm": 1.245159525132898, + "learning_rate": 4.279635594591907e-06, + "loss": 0.3041, + "step": 19295 + }, + { + "epoch": 0.56, + "grad_norm": 1.2499568120113833, + "learning_rate": 4.279170790542663e-06, + "loss": 0.3001, + "step": 19296 + }, + { + "epoch": 0.56, + "grad_norm": 1.3731409515696165, + "learning_rate": 4.278705992854745e-06, + "loss": 0.3101, + "step": 19297 + }, + { + "epoch": 0.56, + "grad_norm": 1.251612633081372, + "learning_rate": 4.278241201532255e-06, + "loss": 0.3018, + "step": 19298 + }, + { + "epoch": 0.56, + "grad_norm": 1.265767570183629, + "learning_rate": 4.277776416579295e-06, + "loss": 0.2888, + "step": 19299 + }, + { + "epoch": 0.56, + "grad_norm": 1.333561725031594, + "learning_rate": 4.277311637999965e-06, + "loss": 0.3106, + "step": 19300 + }, + { + "epoch": 0.56, + "grad_norm": 1.278293584606856, + "learning_rate": 4.276846865798368e-06, + "loss": 0.3121, + "step": 19301 + }, + { + "epoch": 0.56, + "grad_norm": 2.1983030589206285, + "learning_rate": 4.276382099978606e-06, + "loss": 0.3013, + "step": 19302 + }, + { + "epoch": 0.56, + "grad_norm": 1.3508012222445434, + "learning_rate": 4.275917340544779e-06, + "loss": 0.3131, + "step": 19303 + }, + { + "epoch": 0.56, + "grad_norm": 1.3613459817628315, + "learning_rate": 4.27545258750099e-06, + "loss": 0.2872, + "step": 19304 + }, + { + "epoch": 0.56, + "grad_norm": 1.2944623183926582, + "learning_rate": 4.274987840851341e-06, + "loss": 0.2803, + "step": 19305 + }, + { + "epoch": 0.56, + "grad_norm": 1.03763677479079, + "learning_rate": 4.27452310059993e-06, + "loss": 0.5938, + "step": 19306 + }, + { + "epoch": 0.56, + "grad_norm": 1.4406507415092378, + "learning_rate": 4.274058366750864e-06, + "loss": 0.2844, + "step": 19307 + }, + { + "epoch": 0.56, + "grad_norm": 1.2860552931697178, + "learning_rate": 4.273593639308238e-06, + "loss": 0.3328, + "step": 19308 + }, + { + "epoch": 0.56, + "grad_norm": 2.408163165244875, + "learning_rate": 4.273128918276157e-06, + "loss": 0.3408, + "step": 19309 + }, + { + "epoch": 0.56, + "grad_norm": 1.412960633624044, + "learning_rate": 4.272664203658722e-06, + "loss": 0.2945, + "step": 19310 + }, + { + "epoch": 0.56, + "grad_norm": 1.5171982765881864, + "learning_rate": 4.272199495460034e-06, + "loss": 0.3138, + "step": 19311 + }, + { + "epoch": 0.56, + "grad_norm": 1.4105603940258642, + "learning_rate": 4.271734793684192e-06, + "loss": 0.2862, + "step": 19312 + }, + { + "epoch": 0.56, + "grad_norm": 1.4569727525879397, + "learning_rate": 4.271270098335299e-06, + "loss": 0.3179, + "step": 19313 + }, + { + "epoch": 0.56, + "grad_norm": 1.5712113978728002, + "learning_rate": 4.270805409417455e-06, + "loss": 0.289, + "step": 19314 + }, + { + "epoch": 0.56, + "grad_norm": 1.282004242774577, + "learning_rate": 4.270340726934762e-06, + "loss": 0.2964, + "step": 19315 + }, + { + "epoch": 0.56, + "grad_norm": 1.3856794913883599, + "learning_rate": 4.2698760508913215e-06, + "loss": 0.3011, + "step": 19316 + }, + { + "epoch": 0.56, + "grad_norm": 1.4099426644384125, + "learning_rate": 4.269411381291232e-06, + "loss": 0.2932, + "step": 19317 + }, + { + "epoch": 0.56, + "grad_norm": 1.468942882197448, + "learning_rate": 4.268946718138595e-06, + "loss": 0.3031, + "step": 19318 + }, + { + "epoch": 0.56, + "grad_norm": 2.3999290168762233, + "learning_rate": 4.268482061437511e-06, + "loss": 0.3094, + "step": 19319 + }, + { + "epoch": 0.56, + "grad_norm": 1.2096300885199367, + "learning_rate": 4.268017411192081e-06, + "loss": 0.2988, + "step": 19320 + }, + { + "epoch": 0.56, + "grad_norm": 1.4666829148800857, + "learning_rate": 4.267552767406405e-06, + "loss": 0.3055, + "step": 19321 + }, + { + "epoch": 0.56, + "grad_norm": 1.3104942985786552, + "learning_rate": 4.267088130084585e-06, + "loss": 0.2944, + "step": 19322 + }, + { + "epoch": 0.56, + "grad_norm": 2.0235777879737515, + "learning_rate": 4.26662349923072e-06, + "loss": 0.3089, + "step": 19323 + }, + { + "epoch": 0.56, + "grad_norm": 1.4704806148238005, + "learning_rate": 4.26615887484891e-06, + "loss": 0.3095, + "step": 19324 + }, + { + "epoch": 0.56, + "grad_norm": 1.2480909472864072, + "learning_rate": 4.265694256943257e-06, + "loss": 0.321, + "step": 19325 + }, + { + "epoch": 0.56, + "grad_norm": 1.4299566851642551, + "learning_rate": 4.26522964551786e-06, + "loss": 0.2859, + "step": 19326 + }, + { + "epoch": 0.56, + "grad_norm": 1.3010599100589637, + "learning_rate": 4.264765040576819e-06, + "loss": 0.3054, + "step": 19327 + }, + { + "epoch": 0.56, + "grad_norm": 1.5484974272478353, + "learning_rate": 4.264300442124235e-06, + "loss": 0.2978, + "step": 19328 + }, + { + "epoch": 0.56, + "grad_norm": 1.2538207408768522, + "learning_rate": 4.263835850164208e-06, + "loss": 0.314, + "step": 19329 + }, + { + "epoch": 0.56, + "grad_norm": 1.2701896088443634, + "learning_rate": 4.263371264700838e-06, + "loss": 0.2967, + "step": 19330 + }, + { + "epoch": 0.56, + "grad_norm": 1.4356075956975485, + "learning_rate": 4.262906685738225e-06, + "loss": 0.3189, + "step": 19331 + }, + { + "epoch": 0.56, + "grad_norm": 1.391344270076231, + "learning_rate": 4.262442113280468e-06, + "loss": 0.2803, + "step": 19332 + }, + { + "epoch": 0.56, + "grad_norm": 1.244775240443527, + "learning_rate": 4.261977547331667e-06, + "loss": 0.295, + "step": 19333 + }, + { + "epoch": 0.56, + "grad_norm": 1.2600093094542275, + "learning_rate": 4.261512987895922e-06, + "loss": 0.2868, + "step": 19334 + }, + { + "epoch": 0.56, + "grad_norm": 1.3508107766297475, + "learning_rate": 4.261048434977336e-06, + "loss": 0.2955, + "step": 19335 + }, + { + "epoch": 0.56, + "grad_norm": 1.345216520821566, + "learning_rate": 4.260583888580004e-06, + "loss": 0.2888, + "step": 19336 + }, + { + "epoch": 0.56, + "grad_norm": 1.5681527175411936, + "learning_rate": 4.260119348708026e-06, + "loss": 0.2992, + "step": 19337 + }, + { + "epoch": 0.56, + "grad_norm": 1.5212600397521634, + "learning_rate": 4.259654815365503e-06, + "loss": 0.3151, + "step": 19338 + }, + { + "epoch": 0.56, + "grad_norm": 1.2505256634222661, + "learning_rate": 4.259190288556537e-06, + "loss": 0.3094, + "step": 19339 + }, + { + "epoch": 0.56, + "grad_norm": 1.231207161106332, + "learning_rate": 4.258725768285224e-06, + "loss": 0.3004, + "step": 19340 + }, + { + "epoch": 0.56, + "grad_norm": 1.209724377735798, + "learning_rate": 4.258261254555664e-06, + "loss": 0.2821, + "step": 19341 + }, + { + "epoch": 0.56, + "grad_norm": 1.235363847308409, + "learning_rate": 4.257796747371955e-06, + "loss": 0.2852, + "step": 19342 + }, + { + "epoch": 0.56, + "grad_norm": 1.4560490044071728, + "learning_rate": 4.257332246738201e-06, + "loss": 0.3257, + "step": 19343 + }, + { + "epoch": 0.56, + "grad_norm": 1.1498126166010092, + "learning_rate": 4.2568677526584976e-06, + "loss": 0.2806, + "step": 19344 + }, + { + "epoch": 0.56, + "grad_norm": 1.5787419767303803, + "learning_rate": 4.256403265136944e-06, + "loss": 0.3082, + "step": 19345 + }, + { + "epoch": 0.56, + "grad_norm": 1.172833179579952, + "learning_rate": 4.255938784177639e-06, + "loss": 0.2865, + "step": 19346 + }, + { + "epoch": 0.56, + "grad_norm": 1.284812871506322, + "learning_rate": 4.255474309784683e-06, + "loss": 0.3066, + "step": 19347 + }, + { + "epoch": 0.56, + "grad_norm": 1.6113383905458538, + "learning_rate": 4.255009841962175e-06, + "loss": 0.3009, + "step": 19348 + }, + { + "epoch": 0.56, + "grad_norm": 1.3313007618026, + "learning_rate": 4.254545380714213e-06, + "loss": 0.3024, + "step": 19349 + }, + { + "epoch": 0.56, + "grad_norm": 1.3770232079879046, + "learning_rate": 4.254080926044895e-06, + "loss": 0.3206, + "step": 19350 + }, + { + "epoch": 0.56, + "grad_norm": 1.2941797783804463, + "learning_rate": 4.253616477958322e-06, + "loss": 0.2926, + "step": 19351 + }, + { + "epoch": 0.56, + "grad_norm": 1.2650053248982576, + "learning_rate": 4.253152036458592e-06, + "loss": 0.2886, + "step": 19352 + }, + { + "epoch": 0.56, + "grad_norm": 1.300301851995844, + "learning_rate": 4.252687601549803e-06, + "loss": 0.2941, + "step": 19353 + }, + { + "epoch": 0.56, + "grad_norm": 1.17936289171841, + "learning_rate": 4.252223173236056e-06, + "loss": 0.286, + "step": 19354 + }, + { + "epoch": 0.56, + "grad_norm": 1.3702393530227204, + "learning_rate": 4.251758751521447e-06, + "loss": 0.3385, + "step": 19355 + }, + { + "epoch": 0.56, + "grad_norm": 1.2978725523378503, + "learning_rate": 4.251294336410074e-06, + "loss": 0.2921, + "step": 19356 + }, + { + "epoch": 0.56, + "grad_norm": 2.310321626842916, + "learning_rate": 4.250829927906037e-06, + "loss": 0.3218, + "step": 19357 + }, + { + "epoch": 0.56, + "grad_norm": 1.3527105269473874, + "learning_rate": 4.250365526013436e-06, + "loss": 0.3048, + "step": 19358 + }, + { + "epoch": 0.56, + "grad_norm": 2.595716505456418, + "learning_rate": 4.249901130736367e-06, + "loss": 0.3145, + "step": 19359 + }, + { + "epoch": 0.56, + "grad_norm": 1.2661426235384776, + "learning_rate": 4.249436742078927e-06, + "loss": 0.3089, + "step": 19360 + }, + { + "epoch": 0.56, + "grad_norm": 1.3600677312379426, + "learning_rate": 4.248972360045218e-06, + "loss": 0.309, + "step": 19361 + }, + { + "epoch": 0.56, + "grad_norm": 1.2114875139741843, + "learning_rate": 4.248507984639336e-06, + "loss": 0.2941, + "step": 19362 + }, + { + "epoch": 0.56, + "grad_norm": 1.5424839538598518, + "learning_rate": 4.248043615865379e-06, + "loss": 0.2918, + "step": 19363 + }, + { + "epoch": 0.56, + "grad_norm": 1.63929467523705, + "learning_rate": 4.247579253727444e-06, + "loss": 0.3119, + "step": 19364 + }, + { + "epoch": 0.56, + "grad_norm": 1.349690213509485, + "learning_rate": 4.247114898229632e-06, + "loss": 0.3129, + "step": 19365 + }, + { + "epoch": 0.56, + "grad_norm": 1.323870742649056, + "learning_rate": 4.24665054937604e-06, + "loss": 0.2903, + "step": 19366 + }, + { + "epoch": 0.56, + "grad_norm": 1.4238026219548663, + "learning_rate": 4.246186207170765e-06, + "loss": 0.2855, + "step": 19367 + }, + { + "epoch": 0.56, + "grad_norm": 1.4876565623769307, + "learning_rate": 4.245721871617904e-06, + "loss": 0.335, + "step": 19368 + }, + { + "epoch": 0.56, + "grad_norm": 1.3168017672902068, + "learning_rate": 4.245257542721556e-06, + "loss": 0.2831, + "step": 19369 + }, + { + "epoch": 0.56, + "grad_norm": 1.3134891898440904, + "learning_rate": 4.244793220485819e-06, + "loss": 0.2945, + "step": 19370 + }, + { + "epoch": 0.56, + "grad_norm": 1.284959397481306, + "learning_rate": 4.2443289049147905e-06, + "loss": 0.3205, + "step": 19371 + }, + { + "epoch": 0.56, + "grad_norm": 1.2606767640148546, + "learning_rate": 4.24386459601257e-06, + "loss": 0.2995, + "step": 19372 + }, + { + "epoch": 0.56, + "grad_norm": 1.3430249109697976, + "learning_rate": 4.24340029378325e-06, + "loss": 0.2894, + "step": 19373 + }, + { + "epoch": 0.56, + "grad_norm": 1.342954285888902, + "learning_rate": 4.24293599823093e-06, + "loss": 0.3003, + "step": 19374 + }, + { + "epoch": 0.56, + "grad_norm": 1.5142243271878892, + "learning_rate": 4.24247170935971e-06, + "loss": 0.3057, + "step": 19375 + }, + { + "epoch": 0.56, + "grad_norm": 1.793210525119607, + "learning_rate": 4.242007427173684e-06, + "loss": 0.2949, + "step": 19376 + }, + { + "epoch": 0.56, + "grad_norm": 1.831455610220274, + "learning_rate": 4.2415431516769524e-06, + "loss": 0.3034, + "step": 19377 + }, + { + "epoch": 0.56, + "grad_norm": 1.1832347821317502, + "learning_rate": 4.2410788828736105e-06, + "loss": 0.2914, + "step": 19378 + }, + { + "epoch": 0.56, + "grad_norm": 1.4482982111128002, + "learning_rate": 4.240614620767755e-06, + "loss": 0.2869, + "step": 19379 + }, + { + "epoch": 0.56, + "grad_norm": 1.2505506998805387, + "learning_rate": 4.240150365363485e-06, + "loss": 0.2946, + "step": 19380 + }, + { + "epoch": 0.56, + "grad_norm": 1.2708906516253158, + "learning_rate": 4.239686116664896e-06, + "loss": 0.3114, + "step": 19381 + }, + { + "epoch": 0.56, + "grad_norm": 1.3046257995307413, + "learning_rate": 4.239221874676085e-06, + "loss": 0.3057, + "step": 19382 + }, + { + "epoch": 0.56, + "grad_norm": 1.2966330579652776, + "learning_rate": 4.23875763940115e-06, + "loss": 0.3014, + "step": 19383 + }, + { + "epoch": 0.56, + "grad_norm": 1.9083443184936908, + "learning_rate": 4.238293410844187e-06, + "loss": 0.3071, + "step": 19384 + }, + { + "epoch": 0.56, + "grad_norm": 1.4173785923463045, + "learning_rate": 4.237829189009294e-06, + "loss": 0.3129, + "step": 19385 + }, + { + "epoch": 0.56, + "grad_norm": 1.4380246509480619, + "learning_rate": 4.237364973900566e-06, + "loss": 0.3181, + "step": 19386 + }, + { + "epoch": 0.56, + "grad_norm": 1.321364474487095, + "learning_rate": 4.2369007655221e-06, + "loss": 0.2981, + "step": 19387 + }, + { + "epoch": 0.56, + "grad_norm": 1.2422205735454281, + "learning_rate": 4.236436563877993e-06, + "loss": 0.2843, + "step": 19388 + }, + { + "epoch": 0.56, + "grad_norm": 1.209292077559713, + "learning_rate": 4.235972368972343e-06, + "loss": 0.2813, + "step": 19389 + }, + { + "epoch": 0.56, + "grad_norm": 1.4037774376223409, + "learning_rate": 4.235508180809246e-06, + "loss": 0.3073, + "step": 19390 + }, + { + "epoch": 0.56, + "grad_norm": 1.2893070477614779, + "learning_rate": 4.2350439993927964e-06, + "loss": 0.3058, + "step": 19391 + }, + { + "epoch": 0.56, + "grad_norm": 1.9705219049298117, + "learning_rate": 4.234579824727092e-06, + "loss": 0.2964, + "step": 19392 + }, + { + "epoch": 0.56, + "grad_norm": 1.5886227051499124, + "learning_rate": 4.234115656816229e-06, + "loss": 0.291, + "step": 19393 + }, + { + "epoch": 0.56, + "grad_norm": 1.1457280762496151, + "learning_rate": 4.233651495664305e-06, + "loss": 0.2834, + "step": 19394 + }, + { + "epoch": 0.56, + "grad_norm": 0.9288380645659807, + "learning_rate": 4.2331873412754146e-06, + "loss": 0.6524, + "step": 19395 + }, + { + "epoch": 0.56, + "grad_norm": 1.6050573180129883, + "learning_rate": 4.2327231936536535e-06, + "loss": 0.3032, + "step": 19396 + }, + { + "epoch": 0.56, + "grad_norm": 1.2865474954775589, + "learning_rate": 4.23225905280312e-06, + "loss": 0.293, + "step": 19397 + }, + { + "epoch": 0.56, + "grad_norm": 1.335827121476693, + "learning_rate": 4.231794918727908e-06, + "loss": 0.3275, + "step": 19398 + }, + { + "epoch": 0.56, + "grad_norm": 1.3882872693322086, + "learning_rate": 4.231330791432115e-06, + "loss": 0.302, + "step": 19399 + }, + { + "epoch": 0.56, + "grad_norm": 1.342177900230222, + "learning_rate": 4.230866670919838e-06, + "loss": 0.3164, + "step": 19400 + }, + { + "epoch": 0.56, + "grad_norm": 1.3680839542046233, + "learning_rate": 4.230402557195168e-06, + "loss": 0.283, + "step": 19401 + }, + { + "epoch": 0.56, + "grad_norm": 1.5634668233729785, + "learning_rate": 4.229938450262205e-06, + "loss": 0.2964, + "step": 19402 + }, + { + "epoch": 0.56, + "grad_norm": 1.3145754126313174, + "learning_rate": 4.2294743501250435e-06, + "loss": 0.3027, + "step": 19403 + }, + { + "epoch": 0.56, + "grad_norm": 1.339709121395914, + "learning_rate": 4.229010256787781e-06, + "loss": 0.3073, + "step": 19404 + }, + { + "epoch": 0.56, + "grad_norm": 1.4622083872945546, + "learning_rate": 4.2285461702545095e-06, + "loss": 0.3138, + "step": 19405 + }, + { + "epoch": 0.56, + "grad_norm": 1.3395500466633818, + "learning_rate": 4.228082090529327e-06, + "loss": 0.3113, + "step": 19406 + }, + { + "epoch": 0.56, + "grad_norm": 1.39304915217907, + "learning_rate": 4.227618017616329e-06, + "loss": 0.2882, + "step": 19407 + }, + { + "epoch": 0.56, + "grad_norm": 1.2395753202504791, + "learning_rate": 4.227153951519611e-06, + "loss": 0.2873, + "step": 19408 + }, + { + "epoch": 0.56, + "grad_norm": 1.4834019256999422, + "learning_rate": 4.226689892243268e-06, + "loss": 0.3014, + "step": 19409 + }, + { + "epoch": 0.56, + "grad_norm": 1.320871440621436, + "learning_rate": 4.226225839791394e-06, + "loss": 0.2915, + "step": 19410 + }, + { + "epoch": 0.56, + "grad_norm": 1.4928144584956795, + "learning_rate": 4.2257617941680864e-06, + "loss": 0.3369, + "step": 19411 + }, + { + "epoch": 0.56, + "grad_norm": 1.1747006352074054, + "learning_rate": 4.225297755377439e-06, + "loss": 0.2809, + "step": 19412 + }, + { + "epoch": 0.56, + "grad_norm": 1.4505018135042056, + "learning_rate": 4.224833723423548e-06, + "loss": 0.3026, + "step": 19413 + }, + { + "epoch": 0.56, + "grad_norm": 1.644660699715059, + "learning_rate": 4.2243696983105084e-06, + "loss": 0.3086, + "step": 19414 + }, + { + "epoch": 0.56, + "grad_norm": 1.6651348082708866, + "learning_rate": 4.223905680042414e-06, + "loss": 0.3217, + "step": 19415 + }, + { + "epoch": 0.56, + "grad_norm": 1.4467735793851897, + "learning_rate": 4.22344166862336e-06, + "loss": 0.2958, + "step": 19416 + }, + { + "epoch": 0.56, + "grad_norm": 1.2757427872320322, + "learning_rate": 4.222977664057444e-06, + "loss": 0.3049, + "step": 19417 + }, + { + "epoch": 0.56, + "grad_norm": 1.2106442219842959, + "learning_rate": 4.222513666348757e-06, + "loss": 0.3112, + "step": 19418 + }, + { + "epoch": 0.56, + "grad_norm": 1.580368493799571, + "learning_rate": 4.222049675501396e-06, + "loss": 0.3028, + "step": 19419 + }, + { + "epoch": 0.56, + "grad_norm": 2.9088400310065325, + "learning_rate": 4.221585691519455e-06, + "loss": 0.3012, + "step": 19420 + }, + { + "epoch": 0.56, + "grad_norm": 1.3487859076910562, + "learning_rate": 4.221121714407031e-06, + "loss": 0.2773, + "step": 19421 + }, + { + "epoch": 0.56, + "grad_norm": 1.1454898777457327, + "learning_rate": 4.220657744168214e-06, + "loss": 0.278, + "step": 19422 + }, + { + "epoch": 0.56, + "grad_norm": 2.014366899639717, + "learning_rate": 4.220193780807101e-06, + "loss": 0.277, + "step": 19423 + }, + { + "epoch": 0.56, + "grad_norm": 1.236736011818911, + "learning_rate": 4.2197298243277876e-06, + "loss": 0.3015, + "step": 19424 + }, + { + "epoch": 0.56, + "grad_norm": 1.4865767788270248, + "learning_rate": 4.219265874734367e-06, + "loss": 0.3314, + "step": 19425 + }, + { + "epoch": 0.56, + "grad_norm": 2.2258297702502357, + "learning_rate": 4.218801932030934e-06, + "loss": 0.2985, + "step": 19426 + }, + { + "epoch": 0.56, + "grad_norm": 1.9728644775789648, + "learning_rate": 4.218337996221582e-06, + "loss": 0.3078, + "step": 19427 + }, + { + "epoch": 0.56, + "grad_norm": 1.615429036496485, + "learning_rate": 4.217874067310405e-06, + "loss": 0.3217, + "step": 19428 + }, + { + "epoch": 0.56, + "grad_norm": 1.1816631558242605, + "learning_rate": 4.217410145301502e-06, + "loss": 0.2943, + "step": 19429 + }, + { + "epoch": 0.56, + "grad_norm": 1.434864478912237, + "learning_rate": 4.216946230198959e-06, + "loss": 0.3146, + "step": 19430 + }, + { + "epoch": 0.56, + "grad_norm": 1.898180585591915, + "learning_rate": 4.216482322006876e-06, + "loss": 0.3141, + "step": 19431 + }, + { + "epoch": 0.56, + "grad_norm": 1.3533505537222503, + "learning_rate": 4.216018420729346e-06, + "loss": 0.2962, + "step": 19432 + }, + { + "epoch": 0.56, + "grad_norm": 1.1931135285266457, + "learning_rate": 4.21555452637046e-06, + "loss": 0.2863, + "step": 19433 + }, + { + "epoch": 0.56, + "grad_norm": 1.39099226817138, + "learning_rate": 4.215090638934314e-06, + "loss": 0.2895, + "step": 19434 + }, + { + "epoch": 0.56, + "grad_norm": 0.932725825801551, + "learning_rate": 4.214626758425003e-06, + "loss": 0.581, + "step": 19435 + }, + { + "epoch": 0.56, + "grad_norm": 1.6075925707916021, + "learning_rate": 4.2141628848466195e-06, + "loss": 0.3048, + "step": 19436 + }, + { + "epoch": 0.56, + "grad_norm": 1.478091909444, + "learning_rate": 4.213699018203257e-06, + "loss": 0.3055, + "step": 19437 + }, + { + "epoch": 0.56, + "grad_norm": 1.4718036045203173, + "learning_rate": 4.213235158499009e-06, + "loss": 0.2925, + "step": 19438 + }, + { + "epoch": 0.56, + "grad_norm": 1.4760552135450222, + "learning_rate": 4.212771305737969e-06, + "loss": 0.3019, + "step": 19439 + }, + { + "epoch": 0.56, + "grad_norm": 1.3095822678379696, + "learning_rate": 4.212307459924233e-06, + "loss": 0.2896, + "step": 19440 + }, + { + "epoch": 0.56, + "grad_norm": 1.3642454437444367, + "learning_rate": 4.211843621061891e-06, + "loss": 0.294, + "step": 19441 + }, + { + "epoch": 0.56, + "grad_norm": 1.7511633265893, + "learning_rate": 4.211379789155038e-06, + "loss": 0.2973, + "step": 19442 + }, + { + "epoch": 0.56, + "grad_norm": 2.229927155208036, + "learning_rate": 4.210915964207767e-06, + "loss": 0.3003, + "step": 19443 + }, + { + "epoch": 0.56, + "grad_norm": 1.3982325554192383, + "learning_rate": 4.210452146224172e-06, + "loss": 0.3103, + "step": 19444 + }, + { + "epoch": 0.56, + "grad_norm": 1.6426345036346153, + "learning_rate": 4.209988335208344e-06, + "loss": 0.3085, + "step": 19445 + }, + { + "epoch": 0.56, + "grad_norm": 1.366463135013452, + "learning_rate": 4.209524531164378e-06, + "loss": 0.3149, + "step": 19446 + }, + { + "epoch": 0.56, + "grad_norm": 1.2533370736242708, + "learning_rate": 4.209060734096368e-06, + "loss": 0.2994, + "step": 19447 + }, + { + "epoch": 0.56, + "grad_norm": 1.2478960845854625, + "learning_rate": 4.208596944008405e-06, + "loss": 0.3065, + "step": 19448 + }, + { + "epoch": 0.56, + "grad_norm": 1.2576934950666303, + "learning_rate": 4.208133160904584e-06, + "loss": 0.2749, + "step": 19449 + }, + { + "epoch": 0.56, + "grad_norm": 1.7813281066076658, + "learning_rate": 4.2076693847889945e-06, + "loss": 0.3534, + "step": 19450 + }, + { + "epoch": 0.56, + "grad_norm": 1.3026625271334864, + "learning_rate": 4.207205615665733e-06, + "loss": 0.3041, + "step": 19451 + }, + { + "epoch": 0.56, + "grad_norm": 1.2442819651674124, + "learning_rate": 4.206741853538891e-06, + "loss": 0.2992, + "step": 19452 + }, + { + "epoch": 0.56, + "grad_norm": 1.9566778646584366, + "learning_rate": 4.20627809841256e-06, + "loss": 0.3006, + "step": 19453 + }, + { + "epoch": 0.56, + "grad_norm": 1.3862297171524354, + "learning_rate": 4.205814350290834e-06, + "loss": 0.3091, + "step": 19454 + }, + { + "epoch": 0.56, + "grad_norm": 1.4350878738860993, + "learning_rate": 4.205350609177804e-06, + "loss": 0.278, + "step": 19455 + }, + { + "epoch": 0.56, + "grad_norm": 1.4568123312869568, + "learning_rate": 4.2048868750775655e-06, + "loss": 0.2948, + "step": 19456 + }, + { + "epoch": 0.56, + "grad_norm": 1.4270902106762688, + "learning_rate": 4.20442314799421e-06, + "loss": 0.3084, + "step": 19457 + }, + { + "epoch": 0.56, + "grad_norm": 1.320656598557789, + "learning_rate": 4.203959427931827e-06, + "loss": 0.3036, + "step": 19458 + }, + { + "epoch": 0.56, + "grad_norm": 1.3080153924219418, + "learning_rate": 4.2034957148945125e-06, + "loss": 0.3302, + "step": 19459 + }, + { + "epoch": 0.56, + "grad_norm": 1.384405091756647, + "learning_rate": 4.203032008886355e-06, + "loss": 0.3113, + "step": 19460 + }, + { + "epoch": 0.56, + "grad_norm": 1.3870862425231394, + "learning_rate": 4.20256830991145e-06, + "loss": 0.3017, + "step": 19461 + }, + { + "epoch": 0.56, + "grad_norm": 1.4444654162924415, + "learning_rate": 4.202104617973888e-06, + "loss": 0.3032, + "step": 19462 + }, + { + "epoch": 0.56, + "grad_norm": 1.3976414634641867, + "learning_rate": 4.2016409330777626e-06, + "loss": 0.3045, + "step": 19463 + }, + { + "epoch": 0.56, + "grad_norm": 1.2736993772282381, + "learning_rate": 4.201177255227165e-06, + "loss": 0.2845, + "step": 19464 + }, + { + "epoch": 0.56, + "grad_norm": 1.2072344665478414, + "learning_rate": 4.200713584426185e-06, + "loss": 0.283, + "step": 19465 + }, + { + "epoch": 0.56, + "grad_norm": 1.2465459885722423, + "learning_rate": 4.200249920678918e-06, + "loss": 0.277, + "step": 19466 + }, + { + "epoch": 0.56, + "grad_norm": 5.977032704057781, + "learning_rate": 4.199786263989453e-06, + "loss": 0.2939, + "step": 19467 + }, + { + "epoch": 0.56, + "grad_norm": 1.5306324957717903, + "learning_rate": 4.1993226143618845e-06, + "loss": 0.3239, + "step": 19468 + }, + { + "epoch": 0.56, + "grad_norm": 1.8497537271454139, + "learning_rate": 4.198858971800302e-06, + "loss": 0.2855, + "step": 19469 + }, + { + "epoch": 0.56, + "grad_norm": 1.3686808070977634, + "learning_rate": 4.198395336308799e-06, + "loss": 0.307, + "step": 19470 + }, + { + "epoch": 0.56, + "grad_norm": 1.3846593521736306, + "learning_rate": 4.197931707891465e-06, + "loss": 0.2951, + "step": 19471 + }, + { + "epoch": 0.56, + "grad_norm": 1.2322730847020944, + "learning_rate": 4.197468086552394e-06, + "loss": 0.2909, + "step": 19472 + }, + { + "epoch": 0.56, + "grad_norm": 2.1868753806600902, + "learning_rate": 4.1970044722956746e-06, + "loss": 0.33, + "step": 19473 + }, + { + "epoch": 0.56, + "grad_norm": 1.818780581308467, + "learning_rate": 4.1965408651254e-06, + "loss": 0.2926, + "step": 19474 + }, + { + "epoch": 0.56, + "grad_norm": 1.230180415522597, + "learning_rate": 4.1960772650456605e-06, + "loss": 0.2833, + "step": 19475 + }, + { + "epoch": 0.56, + "grad_norm": 1.3233971587132913, + "learning_rate": 4.19561367206055e-06, + "loss": 0.3036, + "step": 19476 + }, + { + "epoch": 0.56, + "grad_norm": 1.2628006751993013, + "learning_rate": 4.1951500861741556e-06, + "loss": 0.3056, + "step": 19477 + }, + { + "epoch": 0.56, + "grad_norm": 1.385378886278859, + "learning_rate": 4.194686507390572e-06, + "loss": 0.3114, + "step": 19478 + }, + { + "epoch": 0.56, + "grad_norm": 1.7797327613883505, + "learning_rate": 4.194222935713887e-06, + "loss": 0.3294, + "step": 19479 + }, + { + "epoch": 0.57, + "grad_norm": 1.5966877875549061, + "learning_rate": 4.1937593711481965e-06, + "loss": 0.3012, + "step": 19480 + }, + { + "epoch": 0.57, + "grad_norm": 1.5190067682215902, + "learning_rate": 4.193295813697587e-06, + "loss": 0.2802, + "step": 19481 + }, + { + "epoch": 0.57, + "grad_norm": 1.4980653230011458, + "learning_rate": 4.19283226336615e-06, + "loss": 0.3121, + "step": 19482 + }, + { + "epoch": 0.57, + "grad_norm": 1.5643123539068493, + "learning_rate": 4.192368720157978e-06, + "loss": 0.2904, + "step": 19483 + }, + { + "epoch": 0.57, + "grad_norm": 1.2569356434124743, + "learning_rate": 4.191905184077161e-06, + "loss": 0.2836, + "step": 19484 + }, + { + "epoch": 0.57, + "grad_norm": 1.2827240661119108, + "learning_rate": 4.191441655127791e-06, + "loss": 0.3149, + "step": 19485 + }, + { + "epoch": 0.57, + "grad_norm": 1.4534003415083523, + "learning_rate": 4.1909781333139565e-06, + "loss": 0.2968, + "step": 19486 + }, + { + "epoch": 0.57, + "grad_norm": 1.642935258428419, + "learning_rate": 4.190514618639749e-06, + "loss": 0.304, + "step": 19487 + }, + { + "epoch": 0.57, + "grad_norm": 1.406124457811945, + "learning_rate": 4.190051111109259e-06, + "loss": 0.2884, + "step": 19488 + }, + { + "epoch": 0.57, + "grad_norm": 1.2927046589083964, + "learning_rate": 4.189587610726577e-06, + "loss": 0.2962, + "step": 19489 + }, + { + "epoch": 0.57, + "grad_norm": 1.972034443971625, + "learning_rate": 4.189124117495792e-06, + "loss": 0.3056, + "step": 19490 + }, + { + "epoch": 0.57, + "grad_norm": 1.4501189133082382, + "learning_rate": 4.188660631420997e-06, + "loss": 0.2851, + "step": 19491 + }, + { + "epoch": 0.57, + "grad_norm": 1.4537478064468283, + "learning_rate": 4.188197152506281e-06, + "loss": 0.3119, + "step": 19492 + }, + { + "epoch": 0.57, + "grad_norm": 2.4676264897475444, + "learning_rate": 4.187733680755733e-06, + "loss": 0.3091, + "step": 19493 + }, + { + "epoch": 0.57, + "grad_norm": 1.2699248080963117, + "learning_rate": 4.1872702161734445e-06, + "loss": 0.309, + "step": 19494 + }, + { + "epoch": 0.57, + "grad_norm": 1.3225068234890145, + "learning_rate": 4.186806758763507e-06, + "loss": 0.3164, + "step": 19495 + }, + { + "epoch": 0.57, + "grad_norm": 1.292881245738346, + "learning_rate": 4.186343308530006e-06, + "loss": 0.2838, + "step": 19496 + }, + { + "epoch": 0.57, + "grad_norm": 1.2938461570820745, + "learning_rate": 4.185879865477036e-06, + "loss": 0.3286, + "step": 19497 + }, + { + "epoch": 0.57, + "grad_norm": 1.4094675503097875, + "learning_rate": 4.1854164296086855e-06, + "loss": 0.3106, + "step": 19498 + }, + { + "epoch": 0.57, + "grad_norm": 1.8654467319352805, + "learning_rate": 4.184953000929044e-06, + "loss": 0.3249, + "step": 19499 + }, + { + "epoch": 0.57, + "grad_norm": 1.4108736664969213, + "learning_rate": 4.184489579442201e-06, + "loss": 0.2881, + "step": 19500 + }, + { + "epoch": 0.57, + "grad_norm": 1.3869151240488675, + "learning_rate": 4.184026165152247e-06, + "loss": 0.3067, + "step": 19501 + }, + { + "epoch": 0.57, + "grad_norm": 1.435116423731059, + "learning_rate": 4.18356275806327e-06, + "loss": 0.2982, + "step": 19502 + }, + { + "epoch": 0.57, + "grad_norm": 1.4117913193004612, + "learning_rate": 4.18309935817936e-06, + "loss": 0.2983, + "step": 19503 + }, + { + "epoch": 0.57, + "grad_norm": 1.2588660748319995, + "learning_rate": 4.18263596550461e-06, + "loss": 0.3125, + "step": 19504 + }, + { + "epoch": 0.57, + "grad_norm": 1.3383025116547083, + "learning_rate": 4.182172580043105e-06, + "loss": 0.3014, + "step": 19505 + }, + { + "epoch": 0.57, + "grad_norm": 1.371366468449458, + "learning_rate": 4.181709201798936e-06, + "loss": 0.3157, + "step": 19506 + }, + { + "epoch": 0.57, + "grad_norm": 1.6020187685795082, + "learning_rate": 4.1812458307761925e-06, + "loss": 0.329, + "step": 19507 + }, + { + "epoch": 0.57, + "grad_norm": 1.482465816696904, + "learning_rate": 4.180782466978965e-06, + "loss": 0.3047, + "step": 19508 + }, + { + "epoch": 0.57, + "grad_norm": 1.3387692418302957, + "learning_rate": 4.180319110411339e-06, + "loss": 0.2967, + "step": 19509 + }, + { + "epoch": 0.57, + "grad_norm": 3.1283652147561534, + "learning_rate": 4.179855761077407e-06, + "loss": 0.3053, + "step": 19510 + }, + { + "epoch": 0.57, + "grad_norm": 1.218613464427973, + "learning_rate": 4.179392418981258e-06, + "loss": 0.2933, + "step": 19511 + }, + { + "epoch": 0.57, + "grad_norm": 1.4829722925634572, + "learning_rate": 4.1789290841269795e-06, + "loss": 0.3186, + "step": 19512 + }, + { + "epoch": 0.57, + "grad_norm": 1.3713376179605712, + "learning_rate": 4.178465756518661e-06, + "loss": 0.3205, + "step": 19513 + }, + { + "epoch": 0.57, + "grad_norm": 1.4383755636560482, + "learning_rate": 4.178002436160391e-06, + "loss": 0.3039, + "step": 19514 + }, + { + "epoch": 0.57, + "grad_norm": 1.5495102699238676, + "learning_rate": 4.177539123056259e-06, + "loss": 0.2982, + "step": 19515 + }, + { + "epoch": 0.57, + "grad_norm": 1.5475144335991216, + "learning_rate": 4.177075817210352e-06, + "loss": 0.2903, + "step": 19516 + }, + { + "epoch": 0.57, + "grad_norm": 1.2236782040157643, + "learning_rate": 4.176612518626761e-06, + "loss": 0.2909, + "step": 19517 + }, + { + "epoch": 0.57, + "grad_norm": 1.2989291918088273, + "learning_rate": 4.176149227309574e-06, + "loss": 0.2882, + "step": 19518 + }, + { + "epoch": 0.57, + "grad_norm": 1.3058711326918162, + "learning_rate": 4.175685943262878e-06, + "loss": 0.2951, + "step": 19519 + }, + { + "epoch": 0.57, + "grad_norm": 1.4754778215072621, + "learning_rate": 4.175222666490763e-06, + "loss": 0.291, + "step": 19520 + }, + { + "epoch": 0.57, + "grad_norm": 1.3755614446367632, + "learning_rate": 4.1747593969973185e-06, + "loss": 0.2986, + "step": 19521 + }, + { + "epoch": 0.57, + "grad_norm": 1.568580238048063, + "learning_rate": 4.17429613478663e-06, + "loss": 0.2902, + "step": 19522 + }, + { + "epoch": 0.57, + "grad_norm": 1.3024712292986527, + "learning_rate": 4.173832879862789e-06, + "loss": 0.2708, + "step": 19523 + }, + { + "epoch": 0.57, + "grad_norm": 1.4160699735987468, + "learning_rate": 4.17336963222988e-06, + "loss": 0.2969, + "step": 19524 + }, + { + "epoch": 0.57, + "grad_norm": 1.4423890122126513, + "learning_rate": 4.172906391891994e-06, + "loss": 0.3196, + "step": 19525 + }, + { + "epoch": 0.57, + "grad_norm": 1.4835826479009033, + "learning_rate": 4.172443158853218e-06, + "loss": 0.343, + "step": 19526 + }, + { + "epoch": 0.57, + "grad_norm": 3.5942471559461384, + "learning_rate": 4.171979933117641e-06, + "loss": 0.3125, + "step": 19527 + }, + { + "epoch": 0.57, + "grad_norm": 1.350824615096396, + "learning_rate": 4.17151671468935e-06, + "loss": 0.3016, + "step": 19528 + }, + { + "epoch": 0.57, + "grad_norm": 1.4203278321541317, + "learning_rate": 4.171053503572432e-06, + "loss": 0.3497, + "step": 19529 + }, + { + "epoch": 0.57, + "grad_norm": 1.2708107080287019, + "learning_rate": 4.170590299770977e-06, + "loss": 0.2892, + "step": 19530 + }, + { + "epoch": 0.57, + "grad_norm": 1.3445574547003407, + "learning_rate": 4.170127103289073e-06, + "loss": 0.2841, + "step": 19531 + }, + { + "epoch": 0.57, + "grad_norm": 3.615033114187595, + "learning_rate": 4.169663914130805e-06, + "loss": 0.3015, + "step": 19532 + }, + { + "epoch": 0.57, + "grad_norm": 1.2937225238144578, + "learning_rate": 4.169200732300262e-06, + "loss": 0.3073, + "step": 19533 + }, + { + "epoch": 0.57, + "grad_norm": 1.275384309529545, + "learning_rate": 4.168737557801532e-06, + "loss": 0.3176, + "step": 19534 + }, + { + "epoch": 0.57, + "grad_norm": 1.289734771268545, + "learning_rate": 4.1682743906387035e-06, + "loss": 0.3263, + "step": 19535 + }, + { + "epoch": 0.57, + "grad_norm": 1.9127797884880842, + "learning_rate": 4.167811230815862e-06, + "loss": 0.3099, + "step": 19536 + }, + { + "epoch": 0.57, + "grad_norm": 1.2830589983033582, + "learning_rate": 4.1673480783370946e-06, + "loss": 0.3134, + "step": 19537 + }, + { + "epoch": 0.57, + "grad_norm": 1.475825170341685, + "learning_rate": 4.16688493320649e-06, + "loss": 0.3108, + "step": 19538 + }, + { + "epoch": 0.57, + "grad_norm": 1.2859137667006537, + "learning_rate": 4.166421795428135e-06, + "loss": 0.3091, + "step": 19539 + }, + { + "epoch": 0.57, + "grad_norm": 1.2327750159647959, + "learning_rate": 4.165958665006119e-06, + "loss": 0.2889, + "step": 19540 + }, + { + "epoch": 0.57, + "grad_norm": 1.3550249037754811, + "learning_rate": 4.165495541944527e-06, + "loss": 0.2835, + "step": 19541 + }, + { + "epoch": 0.57, + "grad_norm": 1.6596897146466294, + "learning_rate": 4.165032426247444e-06, + "loss": 0.2854, + "step": 19542 + }, + { + "epoch": 0.57, + "grad_norm": 1.4167971496095333, + "learning_rate": 4.164569317918959e-06, + "loss": 0.3041, + "step": 19543 + }, + { + "epoch": 0.57, + "grad_norm": 0.9360289998737645, + "learning_rate": 4.1641062169631605e-06, + "loss": 0.5624, + "step": 19544 + }, + { + "epoch": 0.57, + "grad_norm": 1.2463568824952407, + "learning_rate": 4.163643123384132e-06, + "loss": 0.294, + "step": 19545 + }, + { + "epoch": 0.57, + "grad_norm": 1.4180398690187224, + "learning_rate": 4.163180037185964e-06, + "loss": 0.3165, + "step": 19546 + }, + { + "epoch": 0.57, + "grad_norm": 1.3544593628163473, + "learning_rate": 4.162716958372741e-06, + "loss": 0.3023, + "step": 19547 + }, + { + "epoch": 0.57, + "grad_norm": 1.2314453963131278, + "learning_rate": 4.16225388694855e-06, + "loss": 0.2853, + "step": 19548 + }, + { + "epoch": 0.57, + "grad_norm": 1.3965409505040443, + "learning_rate": 4.161790822917478e-06, + "loss": 0.2823, + "step": 19549 + }, + { + "epoch": 0.57, + "grad_norm": 0.9396654251299732, + "learning_rate": 4.161327766283611e-06, + "loss": 0.5598, + "step": 19550 + }, + { + "epoch": 0.57, + "grad_norm": 1.4167662696308412, + "learning_rate": 4.1608647170510355e-06, + "loss": 0.275, + "step": 19551 + }, + { + "epoch": 0.57, + "grad_norm": 1.3212637602321895, + "learning_rate": 4.160401675223838e-06, + "loss": 0.2876, + "step": 19552 + }, + { + "epoch": 0.57, + "grad_norm": 0.9106302195230005, + "learning_rate": 4.1599386408061045e-06, + "loss": 0.587, + "step": 19553 + }, + { + "epoch": 0.57, + "grad_norm": 1.7341926289090364, + "learning_rate": 4.159475613801924e-06, + "loss": 0.2991, + "step": 19554 + }, + { + "epoch": 0.57, + "grad_norm": 1.3904712679767077, + "learning_rate": 4.159012594215379e-06, + "loss": 0.3043, + "step": 19555 + }, + { + "epoch": 0.57, + "grad_norm": 1.1631403770586828, + "learning_rate": 4.1585495820505574e-06, + "loss": 0.2989, + "step": 19556 + }, + { + "epoch": 0.57, + "grad_norm": 1.28386637457725, + "learning_rate": 4.158086577311545e-06, + "loss": 0.3279, + "step": 19557 + }, + { + "epoch": 0.57, + "grad_norm": 1.5080736725010213, + "learning_rate": 4.157623580002428e-06, + "loss": 0.2895, + "step": 19558 + }, + { + "epoch": 0.57, + "grad_norm": 1.1813675279486857, + "learning_rate": 4.157160590127292e-06, + "loss": 0.2794, + "step": 19559 + }, + { + "epoch": 0.57, + "grad_norm": 1.3535618558220315, + "learning_rate": 4.156697607690224e-06, + "loss": 0.305, + "step": 19560 + }, + { + "epoch": 0.57, + "grad_norm": 1.2580697766530922, + "learning_rate": 4.156234632695308e-06, + "loss": 0.2867, + "step": 19561 + }, + { + "epoch": 0.57, + "grad_norm": 1.5833446087433194, + "learning_rate": 4.155771665146631e-06, + "loss": 0.2919, + "step": 19562 + }, + { + "epoch": 0.57, + "grad_norm": 1.2904479076729483, + "learning_rate": 4.15530870504828e-06, + "loss": 0.3265, + "step": 19563 + }, + { + "epoch": 0.57, + "grad_norm": 1.7965264489599897, + "learning_rate": 4.154845752404337e-06, + "loss": 0.2925, + "step": 19564 + }, + { + "epoch": 0.57, + "grad_norm": 1.6856540822540131, + "learning_rate": 4.15438280721889e-06, + "loss": 0.3171, + "step": 19565 + }, + { + "epoch": 0.57, + "grad_norm": 1.46331054089969, + "learning_rate": 4.1539198694960235e-06, + "loss": 0.301, + "step": 19566 + }, + { + "epoch": 0.57, + "grad_norm": 1.3084433245278724, + "learning_rate": 4.1534569392398246e-06, + "loss": 0.2838, + "step": 19567 + }, + { + "epoch": 0.57, + "grad_norm": 1.2460042433512901, + "learning_rate": 4.1529940164543775e-06, + "loss": 0.2986, + "step": 19568 + }, + { + "epoch": 0.57, + "grad_norm": 1.323915428478468, + "learning_rate": 4.15253110114377e-06, + "loss": 0.3135, + "step": 19569 + }, + { + "epoch": 0.57, + "grad_norm": 1.1758046123579773, + "learning_rate": 4.152068193312081e-06, + "loss": 0.2931, + "step": 19570 + }, + { + "epoch": 0.57, + "grad_norm": 1.2725001100864446, + "learning_rate": 4.151605292963401e-06, + "loss": 0.316, + "step": 19571 + }, + { + "epoch": 0.57, + "grad_norm": 1.1558549874226463, + "learning_rate": 4.151142400101814e-06, + "loss": 0.2713, + "step": 19572 + }, + { + "epoch": 0.57, + "grad_norm": 1.2149288177476691, + "learning_rate": 4.150679514731405e-06, + "loss": 0.2801, + "step": 19573 + }, + { + "epoch": 0.57, + "grad_norm": 1.2887661039998775, + "learning_rate": 4.150216636856258e-06, + "loss": 0.2949, + "step": 19574 + }, + { + "epoch": 0.57, + "grad_norm": 1.2495999118781793, + "learning_rate": 4.149753766480458e-06, + "loss": 0.2852, + "step": 19575 + }, + { + "epoch": 0.57, + "grad_norm": 1.276785030232985, + "learning_rate": 4.149290903608092e-06, + "loss": 0.3045, + "step": 19576 + }, + { + "epoch": 0.57, + "grad_norm": 8.006136426180776, + "learning_rate": 4.1488280482432425e-06, + "loss": 0.2859, + "step": 19577 + }, + { + "epoch": 0.57, + "grad_norm": 1.3292664742101894, + "learning_rate": 4.148365200389994e-06, + "loss": 0.2797, + "step": 19578 + }, + { + "epoch": 0.57, + "grad_norm": 2.0146985048742185, + "learning_rate": 4.147902360052433e-06, + "loss": 0.2899, + "step": 19579 + }, + { + "epoch": 0.57, + "grad_norm": 1.2589790344360077, + "learning_rate": 4.147439527234643e-06, + "loss": 0.3074, + "step": 19580 + }, + { + "epoch": 0.57, + "grad_norm": 1.336072146340624, + "learning_rate": 4.146976701940708e-06, + "loss": 0.3259, + "step": 19581 + }, + { + "epoch": 0.57, + "grad_norm": 1.341272748231608, + "learning_rate": 4.146513884174714e-06, + "loss": 0.2796, + "step": 19582 + }, + { + "epoch": 0.57, + "grad_norm": 1.816877477262294, + "learning_rate": 4.146051073940744e-06, + "loss": 0.2963, + "step": 19583 + }, + { + "epoch": 0.57, + "grad_norm": 1.2333261610500388, + "learning_rate": 4.145588271242883e-06, + "loss": 0.2776, + "step": 19584 + }, + { + "epoch": 0.57, + "grad_norm": 1.422168073041657, + "learning_rate": 4.145125476085214e-06, + "loss": 0.2885, + "step": 19585 + }, + { + "epoch": 0.57, + "grad_norm": 1.3412724491454702, + "learning_rate": 4.144662688471824e-06, + "loss": 0.312, + "step": 19586 + }, + { + "epoch": 0.57, + "grad_norm": 1.2343233291699056, + "learning_rate": 4.1441999084067945e-06, + "loss": 0.3065, + "step": 19587 + }, + { + "epoch": 0.57, + "grad_norm": 1.2222536132657646, + "learning_rate": 4.14373713589421e-06, + "loss": 0.2911, + "step": 19588 + }, + { + "epoch": 0.57, + "grad_norm": 1.2786686024053804, + "learning_rate": 4.143274370938154e-06, + "loss": 0.3028, + "step": 19589 + }, + { + "epoch": 0.57, + "grad_norm": 1.2929225377498217, + "learning_rate": 4.1428116135427135e-06, + "loss": 0.2972, + "step": 19590 + }, + { + "epoch": 0.57, + "grad_norm": 1.3214476074137524, + "learning_rate": 4.142348863711968e-06, + "loss": 0.3066, + "step": 19591 + }, + { + "epoch": 0.57, + "grad_norm": 1.1937245295210972, + "learning_rate": 4.141886121450003e-06, + "loss": 0.295, + "step": 19592 + }, + { + "epoch": 0.57, + "grad_norm": 1.2789900722322098, + "learning_rate": 4.141423386760903e-06, + "loss": 0.3141, + "step": 19593 + }, + { + "epoch": 0.57, + "grad_norm": 1.2388261198909638, + "learning_rate": 4.140960659648751e-06, + "loss": 0.3157, + "step": 19594 + }, + { + "epoch": 0.57, + "grad_norm": 1.4502555227705374, + "learning_rate": 4.140497940117633e-06, + "loss": 0.2974, + "step": 19595 + }, + { + "epoch": 0.57, + "grad_norm": 1.3977544255101375, + "learning_rate": 4.140035228171628e-06, + "loss": 0.3036, + "step": 19596 + }, + { + "epoch": 0.57, + "grad_norm": 1.2738720070607616, + "learning_rate": 4.139572523814823e-06, + "loss": 0.2959, + "step": 19597 + }, + { + "epoch": 0.57, + "grad_norm": 1.4056749284511285, + "learning_rate": 4.1391098270513e-06, + "loss": 0.2854, + "step": 19598 + }, + { + "epoch": 0.57, + "grad_norm": 1.3348042339258859, + "learning_rate": 4.138647137885141e-06, + "loss": 0.3354, + "step": 19599 + }, + { + "epoch": 0.57, + "grad_norm": 1.3534913577085685, + "learning_rate": 4.13818445632043e-06, + "loss": 0.2924, + "step": 19600 + }, + { + "epoch": 0.57, + "grad_norm": 1.1778801935400525, + "learning_rate": 4.137721782361253e-06, + "loss": 0.2979, + "step": 19601 + }, + { + "epoch": 0.57, + "grad_norm": 1.7433368468796169, + "learning_rate": 4.13725911601169e-06, + "loss": 0.291, + "step": 19602 + }, + { + "epoch": 0.57, + "grad_norm": 1.4311157039887559, + "learning_rate": 4.136796457275824e-06, + "loss": 0.3108, + "step": 19603 + }, + { + "epoch": 0.57, + "grad_norm": 1.2636716543760047, + "learning_rate": 4.136333806157738e-06, + "loss": 0.3195, + "step": 19604 + }, + { + "epoch": 0.57, + "grad_norm": 1.3648232092069899, + "learning_rate": 4.135871162661518e-06, + "loss": 0.3232, + "step": 19605 + }, + { + "epoch": 0.57, + "grad_norm": 1.3492735735633978, + "learning_rate": 4.135408526791244e-06, + "loss": 0.2969, + "step": 19606 + }, + { + "epoch": 0.57, + "grad_norm": 1.5628587941959189, + "learning_rate": 4.134945898550999e-06, + "loss": 0.2921, + "step": 19607 + }, + { + "epoch": 0.57, + "grad_norm": 1.2948025656679722, + "learning_rate": 4.134483277944865e-06, + "loss": 0.2987, + "step": 19608 + }, + { + "epoch": 0.57, + "grad_norm": 1.1656728663890523, + "learning_rate": 4.134020664976927e-06, + "loss": 0.2866, + "step": 19609 + }, + { + "epoch": 0.57, + "grad_norm": 1.8102972604474925, + "learning_rate": 4.133558059651266e-06, + "loss": 0.2754, + "step": 19610 + }, + { + "epoch": 0.57, + "grad_norm": 1.3197935797686038, + "learning_rate": 4.133095461971963e-06, + "loss": 0.3114, + "step": 19611 + }, + { + "epoch": 0.57, + "grad_norm": 1.2501870831907687, + "learning_rate": 4.132632871943104e-06, + "loss": 0.2949, + "step": 19612 + }, + { + "epoch": 0.57, + "grad_norm": 1.3129131404147723, + "learning_rate": 4.13217028956877e-06, + "loss": 0.2974, + "step": 19613 + }, + { + "epoch": 0.57, + "grad_norm": 1.3058122791667677, + "learning_rate": 4.131707714853042e-06, + "loss": 0.3115, + "step": 19614 + }, + { + "epoch": 0.57, + "grad_norm": 1.3425721180667853, + "learning_rate": 4.131245147800003e-06, + "loss": 0.2855, + "step": 19615 + }, + { + "epoch": 0.57, + "grad_norm": 1.5954851927075895, + "learning_rate": 4.130782588413735e-06, + "loss": 0.3214, + "step": 19616 + }, + { + "epoch": 0.57, + "grad_norm": 1.3100196151840706, + "learning_rate": 4.1303200366983206e-06, + "loss": 0.2943, + "step": 19617 + }, + { + "epoch": 0.57, + "grad_norm": 2.0916996011681834, + "learning_rate": 4.129857492657843e-06, + "loss": 0.3369, + "step": 19618 + }, + { + "epoch": 0.57, + "grad_norm": 1.447222653475309, + "learning_rate": 4.12939495629638e-06, + "loss": 0.317, + "step": 19619 + }, + { + "epoch": 0.57, + "grad_norm": 1.2434057188792418, + "learning_rate": 4.128932427618018e-06, + "loss": 0.2998, + "step": 19620 + }, + { + "epoch": 0.57, + "grad_norm": 1.3581125884187075, + "learning_rate": 4.128469906626836e-06, + "loss": 0.2952, + "step": 19621 + }, + { + "epoch": 0.57, + "grad_norm": 1.3888116149130059, + "learning_rate": 4.128007393326918e-06, + "loss": 0.2941, + "step": 19622 + }, + { + "epoch": 0.57, + "grad_norm": 1.2772745561812793, + "learning_rate": 4.1275448877223435e-06, + "loss": 0.2902, + "step": 19623 + }, + { + "epoch": 0.57, + "grad_norm": 1.2672982550322653, + "learning_rate": 4.127082389817194e-06, + "loss": 0.2995, + "step": 19624 + }, + { + "epoch": 0.57, + "grad_norm": 0.9129217049077386, + "learning_rate": 4.126619899615556e-06, + "loss": 0.5866, + "step": 19625 + }, + { + "epoch": 0.57, + "grad_norm": 1.306772068222025, + "learning_rate": 4.126157417121504e-06, + "loss": 0.3214, + "step": 19626 + }, + { + "epoch": 0.57, + "grad_norm": 2.154205271792237, + "learning_rate": 4.125694942339122e-06, + "loss": 0.325, + "step": 19627 + }, + { + "epoch": 0.57, + "grad_norm": 1.4160704124751746, + "learning_rate": 4.125232475272494e-06, + "loss": 0.295, + "step": 19628 + }, + { + "epoch": 0.57, + "grad_norm": 1.2875055263913475, + "learning_rate": 4.124770015925697e-06, + "loss": 0.3009, + "step": 19629 + }, + { + "epoch": 0.57, + "grad_norm": 1.4599003770810655, + "learning_rate": 4.124307564302815e-06, + "loss": 0.3125, + "step": 19630 + }, + { + "epoch": 0.57, + "grad_norm": 1.2516194161138645, + "learning_rate": 4.123845120407928e-06, + "loss": 0.2941, + "step": 19631 + }, + { + "epoch": 0.57, + "grad_norm": 1.3683316801962628, + "learning_rate": 4.123382684245118e-06, + "loss": 0.2936, + "step": 19632 + }, + { + "epoch": 0.57, + "grad_norm": 1.4040335258913543, + "learning_rate": 4.122920255818466e-06, + "loss": 0.3167, + "step": 19633 + }, + { + "epoch": 0.57, + "grad_norm": 1.3516194706197402, + "learning_rate": 4.122457835132052e-06, + "loss": 0.2945, + "step": 19634 + }, + { + "epoch": 0.57, + "grad_norm": 1.2440255676590357, + "learning_rate": 4.1219954221899575e-06, + "loss": 0.3088, + "step": 19635 + }, + { + "epoch": 0.57, + "grad_norm": 1.3280600961683848, + "learning_rate": 4.121533016996262e-06, + "loss": 0.2919, + "step": 19636 + }, + { + "epoch": 0.57, + "grad_norm": 4.435945034185594, + "learning_rate": 4.12107061955505e-06, + "loss": 0.2866, + "step": 19637 + }, + { + "epoch": 0.57, + "grad_norm": 1.2741216736524177, + "learning_rate": 4.120608229870398e-06, + "loss": 0.2901, + "step": 19638 + }, + { + "epoch": 0.57, + "grad_norm": 1.3493676454406711, + "learning_rate": 4.120145847946387e-06, + "loss": 0.2875, + "step": 19639 + }, + { + "epoch": 0.57, + "grad_norm": 1.3549280531274082, + "learning_rate": 4.1196834737871e-06, + "loss": 0.3009, + "step": 19640 + }, + { + "epoch": 0.57, + "grad_norm": 1.3382858261556818, + "learning_rate": 4.119221107396616e-06, + "loss": 0.282, + "step": 19641 + }, + { + "epoch": 0.57, + "grad_norm": 1.4428466660883916, + "learning_rate": 4.118758748779015e-06, + "loss": 0.3162, + "step": 19642 + }, + { + "epoch": 0.57, + "grad_norm": 1.5196130610600815, + "learning_rate": 4.118296397938378e-06, + "loss": 0.3043, + "step": 19643 + }, + { + "epoch": 0.57, + "grad_norm": 1.3741327344758738, + "learning_rate": 4.117834054878785e-06, + "loss": 0.3101, + "step": 19644 + }, + { + "epoch": 0.57, + "grad_norm": 2.1044208749914417, + "learning_rate": 4.1173717196043175e-06, + "loss": 0.2916, + "step": 19645 + }, + { + "epoch": 0.57, + "grad_norm": 1.3792394104289538, + "learning_rate": 4.116909392119053e-06, + "loss": 0.324, + "step": 19646 + }, + { + "epoch": 0.57, + "grad_norm": 2.294467425505689, + "learning_rate": 4.116447072427073e-06, + "loss": 0.3089, + "step": 19647 + }, + { + "epoch": 0.57, + "grad_norm": 1.567791877195792, + "learning_rate": 4.1159847605324575e-06, + "loss": 0.2955, + "step": 19648 + }, + { + "epoch": 0.57, + "grad_norm": 1.2634665729654924, + "learning_rate": 4.115522456439287e-06, + "loss": 0.3091, + "step": 19649 + }, + { + "epoch": 0.57, + "grad_norm": 1.3227897415263539, + "learning_rate": 4.115060160151639e-06, + "loss": 0.2847, + "step": 19650 + }, + { + "epoch": 0.57, + "grad_norm": 1.2756350363189195, + "learning_rate": 4.114597871673596e-06, + "loss": 0.2933, + "step": 19651 + }, + { + "epoch": 0.57, + "grad_norm": 1.2919022097686088, + "learning_rate": 4.114135591009237e-06, + "loss": 0.3021, + "step": 19652 + }, + { + "epoch": 0.57, + "grad_norm": 1.2402948637840827, + "learning_rate": 4.113673318162642e-06, + "loss": 0.2901, + "step": 19653 + }, + { + "epoch": 0.57, + "grad_norm": 1.3198471549292736, + "learning_rate": 4.1132110531378885e-06, + "loss": 0.2959, + "step": 19654 + }, + { + "epoch": 0.57, + "grad_norm": 1.411656965750204, + "learning_rate": 4.112748795939057e-06, + "loss": 0.2946, + "step": 19655 + }, + { + "epoch": 0.57, + "grad_norm": 1.260597840523331, + "learning_rate": 4.112286546570229e-06, + "loss": 0.3334, + "step": 19656 + }, + { + "epoch": 0.57, + "grad_norm": 1.3036437423021923, + "learning_rate": 4.11182430503548e-06, + "loss": 0.3081, + "step": 19657 + }, + { + "epoch": 0.57, + "grad_norm": 1.1776472412520713, + "learning_rate": 4.111362071338892e-06, + "loss": 0.2968, + "step": 19658 + }, + { + "epoch": 0.57, + "grad_norm": 1.9496043822887903, + "learning_rate": 4.1108998454845436e-06, + "loss": 0.2892, + "step": 19659 + }, + { + "epoch": 0.57, + "grad_norm": 1.3368756331324574, + "learning_rate": 4.110437627476514e-06, + "loss": 0.2908, + "step": 19660 + }, + { + "epoch": 0.57, + "grad_norm": 1.234164015733999, + "learning_rate": 4.109975417318882e-06, + "loss": 0.2828, + "step": 19661 + }, + { + "epoch": 0.57, + "grad_norm": 1.2995358411688458, + "learning_rate": 4.109513215015727e-06, + "loss": 0.2969, + "step": 19662 + }, + { + "epoch": 0.57, + "grad_norm": 1.2379225575609023, + "learning_rate": 4.109051020571127e-06, + "loss": 0.2861, + "step": 19663 + }, + { + "epoch": 0.57, + "grad_norm": 1.3319026701939465, + "learning_rate": 4.108588833989163e-06, + "loss": 0.3124, + "step": 19664 + }, + { + "epoch": 0.57, + "grad_norm": 1.6615733321683666, + "learning_rate": 4.108126655273912e-06, + "loss": 0.3053, + "step": 19665 + }, + { + "epoch": 0.57, + "grad_norm": 2.105580573693356, + "learning_rate": 4.1076644844294525e-06, + "loss": 0.2968, + "step": 19666 + }, + { + "epoch": 0.57, + "grad_norm": 1.2905527070159857, + "learning_rate": 4.107202321459864e-06, + "loss": 0.3076, + "step": 19667 + }, + { + "epoch": 0.57, + "grad_norm": 1.2074856238785052, + "learning_rate": 4.106740166369226e-06, + "loss": 0.2791, + "step": 19668 + }, + { + "epoch": 0.57, + "grad_norm": 1.5525169517605864, + "learning_rate": 4.1062780191616155e-06, + "loss": 0.3127, + "step": 19669 + }, + { + "epoch": 0.57, + "grad_norm": 1.3313914528059805, + "learning_rate": 4.10581587984111e-06, + "loss": 0.3036, + "step": 19670 + }, + { + "epoch": 0.57, + "grad_norm": 1.4848099055455406, + "learning_rate": 4.10535374841179e-06, + "loss": 0.2942, + "step": 19671 + }, + { + "epoch": 0.57, + "grad_norm": 1.3650921212856046, + "learning_rate": 4.104891624877734e-06, + "loss": 0.2813, + "step": 19672 + }, + { + "epoch": 0.57, + "grad_norm": 1.1650688430141771, + "learning_rate": 4.104429509243019e-06, + "loss": 0.2883, + "step": 19673 + }, + { + "epoch": 0.57, + "grad_norm": 1.1830797317041863, + "learning_rate": 4.103967401511723e-06, + "loss": 0.2987, + "step": 19674 + }, + { + "epoch": 0.57, + "grad_norm": 1.2625414831064357, + "learning_rate": 4.103505301687925e-06, + "loss": 0.2898, + "step": 19675 + }, + { + "epoch": 0.57, + "grad_norm": 1.5319566597119831, + "learning_rate": 4.1030432097757025e-06, + "loss": 0.3113, + "step": 19676 + }, + { + "epoch": 0.57, + "grad_norm": 1.4397459060533282, + "learning_rate": 4.1025811257791335e-06, + "loss": 0.3126, + "step": 19677 + }, + { + "epoch": 0.57, + "grad_norm": 1.2866953865439927, + "learning_rate": 4.102119049702296e-06, + "loss": 0.3044, + "step": 19678 + }, + { + "epoch": 0.57, + "grad_norm": 1.3281019358793988, + "learning_rate": 4.101656981549268e-06, + "loss": 0.3255, + "step": 19679 + }, + { + "epoch": 0.57, + "grad_norm": 2.2156230282861658, + "learning_rate": 4.101194921324127e-06, + "loss": 0.3058, + "step": 19680 + }, + { + "epoch": 0.57, + "grad_norm": 1.4500271083823857, + "learning_rate": 4.1007328690309515e-06, + "loss": 0.3022, + "step": 19681 + }, + { + "epoch": 0.57, + "grad_norm": 1.9582552521397198, + "learning_rate": 4.1002708246738185e-06, + "loss": 0.2948, + "step": 19682 + }, + { + "epoch": 0.57, + "grad_norm": 1.4850349225340542, + "learning_rate": 4.099808788256806e-06, + "loss": 0.3026, + "step": 19683 + }, + { + "epoch": 0.57, + "grad_norm": 2.4733654864381562, + "learning_rate": 4.099346759783989e-06, + "loss": 0.3009, + "step": 19684 + }, + { + "epoch": 0.57, + "grad_norm": 1.3190876981587882, + "learning_rate": 4.0988847392594475e-06, + "loss": 0.301, + "step": 19685 + }, + { + "epoch": 0.57, + "grad_norm": 1.4783615414788491, + "learning_rate": 4.098422726687258e-06, + "loss": 0.2963, + "step": 19686 + }, + { + "epoch": 0.57, + "grad_norm": 1.2564421183846861, + "learning_rate": 4.097960722071499e-06, + "loss": 0.2981, + "step": 19687 + }, + { + "epoch": 0.57, + "grad_norm": 1.5083793823046026, + "learning_rate": 4.097498725416246e-06, + "loss": 0.3331, + "step": 19688 + }, + { + "epoch": 0.57, + "grad_norm": 1.3900927238835976, + "learning_rate": 4.097036736725576e-06, + "loss": 0.2968, + "step": 19689 + }, + { + "epoch": 0.57, + "grad_norm": 1.4543218453720497, + "learning_rate": 4.096574756003567e-06, + "loss": 0.2946, + "step": 19690 + }, + { + "epoch": 0.57, + "grad_norm": 1.3643138715848229, + "learning_rate": 4.096112783254295e-06, + "loss": 0.2993, + "step": 19691 + }, + { + "epoch": 0.57, + "grad_norm": 1.2794318458498122, + "learning_rate": 4.09565081848184e-06, + "loss": 0.3026, + "step": 19692 + }, + { + "epoch": 0.57, + "grad_norm": 1.2594572717985393, + "learning_rate": 4.0951888616902754e-06, + "loss": 0.3252, + "step": 19693 + }, + { + "epoch": 0.57, + "grad_norm": 1.2192092060061754, + "learning_rate": 4.094726912883679e-06, + "loss": 0.2821, + "step": 19694 + }, + { + "epoch": 0.57, + "grad_norm": 1.4051242569131326, + "learning_rate": 4.094264972066127e-06, + "loss": 0.3116, + "step": 19695 + }, + { + "epoch": 0.57, + "grad_norm": 1.3943814918100441, + "learning_rate": 4.0938030392416985e-06, + "loss": 0.3281, + "step": 19696 + }, + { + "epoch": 0.57, + "grad_norm": 1.2315226967569115, + "learning_rate": 4.093341114414468e-06, + "loss": 0.2852, + "step": 19697 + }, + { + "epoch": 0.57, + "grad_norm": 1.4845904504908103, + "learning_rate": 4.092879197588511e-06, + "loss": 0.3069, + "step": 19698 + }, + { + "epoch": 0.57, + "grad_norm": 0.9535829597675979, + "learning_rate": 4.092417288767905e-06, + "loss": 0.641, + "step": 19699 + }, + { + "epoch": 0.57, + "grad_norm": 1.5558168097403908, + "learning_rate": 4.091955387956727e-06, + "loss": 0.3148, + "step": 19700 + }, + { + "epoch": 0.57, + "grad_norm": 1.280355449594046, + "learning_rate": 4.091493495159054e-06, + "loss": 0.3007, + "step": 19701 + }, + { + "epoch": 0.57, + "grad_norm": 1.3444054119769138, + "learning_rate": 4.091031610378959e-06, + "loss": 0.3029, + "step": 19702 + }, + { + "epoch": 0.57, + "grad_norm": 2.027966884098591, + "learning_rate": 4.090569733620521e-06, + "loss": 0.3072, + "step": 19703 + }, + { + "epoch": 0.57, + "grad_norm": 1.3730872087406167, + "learning_rate": 4.090107864887816e-06, + "loss": 0.2856, + "step": 19704 + }, + { + "epoch": 0.57, + "grad_norm": 1.474209404863073, + "learning_rate": 4.089646004184918e-06, + "loss": 0.305, + "step": 19705 + }, + { + "epoch": 0.57, + "grad_norm": 1.2746674672885057, + "learning_rate": 4.089184151515904e-06, + "loss": 0.2772, + "step": 19706 + }, + { + "epoch": 0.57, + "grad_norm": 1.6840112693411828, + "learning_rate": 4.08872230688485e-06, + "loss": 0.2957, + "step": 19707 + }, + { + "epoch": 0.57, + "grad_norm": 1.4111989582323268, + "learning_rate": 4.088260470295832e-06, + "loss": 0.3051, + "step": 19708 + }, + { + "epoch": 0.57, + "grad_norm": 1.2029457397360566, + "learning_rate": 4.0877986417529264e-06, + "loss": 0.3159, + "step": 19709 + }, + { + "epoch": 0.57, + "grad_norm": 1.2496555015406219, + "learning_rate": 4.087336821260209e-06, + "loss": 0.3006, + "step": 19710 + }, + { + "epoch": 0.57, + "grad_norm": 1.159198043469234, + "learning_rate": 4.086875008821752e-06, + "loss": 0.2692, + "step": 19711 + }, + { + "epoch": 0.57, + "grad_norm": 1.3413476434946232, + "learning_rate": 4.086413204441633e-06, + "loss": 0.3114, + "step": 19712 + }, + { + "epoch": 0.57, + "grad_norm": 1.320020649273697, + "learning_rate": 4.085951408123928e-06, + "loss": 0.2861, + "step": 19713 + }, + { + "epoch": 0.57, + "grad_norm": 1.4815937988678936, + "learning_rate": 4.085489619872712e-06, + "loss": 0.3166, + "step": 19714 + }, + { + "epoch": 0.57, + "grad_norm": 1.3821055198923804, + "learning_rate": 4.085027839692061e-06, + "loss": 0.3085, + "step": 19715 + }, + { + "epoch": 0.57, + "grad_norm": 1.2892065098316239, + "learning_rate": 4.084566067586049e-06, + "loss": 0.2553, + "step": 19716 + }, + { + "epoch": 0.57, + "grad_norm": 1.3562597057231969, + "learning_rate": 4.084104303558752e-06, + "loss": 0.3041, + "step": 19717 + }, + { + "epoch": 0.57, + "grad_norm": 1.2885487841785195, + "learning_rate": 4.0836425476142425e-06, + "loss": 0.2917, + "step": 19718 + }, + { + "epoch": 0.57, + "grad_norm": 1.3763292546424575, + "learning_rate": 4.083180799756601e-06, + "loss": 0.2912, + "step": 19719 + }, + { + "epoch": 0.57, + "grad_norm": 1.4060215623873091, + "learning_rate": 4.082719059989897e-06, + "loss": 0.3058, + "step": 19720 + }, + { + "epoch": 0.57, + "grad_norm": 2.149858195972106, + "learning_rate": 4.0822573283182075e-06, + "loss": 0.3277, + "step": 19721 + }, + { + "epoch": 0.57, + "grad_norm": 1.279366747843464, + "learning_rate": 4.081795604745608e-06, + "loss": 0.2895, + "step": 19722 + }, + { + "epoch": 0.57, + "grad_norm": 1.2532401174547656, + "learning_rate": 4.081333889276172e-06, + "loss": 0.3036, + "step": 19723 + }, + { + "epoch": 0.57, + "grad_norm": 1.3018844352284966, + "learning_rate": 4.080872181913975e-06, + "loss": 0.2889, + "step": 19724 + }, + { + "epoch": 0.57, + "grad_norm": 1.2680356320541541, + "learning_rate": 4.080410482663091e-06, + "loss": 0.2989, + "step": 19725 + }, + { + "epoch": 0.57, + "grad_norm": 1.6807562095291726, + "learning_rate": 4.079948791527594e-06, + "loss": 0.3171, + "step": 19726 + }, + { + "epoch": 0.57, + "grad_norm": 1.662133746128373, + "learning_rate": 4.0794871085115595e-06, + "loss": 0.2991, + "step": 19727 + }, + { + "epoch": 0.57, + "grad_norm": 1.2089265170167522, + "learning_rate": 4.079025433619063e-06, + "loss": 0.2839, + "step": 19728 + }, + { + "epoch": 0.57, + "grad_norm": 1.2967257812243231, + "learning_rate": 4.078563766854175e-06, + "loss": 0.2935, + "step": 19729 + }, + { + "epoch": 0.57, + "grad_norm": 1.3478027063668752, + "learning_rate": 4.0781021082209735e-06, + "loss": 0.301, + "step": 19730 + }, + { + "epoch": 0.57, + "grad_norm": 1.231384579581071, + "learning_rate": 4.07764045772353e-06, + "loss": 0.2843, + "step": 19731 + }, + { + "epoch": 0.57, + "grad_norm": 1.1738532364627403, + "learning_rate": 4.077178815365921e-06, + "loss": 0.2995, + "step": 19732 + }, + { + "epoch": 0.57, + "grad_norm": 1.2263680644628316, + "learning_rate": 4.076717181152218e-06, + "loss": 0.3223, + "step": 19733 + }, + { + "epoch": 0.57, + "grad_norm": 1.296704916738336, + "learning_rate": 4.0762555550864965e-06, + "loss": 0.3114, + "step": 19734 + }, + { + "epoch": 0.57, + "grad_norm": 1.2443783125883674, + "learning_rate": 4.07579393717283e-06, + "loss": 0.3206, + "step": 19735 + }, + { + "epoch": 0.57, + "grad_norm": 1.4310564128100864, + "learning_rate": 4.075332327415293e-06, + "loss": 0.2857, + "step": 19736 + }, + { + "epoch": 0.57, + "grad_norm": 1.868674780362126, + "learning_rate": 4.074870725817958e-06, + "loss": 0.3151, + "step": 19737 + }, + { + "epoch": 0.57, + "grad_norm": 1.507262058194435, + "learning_rate": 4.0744091323849e-06, + "loss": 0.2871, + "step": 19738 + }, + { + "epoch": 0.57, + "grad_norm": 1.2968580845849607, + "learning_rate": 4.0739475471201905e-06, + "loss": 0.3242, + "step": 19739 + }, + { + "epoch": 0.57, + "grad_norm": 1.581277552536141, + "learning_rate": 4.073485970027904e-06, + "loss": 0.3196, + "step": 19740 + }, + { + "epoch": 0.57, + "grad_norm": 1.330467601076372, + "learning_rate": 4.073024401112115e-06, + "loss": 0.2987, + "step": 19741 + }, + { + "epoch": 0.57, + "grad_norm": 1.3637945782469472, + "learning_rate": 4.0725628403768954e-06, + "loss": 0.3022, + "step": 19742 + }, + { + "epoch": 0.57, + "grad_norm": 1.2315833288344376, + "learning_rate": 4.072101287826319e-06, + "loss": 0.2904, + "step": 19743 + }, + { + "epoch": 0.57, + "grad_norm": 1.4643929921254533, + "learning_rate": 4.071639743464459e-06, + "loss": 0.2921, + "step": 19744 + }, + { + "epoch": 0.57, + "grad_norm": 1.4700954406554045, + "learning_rate": 4.071178207295388e-06, + "loss": 0.2947, + "step": 19745 + }, + { + "epoch": 0.57, + "grad_norm": 1.2069035657492286, + "learning_rate": 4.070716679323181e-06, + "loss": 0.2691, + "step": 19746 + }, + { + "epoch": 0.57, + "grad_norm": 1.3700360977140027, + "learning_rate": 4.070255159551908e-06, + "loss": 0.2891, + "step": 19747 + }, + { + "epoch": 0.57, + "grad_norm": 1.8047316619932148, + "learning_rate": 4.069793647985644e-06, + "loss": 0.3008, + "step": 19748 + }, + { + "epoch": 0.57, + "grad_norm": 1.5043792229701964, + "learning_rate": 4.069332144628462e-06, + "loss": 0.3102, + "step": 19749 + }, + { + "epoch": 0.57, + "grad_norm": 1.3433706196870074, + "learning_rate": 4.068870649484433e-06, + "loss": 0.2791, + "step": 19750 + }, + { + "epoch": 0.57, + "grad_norm": 1.147650219211986, + "learning_rate": 4.068409162557633e-06, + "loss": 0.2679, + "step": 19751 + }, + { + "epoch": 0.57, + "grad_norm": 1.8252713203261939, + "learning_rate": 4.06794768385213e-06, + "loss": 0.2934, + "step": 19752 + }, + { + "epoch": 0.57, + "grad_norm": 1.2483132219112663, + "learning_rate": 4.067486213372e-06, + "loss": 0.2975, + "step": 19753 + }, + { + "epoch": 0.57, + "grad_norm": 2.253164764212971, + "learning_rate": 4.067024751121315e-06, + "loss": 0.3037, + "step": 19754 + }, + { + "epoch": 0.57, + "grad_norm": 1.2967730919471152, + "learning_rate": 4.066563297104147e-06, + "loss": 0.3106, + "step": 19755 + }, + { + "epoch": 0.57, + "grad_norm": 1.412283109401631, + "learning_rate": 4.066101851324568e-06, + "loss": 0.3049, + "step": 19756 + }, + { + "epoch": 0.57, + "grad_norm": 1.318285097623458, + "learning_rate": 4.06564041378665e-06, + "loss": 0.3118, + "step": 19757 + }, + { + "epoch": 0.57, + "grad_norm": 1.3250127302419274, + "learning_rate": 4.065178984494465e-06, + "loss": 0.2778, + "step": 19758 + }, + { + "epoch": 0.57, + "grad_norm": 1.1968424837415155, + "learning_rate": 4.064717563452088e-06, + "loss": 0.2867, + "step": 19759 + }, + { + "epoch": 0.57, + "grad_norm": 1.3424237677549116, + "learning_rate": 4.064256150663588e-06, + "loss": 0.2952, + "step": 19760 + }, + { + "epoch": 0.57, + "grad_norm": 12.560773070878392, + "learning_rate": 4.063794746133038e-06, + "loss": 0.2931, + "step": 19761 + }, + { + "epoch": 0.57, + "grad_norm": 1.385566446767884, + "learning_rate": 4.063333349864509e-06, + "loss": 0.2949, + "step": 19762 + }, + { + "epoch": 0.57, + "grad_norm": 1.57770377320988, + "learning_rate": 4.062871961862074e-06, + "loss": 0.3013, + "step": 19763 + }, + { + "epoch": 0.57, + "grad_norm": 1.4134015163044735, + "learning_rate": 4.062410582129804e-06, + "loss": 0.3185, + "step": 19764 + }, + { + "epoch": 0.57, + "grad_norm": 1.5662927357660794, + "learning_rate": 4.061949210671772e-06, + "loss": 0.2972, + "step": 19765 + }, + { + "epoch": 0.57, + "grad_norm": 1.3467743579510574, + "learning_rate": 4.0614878474920495e-06, + "loss": 0.2903, + "step": 19766 + }, + { + "epoch": 0.57, + "grad_norm": 1.273571037861524, + "learning_rate": 4.061026492594705e-06, + "loss": 0.3059, + "step": 19767 + }, + { + "epoch": 0.57, + "grad_norm": 1.3502926053647002, + "learning_rate": 4.060565145983813e-06, + "loss": 0.3058, + "step": 19768 + }, + { + "epoch": 0.57, + "grad_norm": 1.2493052933801518, + "learning_rate": 4.060103807663444e-06, + "loss": 0.2884, + "step": 19769 + }, + { + "epoch": 0.57, + "grad_norm": 2.167619172030652, + "learning_rate": 4.059642477637669e-06, + "loss": 0.3078, + "step": 19770 + }, + { + "epoch": 0.57, + "grad_norm": 2.2795183577113147, + "learning_rate": 4.05918115591056e-06, + "loss": 0.3374, + "step": 19771 + }, + { + "epoch": 0.57, + "grad_norm": 1.6932775487287604, + "learning_rate": 4.058719842486187e-06, + "loss": 0.3042, + "step": 19772 + }, + { + "epoch": 0.57, + "grad_norm": 1.2836523692718427, + "learning_rate": 4.0582585373686216e-06, + "loss": 0.291, + "step": 19773 + }, + { + "epoch": 0.57, + "grad_norm": 1.3312217126941475, + "learning_rate": 4.0577972405619366e-06, + "loss": 0.2891, + "step": 19774 + }, + { + "epoch": 0.57, + "grad_norm": 1.385912414814704, + "learning_rate": 4.0573359520702e-06, + "loss": 0.2956, + "step": 19775 + }, + { + "epoch": 0.57, + "grad_norm": 1.3059777477755148, + "learning_rate": 4.056874671897484e-06, + "loss": 0.2893, + "step": 19776 + }, + { + "epoch": 0.57, + "grad_norm": 1.5376328590529504, + "learning_rate": 4.05641340004786e-06, + "loss": 0.2772, + "step": 19777 + }, + { + "epoch": 0.57, + "grad_norm": 1.3661329148140675, + "learning_rate": 4.055952136525399e-06, + "loss": 0.3165, + "step": 19778 + }, + { + "epoch": 0.57, + "grad_norm": 1.326630812376288, + "learning_rate": 4.0554908813341696e-06, + "loss": 0.3061, + "step": 19779 + }, + { + "epoch": 0.57, + "grad_norm": 1.16297396463777, + "learning_rate": 4.055029634478244e-06, + "loss": 0.2913, + "step": 19780 + }, + { + "epoch": 0.57, + "grad_norm": 1.7847456414557041, + "learning_rate": 4.054568395961692e-06, + "loss": 0.2869, + "step": 19781 + }, + { + "epoch": 0.57, + "grad_norm": 1.4950296550417945, + "learning_rate": 4.054107165788586e-06, + "loss": 0.2956, + "step": 19782 + }, + { + "epoch": 0.57, + "grad_norm": 1.2234242353083864, + "learning_rate": 4.053645943962994e-06, + "loss": 0.2956, + "step": 19783 + }, + { + "epoch": 0.57, + "grad_norm": 1.3240442190330541, + "learning_rate": 4.053184730488986e-06, + "loss": 0.297, + "step": 19784 + }, + { + "epoch": 0.57, + "grad_norm": 1.2275937325973127, + "learning_rate": 4.0527235253706335e-06, + "loss": 0.2955, + "step": 19785 + }, + { + "epoch": 0.57, + "grad_norm": 1.580771497236829, + "learning_rate": 4.052262328612007e-06, + "loss": 0.2958, + "step": 19786 + }, + { + "epoch": 0.57, + "grad_norm": 1.3848447475284595, + "learning_rate": 4.051801140217176e-06, + "loss": 0.2765, + "step": 19787 + }, + { + "epoch": 0.57, + "grad_norm": 1.4352887296978591, + "learning_rate": 4.05133996019021e-06, + "loss": 0.2852, + "step": 19788 + }, + { + "epoch": 0.57, + "grad_norm": 1.258519203199762, + "learning_rate": 4.050878788535179e-06, + "loss": 0.302, + "step": 19789 + }, + { + "epoch": 0.57, + "grad_norm": 1.2517710568628184, + "learning_rate": 4.050417625256155e-06, + "loss": 0.2827, + "step": 19790 + }, + { + "epoch": 0.57, + "grad_norm": 1.1647299361374308, + "learning_rate": 4.049956470357206e-06, + "loss": 0.2918, + "step": 19791 + }, + { + "epoch": 0.57, + "grad_norm": 1.2744223662291752, + "learning_rate": 4.0494953238424e-06, + "loss": 0.3018, + "step": 19792 + }, + { + "epoch": 0.57, + "grad_norm": 1.9232174966198137, + "learning_rate": 4.0490341857158076e-06, + "loss": 0.3493, + "step": 19793 + }, + { + "epoch": 0.57, + "grad_norm": 2.621155043321726, + "learning_rate": 4.048573055981503e-06, + "loss": 0.3065, + "step": 19794 + }, + { + "epoch": 0.57, + "grad_norm": 1.214275998922031, + "learning_rate": 4.048111934643548e-06, + "loss": 0.2976, + "step": 19795 + }, + { + "epoch": 0.57, + "grad_norm": 1.3894124080667554, + "learning_rate": 4.047650821706017e-06, + "loss": 0.3238, + "step": 19796 + }, + { + "epoch": 0.57, + "grad_norm": 1.328853870958548, + "learning_rate": 4.047189717172978e-06, + "loss": 0.2954, + "step": 19797 + }, + { + "epoch": 0.57, + "grad_norm": 1.3445795803003864, + "learning_rate": 4.0467286210485e-06, + "loss": 0.3186, + "step": 19798 + }, + { + "epoch": 0.57, + "grad_norm": 1.214987857325983, + "learning_rate": 4.046267533336653e-06, + "loss": 0.31, + "step": 19799 + }, + { + "epoch": 0.57, + "grad_norm": 2.050312688894693, + "learning_rate": 4.045806454041505e-06, + "loss": 0.2999, + "step": 19800 + }, + { + "epoch": 0.57, + "grad_norm": 1.3060920580755164, + "learning_rate": 4.045345383167126e-06, + "loss": 0.3101, + "step": 19801 + }, + { + "epoch": 0.57, + "grad_norm": 1.2359964916726232, + "learning_rate": 4.044884320717585e-06, + "loss": 0.2854, + "step": 19802 + }, + { + "epoch": 0.57, + "grad_norm": 1.3323715902768538, + "learning_rate": 4.04442326669695e-06, + "loss": 0.3139, + "step": 19803 + }, + { + "epoch": 0.57, + "grad_norm": 1.3892698512618753, + "learning_rate": 4.043962221109289e-06, + "loss": 0.323, + "step": 19804 + }, + { + "epoch": 0.57, + "grad_norm": 1.404325362675656, + "learning_rate": 4.043501183958674e-06, + "loss": 0.288, + "step": 19805 + }, + { + "epoch": 0.57, + "grad_norm": 1.2376451773531407, + "learning_rate": 4.043040155249171e-06, + "loss": 0.3109, + "step": 19806 + }, + { + "epoch": 0.57, + "grad_norm": 1.4382264757959404, + "learning_rate": 4.0425791349848486e-06, + "loss": 0.2972, + "step": 19807 + }, + { + "epoch": 0.57, + "grad_norm": 1.2444252626010375, + "learning_rate": 4.042118123169776e-06, + "loss": 0.2935, + "step": 19808 + }, + { + "epoch": 0.57, + "grad_norm": 1.3932247071644843, + "learning_rate": 4.041657119808023e-06, + "loss": 0.2975, + "step": 19809 + }, + { + "epoch": 0.57, + "grad_norm": 1.6366495841062254, + "learning_rate": 4.041196124903656e-06, + "loss": 0.3285, + "step": 19810 + }, + { + "epoch": 0.57, + "grad_norm": 1.4352588893089488, + "learning_rate": 4.040735138460743e-06, + "loss": 0.313, + "step": 19811 + }, + { + "epoch": 0.57, + "grad_norm": 1.3748720326989539, + "learning_rate": 4.040274160483354e-06, + "loss": 0.3023, + "step": 19812 + }, + { + "epoch": 0.57, + "grad_norm": 1.2170088913768229, + "learning_rate": 4.0398131909755546e-06, + "loss": 0.3003, + "step": 19813 + }, + { + "epoch": 0.57, + "grad_norm": 1.3419047645161868, + "learning_rate": 4.039352229941417e-06, + "loss": 0.2816, + "step": 19814 + }, + { + "epoch": 0.57, + "grad_norm": 1.417105977552861, + "learning_rate": 4.0388912773850044e-06, + "loss": 0.3099, + "step": 19815 + }, + { + "epoch": 0.57, + "grad_norm": 1.3312983724988674, + "learning_rate": 4.0384303333103876e-06, + "loss": 0.286, + "step": 19816 + }, + { + "epoch": 0.57, + "grad_norm": 1.5341534266359027, + "learning_rate": 4.037969397721633e-06, + "loss": 0.3428, + "step": 19817 + }, + { + "epoch": 0.57, + "grad_norm": 1.4047911358648362, + "learning_rate": 4.037508470622811e-06, + "loss": 0.3072, + "step": 19818 + }, + { + "epoch": 0.57, + "grad_norm": 1.4454956536491137, + "learning_rate": 4.037047552017986e-06, + "loss": 0.3098, + "step": 19819 + }, + { + "epoch": 0.57, + "grad_norm": 1.2187361779513335, + "learning_rate": 4.036586641911228e-06, + "loss": 0.292, + "step": 19820 + }, + { + "epoch": 0.57, + "grad_norm": 1.4147568468368525, + "learning_rate": 4.036125740306602e-06, + "loss": 0.2806, + "step": 19821 + }, + { + "epoch": 0.57, + "grad_norm": 1.2479656595736695, + "learning_rate": 4.035664847208181e-06, + "loss": 0.3029, + "step": 19822 + }, + { + "epoch": 0.57, + "grad_norm": 1.3631807623755898, + "learning_rate": 4.035203962620024e-06, + "loss": 0.2863, + "step": 19823 + }, + { + "epoch": 0.57, + "grad_norm": 1.3523294142806876, + "learning_rate": 4.0347430865462044e-06, + "loss": 0.3078, + "step": 19824 + }, + { + "epoch": 0.58, + "grad_norm": 1.2837304831862133, + "learning_rate": 4.034282218990788e-06, + "loss": 0.3113, + "step": 19825 + }, + { + "epoch": 0.58, + "grad_norm": 1.5093236153579956, + "learning_rate": 4.033821359957841e-06, + "loss": 0.2975, + "step": 19826 + }, + { + "epoch": 0.58, + "grad_norm": 1.3233464654382983, + "learning_rate": 4.033360509451431e-06, + "loss": 0.2885, + "step": 19827 + }, + { + "epoch": 0.58, + "grad_norm": 1.2500608835441285, + "learning_rate": 4.032899667475626e-06, + "loss": 0.296, + "step": 19828 + }, + { + "epoch": 0.58, + "grad_norm": 1.3525186857647986, + "learning_rate": 4.0324388340344935e-06, + "loss": 0.3058, + "step": 19829 + }, + { + "epoch": 0.58, + "grad_norm": 1.362446655669356, + "learning_rate": 4.031978009132097e-06, + "loss": 0.3116, + "step": 19830 + }, + { + "epoch": 0.58, + "grad_norm": 1.277576427557903, + "learning_rate": 4.031517192772505e-06, + "loss": 0.3185, + "step": 19831 + }, + { + "epoch": 0.58, + "grad_norm": 1.4477324114946526, + "learning_rate": 4.031056384959785e-06, + "loss": 0.3366, + "step": 19832 + }, + { + "epoch": 0.58, + "grad_norm": 1.3979027655424696, + "learning_rate": 4.030595585698005e-06, + "loss": 0.3166, + "step": 19833 + }, + { + "epoch": 0.58, + "grad_norm": 1.7080192395520293, + "learning_rate": 4.030134794991227e-06, + "loss": 0.2797, + "step": 19834 + }, + { + "epoch": 0.58, + "grad_norm": 1.245425410572394, + "learning_rate": 4.029674012843521e-06, + "loss": 0.2922, + "step": 19835 + }, + { + "epoch": 0.58, + "grad_norm": 1.7689623331617483, + "learning_rate": 4.029213239258953e-06, + "loss": 0.3188, + "step": 19836 + }, + { + "epoch": 0.58, + "grad_norm": 1.2343600539207578, + "learning_rate": 4.0287524742415895e-06, + "loss": 0.3145, + "step": 19837 + }, + { + "epoch": 0.58, + "grad_norm": 1.330923276280299, + "learning_rate": 4.028291717795496e-06, + "loss": 0.2869, + "step": 19838 + }, + { + "epoch": 0.58, + "grad_norm": 1.5055273199160881, + "learning_rate": 4.027830969924738e-06, + "loss": 0.3119, + "step": 19839 + }, + { + "epoch": 0.58, + "grad_norm": 1.4091732793211238, + "learning_rate": 4.027370230633382e-06, + "loss": 0.3157, + "step": 19840 + }, + { + "epoch": 0.58, + "grad_norm": 1.2671523621562075, + "learning_rate": 4.0269094999254955e-06, + "loss": 0.3119, + "step": 19841 + }, + { + "epoch": 0.58, + "grad_norm": 1.3784324327000461, + "learning_rate": 4.026448777805144e-06, + "loss": 0.2777, + "step": 19842 + }, + { + "epoch": 0.58, + "grad_norm": 1.281942439077509, + "learning_rate": 4.025988064276392e-06, + "loss": 0.2859, + "step": 19843 + }, + { + "epoch": 0.58, + "grad_norm": 1.2764935496445124, + "learning_rate": 4.025527359343306e-06, + "loss": 0.2876, + "step": 19844 + }, + { + "epoch": 0.58, + "grad_norm": 1.598964957961361, + "learning_rate": 4.025066663009952e-06, + "loss": 0.3009, + "step": 19845 + }, + { + "epoch": 0.58, + "grad_norm": 1.2058173988530207, + "learning_rate": 4.0246059752803964e-06, + "loss": 0.2916, + "step": 19846 + }, + { + "epoch": 0.58, + "grad_norm": 1.2418899330059459, + "learning_rate": 4.024145296158703e-06, + "loss": 0.2765, + "step": 19847 + }, + { + "epoch": 0.58, + "grad_norm": 1.394366569645048, + "learning_rate": 4.023684625648938e-06, + "loss": 0.2955, + "step": 19848 + }, + { + "epoch": 0.58, + "grad_norm": 1.295493087920715, + "learning_rate": 4.023223963755168e-06, + "loss": 0.2899, + "step": 19849 + }, + { + "epoch": 0.58, + "grad_norm": 1.2125169099515565, + "learning_rate": 4.022763310481458e-06, + "loss": 0.3034, + "step": 19850 + }, + { + "epoch": 0.58, + "grad_norm": 1.2654542769549348, + "learning_rate": 4.022302665831871e-06, + "loss": 0.3405, + "step": 19851 + }, + { + "epoch": 0.58, + "grad_norm": 1.1978872172599393, + "learning_rate": 4.021842029810475e-06, + "loss": 0.285, + "step": 19852 + }, + { + "epoch": 0.58, + "grad_norm": 1.1681340780584306, + "learning_rate": 4.021381402421333e-06, + "loss": 0.2964, + "step": 19853 + }, + { + "epoch": 0.58, + "grad_norm": 1.2505934244315953, + "learning_rate": 4.020920783668511e-06, + "loss": 0.3082, + "step": 19854 + }, + { + "epoch": 0.58, + "grad_norm": 1.5699590534463193, + "learning_rate": 4.020460173556074e-06, + "loss": 0.3101, + "step": 19855 + }, + { + "epoch": 0.58, + "grad_norm": 1.1909034902205071, + "learning_rate": 4.019999572088087e-06, + "loss": 0.2878, + "step": 19856 + }, + { + "epoch": 0.58, + "grad_norm": 1.4056088621059404, + "learning_rate": 4.019538979268615e-06, + "loss": 0.2963, + "step": 19857 + }, + { + "epoch": 0.58, + "grad_norm": 1.2351148282831717, + "learning_rate": 4.019078395101722e-06, + "loss": 0.2775, + "step": 19858 + }, + { + "epoch": 0.58, + "grad_norm": 1.1616576020716878, + "learning_rate": 4.018617819591472e-06, + "loss": 0.2803, + "step": 19859 + }, + { + "epoch": 0.58, + "grad_norm": 1.238000025338495, + "learning_rate": 4.018157252741932e-06, + "loss": 0.2949, + "step": 19860 + }, + { + "epoch": 0.58, + "grad_norm": 1.5759359849176402, + "learning_rate": 4.017696694557165e-06, + "loss": 0.3161, + "step": 19861 + }, + { + "epoch": 0.58, + "grad_norm": 2.2152463610102977, + "learning_rate": 4.017236145041235e-06, + "loss": 0.2901, + "step": 19862 + }, + { + "epoch": 0.58, + "grad_norm": 2.3002085374863737, + "learning_rate": 4.016775604198207e-06, + "loss": 0.3146, + "step": 19863 + }, + { + "epoch": 0.58, + "grad_norm": 1.3821576972749479, + "learning_rate": 4.016315072032145e-06, + "loss": 0.3538, + "step": 19864 + }, + { + "epoch": 0.58, + "grad_norm": 2.089034189949676, + "learning_rate": 4.0158545485471145e-06, + "loss": 0.293, + "step": 19865 + }, + { + "epoch": 0.58, + "grad_norm": 1.2859401183350196, + "learning_rate": 4.015394033747178e-06, + "loss": 0.2996, + "step": 19866 + }, + { + "epoch": 0.58, + "grad_norm": 1.3047461963354388, + "learning_rate": 4.0149335276364e-06, + "loss": 0.3023, + "step": 19867 + }, + { + "epoch": 0.58, + "grad_norm": 1.246877405159786, + "learning_rate": 4.014473030218844e-06, + "loss": 0.2951, + "step": 19868 + }, + { + "epoch": 0.58, + "grad_norm": 1.2525397637444673, + "learning_rate": 4.014012541498575e-06, + "loss": 0.2914, + "step": 19869 + }, + { + "epoch": 0.58, + "grad_norm": 1.3344282339237048, + "learning_rate": 4.013552061479657e-06, + "loss": 0.312, + "step": 19870 + }, + { + "epoch": 0.58, + "grad_norm": 1.3944258869059818, + "learning_rate": 4.013091590166151e-06, + "loss": 0.3035, + "step": 19871 + }, + { + "epoch": 0.58, + "grad_norm": 2.195959592180963, + "learning_rate": 4.012631127562124e-06, + "loss": 0.3055, + "step": 19872 + }, + { + "epoch": 0.58, + "grad_norm": 1.3397488155043602, + "learning_rate": 4.01217067367164e-06, + "loss": 0.2761, + "step": 19873 + }, + { + "epoch": 0.58, + "grad_norm": 1.1958000119162064, + "learning_rate": 4.011710228498758e-06, + "loss": 0.2824, + "step": 19874 + }, + { + "epoch": 0.58, + "grad_norm": 1.29542930576865, + "learning_rate": 4.011249792047545e-06, + "loss": 0.2853, + "step": 19875 + }, + { + "epoch": 0.58, + "grad_norm": 0.89588885990756, + "learning_rate": 4.010789364322063e-06, + "loss": 0.583, + "step": 19876 + }, + { + "epoch": 0.58, + "grad_norm": 1.3141252433434085, + "learning_rate": 4.010328945326376e-06, + "loss": 0.2896, + "step": 19877 + }, + { + "epoch": 0.58, + "grad_norm": 1.6910922599891731, + "learning_rate": 4.009868535064549e-06, + "loss": 0.3139, + "step": 19878 + }, + { + "epoch": 0.58, + "grad_norm": 1.248952919306253, + "learning_rate": 4.009408133540642e-06, + "loss": 0.2983, + "step": 19879 + }, + { + "epoch": 0.58, + "grad_norm": 1.1582516925359971, + "learning_rate": 4.008947740758719e-06, + "loss": 0.2795, + "step": 19880 + }, + { + "epoch": 0.58, + "grad_norm": 1.2898496509725503, + "learning_rate": 4.008487356722844e-06, + "loss": 0.3326, + "step": 19881 + }, + { + "epoch": 0.58, + "grad_norm": 1.2366121346430403, + "learning_rate": 4.008026981437077e-06, + "loss": 0.2775, + "step": 19882 + }, + { + "epoch": 0.58, + "grad_norm": 1.2433064520113426, + "learning_rate": 4.007566614905484e-06, + "loss": 0.2949, + "step": 19883 + }, + { + "epoch": 0.58, + "grad_norm": 1.6629796623388933, + "learning_rate": 4.007106257132127e-06, + "loss": 0.2941, + "step": 19884 + }, + { + "epoch": 0.58, + "grad_norm": 1.2873504489434955, + "learning_rate": 4.006645908121068e-06, + "loss": 0.3, + "step": 19885 + }, + { + "epoch": 0.58, + "grad_norm": 1.1809515265253288, + "learning_rate": 4.00618556787637e-06, + "loss": 0.296, + "step": 19886 + }, + { + "epoch": 0.58, + "grad_norm": 1.236888973926579, + "learning_rate": 4.005725236402094e-06, + "loss": 0.2966, + "step": 19887 + }, + { + "epoch": 0.58, + "grad_norm": 1.228264173173096, + "learning_rate": 4.005264913702306e-06, + "loss": 0.3128, + "step": 19888 + }, + { + "epoch": 0.58, + "grad_norm": 1.2840119308103572, + "learning_rate": 4.004804599781066e-06, + "loss": 0.3097, + "step": 19889 + }, + { + "epoch": 0.58, + "grad_norm": 1.7588844280212745, + "learning_rate": 4.0043442946424346e-06, + "loss": 0.3257, + "step": 19890 + }, + { + "epoch": 0.58, + "grad_norm": 1.2259096987162463, + "learning_rate": 4.003883998290476e-06, + "loss": 0.2952, + "step": 19891 + }, + { + "epoch": 0.58, + "grad_norm": 1.2070166873604775, + "learning_rate": 4.003423710729254e-06, + "loss": 0.2888, + "step": 19892 + }, + { + "epoch": 0.58, + "grad_norm": 1.5932845562463025, + "learning_rate": 4.002963431962828e-06, + "loss": 0.2802, + "step": 19893 + }, + { + "epoch": 0.58, + "grad_norm": 1.3223802500106374, + "learning_rate": 4.0025031619952605e-06, + "loss": 0.3029, + "step": 19894 + }, + { + "epoch": 0.58, + "grad_norm": 3.1907267107931903, + "learning_rate": 4.002042900830613e-06, + "loss": 0.3047, + "step": 19895 + }, + { + "epoch": 0.58, + "grad_norm": 1.3052748322377072, + "learning_rate": 4.001582648472949e-06, + "loss": 0.3021, + "step": 19896 + }, + { + "epoch": 0.58, + "grad_norm": 1.2684616502865478, + "learning_rate": 4.00112240492633e-06, + "loss": 0.295, + "step": 19897 + }, + { + "epoch": 0.58, + "grad_norm": 1.2864544901462556, + "learning_rate": 4.000662170194815e-06, + "loss": 0.2892, + "step": 19898 + }, + { + "epoch": 0.58, + "grad_norm": 1.22275687507729, + "learning_rate": 4.000201944282468e-06, + "loss": 0.3153, + "step": 19899 + }, + { + "epoch": 0.58, + "grad_norm": 1.4153658119042678, + "learning_rate": 3.99974172719335e-06, + "loss": 0.3051, + "step": 19900 + }, + { + "epoch": 0.58, + "grad_norm": 1.2662085270847443, + "learning_rate": 3.999281518931524e-06, + "loss": 0.3161, + "step": 19901 + }, + { + "epoch": 0.58, + "grad_norm": 1.1504769651888496, + "learning_rate": 3.998821319501048e-06, + "loss": 0.2717, + "step": 19902 + }, + { + "epoch": 0.58, + "grad_norm": 1.3286904310452679, + "learning_rate": 3.998361128905984e-06, + "loss": 0.2934, + "step": 19903 + }, + { + "epoch": 0.58, + "grad_norm": 1.700510591522422, + "learning_rate": 3.997900947150395e-06, + "loss": 0.3083, + "step": 19904 + }, + { + "epoch": 0.58, + "grad_norm": 1.2932428921325645, + "learning_rate": 3.997440774238343e-06, + "loss": 0.2951, + "step": 19905 + }, + { + "epoch": 0.58, + "grad_norm": 1.4845773497930062, + "learning_rate": 3.996980610173886e-06, + "loss": 0.2913, + "step": 19906 + }, + { + "epoch": 0.58, + "grad_norm": 1.302580676058512, + "learning_rate": 3.996520454961087e-06, + "loss": 0.2969, + "step": 19907 + }, + { + "epoch": 0.58, + "grad_norm": 1.2718628669646705, + "learning_rate": 3.996060308604006e-06, + "loss": 0.3048, + "step": 19908 + }, + { + "epoch": 0.58, + "grad_norm": 2.4550017670983926, + "learning_rate": 3.995600171106703e-06, + "loss": 0.2983, + "step": 19909 + }, + { + "epoch": 0.58, + "grad_norm": 1.2250364413421992, + "learning_rate": 3.995140042473239e-06, + "loss": 0.2995, + "step": 19910 + }, + { + "epoch": 0.58, + "grad_norm": 1.0192952440975853, + "learning_rate": 3.994679922707677e-06, + "loss": 0.6165, + "step": 19911 + }, + { + "epoch": 0.58, + "grad_norm": 1.2227849922445335, + "learning_rate": 3.994219811814075e-06, + "loss": 0.3015, + "step": 19912 + }, + { + "epoch": 0.58, + "grad_norm": 1.2656999215990035, + "learning_rate": 3.993759709796494e-06, + "loss": 0.3139, + "step": 19913 + }, + { + "epoch": 0.58, + "grad_norm": 6.866830633668058, + "learning_rate": 3.993299616658994e-06, + "loss": 0.3196, + "step": 19914 + }, + { + "epoch": 0.58, + "grad_norm": 1.5488108320948677, + "learning_rate": 3.992839532405638e-06, + "loss": 0.2836, + "step": 19915 + }, + { + "epoch": 0.58, + "grad_norm": 1.37187105861311, + "learning_rate": 3.9923794570404835e-06, + "loss": 0.3035, + "step": 19916 + }, + { + "epoch": 0.58, + "grad_norm": 1.2945618085573618, + "learning_rate": 3.991919390567591e-06, + "loss": 0.2851, + "step": 19917 + }, + { + "epoch": 0.58, + "grad_norm": 1.3770159448704347, + "learning_rate": 3.991459332991021e-06, + "loss": 0.2786, + "step": 19918 + }, + { + "epoch": 0.58, + "grad_norm": 1.24185882629637, + "learning_rate": 3.990999284314834e-06, + "loss": 0.2957, + "step": 19919 + }, + { + "epoch": 0.58, + "grad_norm": 1.2687313993338354, + "learning_rate": 3.990539244543089e-06, + "loss": 0.3112, + "step": 19920 + }, + { + "epoch": 0.58, + "grad_norm": 1.247864824549665, + "learning_rate": 3.990079213679847e-06, + "loss": 0.2887, + "step": 19921 + }, + { + "epoch": 0.58, + "grad_norm": 1.3534153150527752, + "learning_rate": 3.989619191729167e-06, + "loss": 0.3034, + "step": 19922 + }, + { + "epoch": 0.58, + "grad_norm": 1.3054163315097562, + "learning_rate": 3.989159178695108e-06, + "loss": 0.298, + "step": 19923 + }, + { + "epoch": 0.58, + "grad_norm": 1.2279658648425276, + "learning_rate": 3.988699174581733e-06, + "loss": 0.2875, + "step": 19924 + }, + { + "epoch": 0.58, + "grad_norm": 1.3631838875526792, + "learning_rate": 3.988239179393096e-06, + "loss": 0.2968, + "step": 19925 + }, + { + "epoch": 0.58, + "grad_norm": 1.6252842462481494, + "learning_rate": 3.987779193133261e-06, + "loss": 0.2816, + "step": 19926 + }, + { + "epoch": 0.58, + "grad_norm": 1.454684592017812, + "learning_rate": 3.9873192158062854e-06, + "loss": 0.2989, + "step": 19927 + }, + { + "epoch": 0.58, + "grad_norm": 1.2503202387338364, + "learning_rate": 3.98685924741623e-06, + "loss": 0.271, + "step": 19928 + }, + { + "epoch": 0.58, + "grad_norm": 1.525695743276774, + "learning_rate": 3.986399287967152e-06, + "loss": 0.3332, + "step": 19929 + }, + { + "epoch": 0.58, + "grad_norm": 1.9126104459313764, + "learning_rate": 3.98593933746311e-06, + "loss": 0.294, + "step": 19930 + }, + { + "epoch": 0.58, + "grad_norm": 1.301060317942303, + "learning_rate": 3.985479395908167e-06, + "loss": 0.2944, + "step": 19931 + }, + { + "epoch": 0.58, + "grad_norm": 1.3234306263171531, + "learning_rate": 3.9850194633063785e-06, + "loss": 0.2852, + "step": 19932 + }, + { + "epoch": 0.58, + "grad_norm": 1.3297609122268879, + "learning_rate": 3.984559539661805e-06, + "loss": 0.3196, + "step": 19933 + }, + { + "epoch": 0.58, + "grad_norm": 1.1896513613170139, + "learning_rate": 3.984099624978507e-06, + "loss": 0.2926, + "step": 19934 + }, + { + "epoch": 0.58, + "grad_norm": 1.3177303306447974, + "learning_rate": 3.983639719260539e-06, + "loss": 0.3074, + "step": 19935 + }, + { + "epoch": 0.58, + "grad_norm": 1.4475515900227922, + "learning_rate": 3.98317982251196e-06, + "loss": 0.3318, + "step": 19936 + }, + { + "epoch": 0.58, + "grad_norm": 1.8018669380631267, + "learning_rate": 3.982719934736832e-06, + "loss": 0.2972, + "step": 19937 + }, + { + "epoch": 0.58, + "grad_norm": 1.3682121096165851, + "learning_rate": 3.982260055939212e-06, + "loss": 0.2854, + "step": 19938 + }, + { + "epoch": 0.58, + "grad_norm": 1.3412077814560879, + "learning_rate": 3.981800186123158e-06, + "loss": 0.2968, + "step": 19939 + }, + { + "epoch": 0.58, + "grad_norm": 2.223641304907366, + "learning_rate": 3.981340325292729e-06, + "loss": 0.2978, + "step": 19940 + }, + { + "epoch": 0.58, + "grad_norm": 1.291590533646557, + "learning_rate": 3.980880473451982e-06, + "loss": 0.3192, + "step": 19941 + }, + { + "epoch": 0.58, + "grad_norm": 1.194328865596269, + "learning_rate": 3.980420630604976e-06, + "loss": 0.3009, + "step": 19942 + }, + { + "epoch": 0.58, + "grad_norm": 1.3101824841105472, + "learning_rate": 3.979960796755771e-06, + "loss": 0.3139, + "step": 19943 + }, + { + "epoch": 0.58, + "grad_norm": 1.5566199945966144, + "learning_rate": 3.9795009719084225e-06, + "loss": 0.2815, + "step": 19944 + }, + { + "epoch": 0.58, + "grad_norm": 1.375243576274095, + "learning_rate": 3.979041156066989e-06, + "loss": 0.3089, + "step": 19945 + }, + { + "epoch": 0.58, + "grad_norm": 1.5778343066422882, + "learning_rate": 3.978581349235527e-06, + "loss": 0.294, + "step": 19946 + }, + { + "epoch": 0.58, + "grad_norm": 1.7737855218057672, + "learning_rate": 3.978121551418099e-06, + "loss": 0.2857, + "step": 19947 + }, + { + "epoch": 0.58, + "grad_norm": 1.1705308056620498, + "learning_rate": 3.977661762618758e-06, + "loss": 0.282, + "step": 19948 + }, + { + "epoch": 0.58, + "grad_norm": 1.235532518970951, + "learning_rate": 3.977201982841563e-06, + "loss": 0.2983, + "step": 19949 + }, + { + "epoch": 0.58, + "grad_norm": 1.4164076269468802, + "learning_rate": 3.976742212090572e-06, + "loss": 0.2966, + "step": 19950 + }, + { + "epoch": 0.58, + "grad_norm": 1.414215509488403, + "learning_rate": 3.976282450369843e-06, + "loss": 0.3105, + "step": 19951 + }, + { + "epoch": 0.58, + "grad_norm": 1.2277519107970494, + "learning_rate": 3.9758226976834325e-06, + "loss": 0.2953, + "step": 19952 + }, + { + "epoch": 0.58, + "grad_norm": 1.2109947023339502, + "learning_rate": 3.975362954035398e-06, + "loss": 0.2802, + "step": 19953 + }, + { + "epoch": 0.58, + "grad_norm": 1.3993648042990776, + "learning_rate": 3.974903219429796e-06, + "loss": 0.3132, + "step": 19954 + }, + { + "epoch": 0.58, + "grad_norm": 1.2866781848847344, + "learning_rate": 3.974443493870684e-06, + "loss": 0.307, + "step": 19955 + }, + { + "epoch": 0.58, + "grad_norm": 1.4028840170888863, + "learning_rate": 3.973983777362122e-06, + "loss": 0.2944, + "step": 19956 + }, + { + "epoch": 0.58, + "grad_norm": 1.4650892188485216, + "learning_rate": 3.9735240699081634e-06, + "loss": 0.3593, + "step": 19957 + }, + { + "epoch": 0.58, + "grad_norm": 1.9924715243100564, + "learning_rate": 3.9730643715128655e-06, + "loss": 0.3182, + "step": 19958 + }, + { + "epoch": 0.58, + "grad_norm": 1.5875296230788365, + "learning_rate": 3.9726046821802864e-06, + "loss": 0.3206, + "step": 19959 + }, + { + "epoch": 0.58, + "grad_norm": 1.2988041257179421, + "learning_rate": 3.972145001914484e-06, + "loss": 0.296, + "step": 19960 + }, + { + "epoch": 0.58, + "grad_norm": 1.2006238530230247, + "learning_rate": 3.971685330719511e-06, + "loss": 0.2838, + "step": 19961 + }, + { + "epoch": 0.58, + "grad_norm": 1.5466503971524745, + "learning_rate": 3.971225668599426e-06, + "loss": 0.2849, + "step": 19962 + }, + { + "epoch": 0.58, + "grad_norm": 1.3748457220246681, + "learning_rate": 3.97076601555829e-06, + "loss": 0.3005, + "step": 19963 + }, + { + "epoch": 0.58, + "grad_norm": 1.433577844875798, + "learning_rate": 3.970306371600152e-06, + "loss": 0.2772, + "step": 19964 + }, + { + "epoch": 0.58, + "grad_norm": 1.7517092995855288, + "learning_rate": 3.9698467367290715e-06, + "loss": 0.315, + "step": 19965 + }, + { + "epoch": 0.58, + "grad_norm": 1.2849151352233668, + "learning_rate": 3.969387110949107e-06, + "loss": 0.362, + "step": 19966 + }, + { + "epoch": 0.58, + "grad_norm": 1.4529276239784028, + "learning_rate": 3.968927494264311e-06, + "loss": 0.2811, + "step": 19967 + }, + { + "epoch": 0.58, + "grad_norm": 1.9422068197977205, + "learning_rate": 3.968467886678741e-06, + "loss": 0.3161, + "step": 19968 + }, + { + "epoch": 0.58, + "grad_norm": 1.4242871596328237, + "learning_rate": 3.9680082881964536e-06, + "loss": 0.3144, + "step": 19969 + }, + { + "epoch": 0.58, + "grad_norm": 1.449697882103568, + "learning_rate": 3.967548698821506e-06, + "loss": 0.3258, + "step": 19970 + }, + { + "epoch": 0.58, + "grad_norm": 1.3742224008064758, + "learning_rate": 3.967089118557951e-06, + "loss": 0.3013, + "step": 19971 + }, + { + "epoch": 0.58, + "grad_norm": 0.9513646680490399, + "learning_rate": 3.966629547409846e-06, + "loss": 0.5738, + "step": 19972 + }, + { + "epoch": 0.58, + "grad_norm": 1.7782605857899938, + "learning_rate": 3.966169985381247e-06, + "loss": 0.3268, + "step": 19973 + }, + { + "epoch": 0.58, + "grad_norm": 3.0297687832019116, + "learning_rate": 3.965710432476209e-06, + "loss": 0.3079, + "step": 19974 + }, + { + "epoch": 0.58, + "grad_norm": 1.212257996709289, + "learning_rate": 3.965250888698789e-06, + "loss": 0.302, + "step": 19975 + }, + { + "epoch": 0.58, + "grad_norm": 1.4467396911637043, + "learning_rate": 3.964791354053041e-06, + "loss": 0.2873, + "step": 19976 + }, + { + "epoch": 0.58, + "grad_norm": 1.2805390565950772, + "learning_rate": 3.96433182854302e-06, + "loss": 0.2945, + "step": 19977 + }, + { + "epoch": 0.58, + "grad_norm": 1.4458111770286373, + "learning_rate": 3.963872312172783e-06, + "loss": 0.3115, + "step": 19978 + }, + { + "epoch": 0.58, + "grad_norm": 1.3141973276631058, + "learning_rate": 3.963412804946384e-06, + "loss": 0.3176, + "step": 19979 + }, + { + "epoch": 0.58, + "grad_norm": 1.9235719166327012, + "learning_rate": 3.962953306867878e-06, + "loss": 0.2945, + "step": 19980 + }, + { + "epoch": 0.58, + "grad_norm": 1.1815632060217267, + "learning_rate": 3.962493817941322e-06, + "loss": 0.282, + "step": 19981 + }, + { + "epoch": 0.58, + "grad_norm": 1.3400479104645786, + "learning_rate": 3.962034338170768e-06, + "loss": 0.3003, + "step": 19982 + }, + { + "epoch": 0.58, + "grad_norm": 1.6948401869346403, + "learning_rate": 3.961574867560274e-06, + "loss": 0.2899, + "step": 19983 + }, + { + "epoch": 0.58, + "grad_norm": 1.276920941589561, + "learning_rate": 3.9611154061138915e-06, + "loss": 0.2959, + "step": 19984 + }, + { + "epoch": 0.58, + "grad_norm": 1.231804046974079, + "learning_rate": 3.960655953835678e-06, + "loss": 0.3037, + "step": 19985 + }, + { + "epoch": 0.58, + "grad_norm": 1.2510479505910788, + "learning_rate": 3.960196510729687e-06, + "loss": 0.2989, + "step": 19986 + }, + { + "epoch": 0.58, + "grad_norm": 1.2789593163594561, + "learning_rate": 3.959737076799974e-06, + "loss": 0.3004, + "step": 19987 + }, + { + "epoch": 0.58, + "grad_norm": 1.3714924739006165, + "learning_rate": 3.959277652050593e-06, + "loss": 0.2855, + "step": 19988 + }, + { + "epoch": 0.58, + "grad_norm": 1.2854570638646226, + "learning_rate": 3.958818236485596e-06, + "loss": 0.2836, + "step": 19989 + }, + { + "epoch": 0.58, + "grad_norm": 1.2536053525353685, + "learning_rate": 3.958358830109041e-06, + "loss": 0.3053, + "step": 19990 + }, + { + "epoch": 0.58, + "grad_norm": 2.351571694733114, + "learning_rate": 3.957899432924984e-06, + "loss": 0.287, + "step": 19991 + }, + { + "epoch": 0.58, + "grad_norm": 1.2698994896027678, + "learning_rate": 3.957440044937473e-06, + "loss": 0.3033, + "step": 19992 + }, + { + "epoch": 0.58, + "grad_norm": 1.3426015450537414, + "learning_rate": 3.956980666150565e-06, + "loss": 0.2857, + "step": 19993 + }, + { + "epoch": 0.58, + "grad_norm": 1.4930199951139174, + "learning_rate": 3.956521296568315e-06, + "loss": 0.2914, + "step": 19994 + }, + { + "epoch": 0.58, + "grad_norm": 1.1965641393787787, + "learning_rate": 3.956061936194775e-06, + "loss": 0.3036, + "step": 19995 + }, + { + "epoch": 0.58, + "grad_norm": 1.2431213156375815, + "learning_rate": 3.955602585034001e-06, + "loss": 0.2908, + "step": 19996 + }, + { + "epoch": 0.58, + "grad_norm": 5.030024341204155, + "learning_rate": 3.9551432430900446e-06, + "loss": 0.2991, + "step": 19997 + }, + { + "epoch": 0.58, + "grad_norm": 1.2563199635776938, + "learning_rate": 3.954683910366963e-06, + "loss": 0.2931, + "step": 19998 + }, + { + "epoch": 0.58, + "grad_norm": 1.657375426997724, + "learning_rate": 3.954224586868805e-06, + "loss": 0.2966, + "step": 19999 + }, + { + "epoch": 0.58, + "grad_norm": 1.3026966012829475, + "learning_rate": 3.953765272599628e-06, + "loss": 0.3135, + "step": 20000 + }, + { + "epoch": 0.58, + "grad_norm": 1.4476353729829234, + "learning_rate": 3.953305967563484e-06, + "loss": 0.2826, + "step": 20001 + }, + { + "epoch": 0.58, + "grad_norm": 1.3785259822694425, + "learning_rate": 3.952846671764426e-06, + "loss": 0.2996, + "step": 20002 + }, + { + "epoch": 0.58, + "grad_norm": 1.332165877975984, + "learning_rate": 3.952387385206508e-06, + "loss": 0.2654, + "step": 20003 + }, + { + "epoch": 0.58, + "grad_norm": 1.3530514287939561, + "learning_rate": 3.951928107893783e-06, + "loss": 0.305, + "step": 20004 + }, + { + "epoch": 0.58, + "grad_norm": 1.2586884159875102, + "learning_rate": 3.9514688398303035e-06, + "loss": 0.297, + "step": 20005 + }, + { + "epoch": 0.58, + "grad_norm": 1.2683386400035868, + "learning_rate": 3.9510095810201245e-06, + "loss": 0.3025, + "step": 20006 + }, + { + "epoch": 0.58, + "grad_norm": 1.3712489164652029, + "learning_rate": 3.950550331467296e-06, + "loss": 0.2878, + "step": 20007 + }, + { + "epoch": 0.58, + "grad_norm": 1.3428639352474498, + "learning_rate": 3.9500910911758725e-06, + "loss": 0.2741, + "step": 20008 + }, + { + "epoch": 0.58, + "grad_norm": 1.3014501209755835, + "learning_rate": 3.949631860149907e-06, + "loss": 0.3029, + "step": 20009 + }, + { + "epoch": 0.58, + "grad_norm": 1.3722662074776606, + "learning_rate": 3.949172638393452e-06, + "loss": 0.2984, + "step": 20010 + }, + { + "epoch": 0.58, + "grad_norm": 1.3312826107283742, + "learning_rate": 3.948713425910562e-06, + "loss": 0.3085, + "step": 20011 + }, + { + "epoch": 0.58, + "grad_norm": 1.4206893680003534, + "learning_rate": 3.9482542227052855e-06, + "loss": 0.3201, + "step": 20012 + }, + { + "epoch": 0.58, + "grad_norm": 1.5195539223366337, + "learning_rate": 3.947795028781678e-06, + "loss": 0.2804, + "step": 20013 + }, + { + "epoch": 0.58, + "grad_norm": 1.3193133567377522, + "learning_rate": 3.94733584414379e-06, + "loss": 0.3259, + "step": 20014 + }, + { + "epoch": 0.58, + "grad_norm": 1.2810181651960646, + "learning_rate": 3.946876668795677e-06, + "loss": 0.3106, + "step": 20015 + }, + { + "epoch": 0.58, + "grad_norm": 1.380703594443551, + "learning_rate": 3.946417502741388e-06, + "loss": 0.2874, + "step": 20016 + }, + { + "epoch": 0.58, + "grad_norm": 1.1989612276731338, + "learning_rate": 3.945958345984976e-06, + "loss": 0.2793, + "step": 20017 + }, + { + "epoch": 0.58, + "grad_norm": 1.2929976348577294, + "learning_rate": 3.945499198530492e-06, + "loss": 0.287, + "step": 20018 + }, + { + "epoch": 0.58, + "grad_norm": 1.2856667727041522, + "learning_rate": 3.945040060381993e-06, + "loss": 0.2844, + "step": 20019 + }, + { + "epoch": 0.58, + "grad_norm": 1.4629326148607824, + "learning_rate": 3.944580931543524e-06, + "loss": 0.2937, + "step": 20020 + }, + { + "epoch": 0.58, + "grad_norm": 1.6093323476725165, + "learning_rate": 3.944121812019141e-06, + "loss": 0.3074, + "step": 20021 + }, + { + "epoch": 0.58, + "grad_norm": 1.542193083615532, + "learning_rate": 3.943662701812895e-06, + "loss": 0.2906, + "step": 20022 + }, + { + "epoch": 0.58, + "grad_norm": 1.3485752001940932, + "learning_rate": 3.943203600928837e-06, + "loss": 0.2828, + "step": 20023 + }, + { + "epoch": 0.58, + "grad_norm": 1.3406702383862024, + "learning_rate": 3.942744509371018e-06, + "loss": 0.3052, + "step": 20024 + }, + { + "epoch": 0.58, + "grad_norm": 1.3812332216748153, + "learning_rate": 3.942285427143492e-06, + "loss": 0.3449, + "step": 20025 + }, + { + "epoch": 0.58, + "grad_norm": 2.0094690179367607, + "learning_rate": 3.941826354250307e-06, + "loss": 0.3012, + "step": 20026 + }, + { + "epoch": 0.58, + "grad_norm": 1.5798512917930323, + "learning_rate": 3.941367290695518e-06, + "loss": 0.2878, + "step": 20027 + }, + { + "epoch": 0.58, + "grad_norm": 1.396681998247213, + "learning_rate": 3.940908236483173e-06, + "loss": 0.2813, + "step": 20028 + }, + { + "epoch": 0.58, + "grad_norm": 1.242824641260438, + "learning_rate": 3.940449191617325e-06, + "loss": 0.3166, + "step": 20029 + }, + { + "epoch": 0.58, + "grad_norm": 1.7077024380172696, + "learning_rate": 3.939990156102026e-06, + "loss": 0.3067, + "step": 20030 + }, + { + "epoch": 0.58, + "grad_norm": 1.4143347687250543, + "learning_rate": 3.9395311299413245e-06, + "loss": 0.3057, + "step": 20031 + }, + { + "epoch": 0.58, + "grad_norm": 1.3093267192293374, + "learning_rate": 3.939072113139272e-06, + "loss": 0.2885, + "step": 20032 + }, + { + "epoch": 0.58, + "grad_norm": 1.1686747635792298, + "learning_rate": 3.93861310569992e-06, + "loss": 0.3008, + "step": 20033 + }, + { + "epoch": 0.58, + "grad_norm": 3.12577487094862, + "learning_rate": 3.93815410762732e-06, + "loss": 0.3069, + "step": 20034 + }, + { + "epoch": 0.58, + "grad_norm": 1.2848646353278639, + "learning_rate": 3.937695118925521e-06, + "loss": 0.297, + "step": 20035 + }, + { + "epoch": 0.58, + "grad_norm": 1.3171316229509702, + "learning_rate": 3.9372361395985746e-06, + "loss": 0.3034, + "step": 20036 + }, + { + "epoch": 0.58, + "grad_norm": 1.2941776386985695, + "learning_rate": 3.9367771696505304e-06, + "loss": 0.2987, + "step": 20037 + }, + { + "epoch": 0.58, + "grad_norm": 1.3892021029158643, + "learning_rate": 3.936318209085441e-06, + "loss": 0.2885, + "step": 20038 + }, + { + "epoch": 0.58, + "grad_norm": 1.4357888761711004, + "learning_rate": 3.935859257907354e-06, + "loss": 0.2894, + "step": 20039 + }, + { + "epoch": 0.58, + "grad_norm": 1.288650267168835, + "learning_rate": 3.9354003161203205e-06, + "loss": 0.3082, + "step": 20040 + }, + { + "epoch": 0.58, + "grad_norm": 1.372149537892625, + "learning_rate": 3.9349413837283925e-06, + "loss": 0.2987, + "step": 20041 + }, + { + "epoch": 0.58, + "grad_norm": 1.3275617867516742, + "learning_rate": 3.934482460735618e-06, + "loss": 0.3085, + "step": 20042 + }, + { + "epoch": 0.58, + "grad_norm": 1.2791371081728973, + "learning_rate": 3.934023547146048e-06, + "loss": 0.3082, + "step": 20043 + }, + { + "epoch": 0.58, + "grad_norm": 1.3317369022385968, + "learning_rate": 3.9335646429637306e-06, + "loss": 0.2823, + "step": 20044 + }, + { + "epoch": 0.58, + "grad_norm": 1.4821090143865883, + "learning_rate": 3.9331057481927175e-06, + "loss": 0.3026, + "step": 20045 + }, + { + "epoch": 0.58, + "grad_norm": 0.9099556096526861, + "learning_rate": 3.9326468628370585e-06, + "loss": 0.5791, + "step": 20046 + }, + { + "epoch": 0.58, + "grad_norm": 1.3565004623046644, + "learning_rate": 3.932187986900804e-06, + "loss": 0.294, + "step": 20047 + }, + { + "epoch": 0.58, + "grad_norm": 1.2239069456269103, + "learning_rate": 3.931729120388001e-06, + "loss": 0.2834, + "step": 20048 + }, + { + "epoch": 0.58, + "grad_norm": 0.9920429144155531, + "learning_rate": 3.931270263302701e-06, + "loss": 0.6265, + "step": 20049 + }, + { + "epoch": 0.58, + "grad_norm": 0.9120691539119168, + "learning_rate": 3.930811415648952e-06, + "loss": 0.5555, + "step": 20050 + }, + { + "epoch": 0.58, + "grad_norm": 1.3132781376472762, + "learning_rate": 3.930352577430804e-06, + "loss": 0.2985, + "step": 20051 + }, + { + "epoch": 0.58, + "grad_norm": 1.3790387984465333, + "learning_rate": 3.929893748652306e-06, + "loss": 0.2927, + "step": 20052 + }, + { + "epoch": 0.58, + "grad_norm": 1.4645075505402325, + "learning_rate": 3.9294349293175094e-06, + "loss": 0.3083, + "step": 20053 + }, + { + "epoch": 0.58, + "grad_norm": 1.7897777564486819, + "learning_rate": 3.928976119430461e-06, + "loss": 0.3045, + "step": 20054 + }, + { + "epoch": 0.58, + "grad_norm": 1.3969569217454292, + "learning_rate": 3.928517318995209e-06, + "loss": 0.303, + "step": 20055 + }, + { + "epoch": 0.58, + "grad_norm": 1.3111306967253606, + "learning_rate": 3.928058528015803e-06, + "loss": 0.2953, + "step": 20056 + }, + { + "epoch": 0.58, + "grad_norm": 1.3250169111268009, + "learning_rate": 3.927599746496294e-06, + "loss": 0.2934, + "step": 20057 + }, + { + "epoch": 0.58, + "grad_norm": 1.3750629875239393, + "learning_rate": 3.927140974440729e-06, + "loss": 0.3078, + "step": 20058 + }, + { + "epoch": 0.58, + "grad_norm": 1.2896295920124703, + "learning_rate": 3.926682211853155e-06, + "loss": 0.2969, + "step": 20059 + }, + { + "epoch": 0.58, + "grad_norm": 1.197671561315865, + "learning_rate": 3.926223458737624e-06, + "loss": 0.2915, + "step": 20060 + }, + { + "epoch": 0.58, + "grad_norm": 1.3300960237426052, + "learning_rate": 3.925764715098184e-06, + "loss": 0.3054, + "step": 20061 + }, + { + "epoch": 0.58, + "grad_norm": 0.935380807015978, + "learning_rate": 3.9253059809388806e-06, + "loss": 0.621, + "step": 20062 + }, + { + "epoch": 0.58, + "grad_norm": 1.6388918489518112, + "learning_rate": 3.924847256263763e-06, + "loss": 0.3011, + "step": 20063 + }, + { + "epoch": 0.58, + "grad_norm": 1.3006817436509337, + "learning_rate": 3.924388541076882e-06, + "loss": 0.2809, + "step": 20064 + }, + { + "epoch": 0.58, + "grad_norm": 1.7814513374883396, + "learning_rate": 3.923929835382283e-06, + "loss": 0.2846, + "step": 20065 + }, + { + "epoch": 0.58, + "grad_norm": 1.4502545144798853, + "learning_rate": 3.9234711391840164e-06, + "loss": 0.2968, + "step": 20066 + }, + { + "epoch": 0.58, + "grad_norm": 1.3353197189032333, + "learning_rate": 3.923012452486128e-06, + "loss": 0.3023, + "step": 20067 + }, + { + "epoch": 0.58, + "grad_norm": 1.422856144508634, + "learning_rate": 3.922553775292667e-06, + "loss": 0.3258, + "step": 20068 + }, + { + "epoch": 0.58, + "grad_norm": 1.645361643839217, + "learning_rate": 3.92209510760768e-06, + "loss": 0.3084, + "step": 20069 + }, + { + "epoch": 0.58, + "grad_norm": 1.3167218491844104, + "learning_rate": 3.921636449435217e-06, + "loss": 0.293, + "step": 20070 + }, + { + "epoch": 0.58, + "grad_norm": 1.3392820635488627, + "learning_rate": 3.9211778007793234e-06, + "loss": 0.3187, + "step": 20071 + }, + { + "epoch": 0.58, + "grad_norm": 1.2091555489747796, + "learning_rate": 3.920719161644047e-06, + "loss": 0.2917, + "step": 20072 + }, + { + "epoch": 0.58, + "grad_norm": 0.997288417331528, + "learning_rate": 3.920260532033437e-06, + "loss": 0.5929, + "step": 20073 + }, + { + "epoch": 0.58, + "grad_norm": 1.3979038235777408, + "learning_rate": 3.9198019119515416e-06, + "loss": 0.3379, + "step": 20074 + }, + { + "epoch": 0.58, + "grad_norm": 1.3992162265011423, + "learning_rate": 3.919343301402404e-06, + "loss": 0.3083, + "step": 20075 + }, + { + "epoch": 0.58, + "grad_norm": 1.6139739133498285, + "learning_rate": 3.9188847003900755e-06, + "loss": 0.3036, + "step": 20076 + }, + { + "epoch": 0.58, + "grad_norm": 1.3388438908249918, + "learning_rate": 3.9184261089185995e-06, + "loss": 0.3038, + "step": 20077 + }, + { + "epoch": 0.58, + "grad_norm": 1.233921513141657, + "learning_rate": 3.917967526992026e-06, + "loss": 0.2883, + "step": 20078 + }, + { + "epoch": 0.58, + "grad_norm": 0.9335760348939233, + "learning_rate": 3.917508954614401e-06, + "loss": 0.593, + "step": 20079 + }, + { + "epoch": 0.58, + "grad_norm": 1.347337119276784, + "learning_rate": 3.917050391789772e-06, + "loss": 0.3025, + "step": 20080 + }, + { + "epoch": 0.58, + "grad_norm": 1.6169073425330316, + "learning_rate": 3.916591838522184e-06, + "loss": 0.2962, + "step": 20081 + }, + { + "epoch": 0.58, + "grad_norm": 1.3042987259910221, + "learning_rate": 3.9161332948156855e-06, + "loss": 0.3015, + "step": 20082 + }, + { + "epoch": 0.58, + "grad_norm": 1.3223537167655461, + "learning_rate": 3.9156747606743225e-06, + "loss": 0.3005, + "step": 20083 + }, + { + "epoch": 0.58, + "grad_norm": 1.3506831280036486, + "learning_rate": 3.915216236102141e-06, + "loss": 0.3248, + "step": 20084 + }, + { + "epoch": 0.58, + "grad_norm": 1.26685390816991, + "learning_rate": 3.914757721103191e-06, + "loss": 0.3019, + "step": 20085 + }, + { + "epoch": 0.58, + "grad_norm": 1.283678782027346, + "learning_rate": 3.914299215681514e-06, + "loss": 0.2973, + "step": 20086 + }, + { + "epoch": 0.58, + "grad_norm": 1.381861215588168, + "learning_rate": 3.913840719841159e-06, + "loss": 0.2901, + "step": 20087 + }, + { + "epoch": 0.58, + "grad_norm": 1.8826127384693236, + "learning_rate": 3.913382233586171e-06, + "loss": 0.3156, + "step": 20088 + }, + { + "epoch": 0.58, + "grad_norm": 1.2701601493713255, + "learning_rate": 3.912923756920599e-06, + "loss": 0.2984, + "step": 20089 + }, + { + "epoch": 0.58, + "grad_norm": 1.2169902322090618, + "learning_rate": 3.912465289848485e-06, + "loss": 0.2901, + "step": 20090 + }, + { + "epoch": 0.58, + "grad_norm": 1.595068383141339, + "learning_rate": 3.912006832373877e-06, + "loss": 0.2762, + "step": 20091 + }, + { + "epoch": 0.58, + "grad_norm": 1.4650149692549126, + "learning_rate": 3.911548384500821e-06, + "loss": 0.2969, + "step": 20092 + }, + { + "epoch": 0.58, + "grad_norm": 1.468588557648325, + "learning_rate": 3.911089946233364e-06, + "loss": 0.2879, + "step": 20093 + }, + { + "epoch": 0.58, + "grad_norm": 1.395342909572073, + "learning_rate": 3.910631517575548e-06, + "loss": 0.2895, + "step": 20094 + }, + { + "epoch": 0.58, + "grad_norm": 1.81379971783796, + "learning_rate": 3.910173098531422e-06, + "loss": 0.2805, + "step": 20095 + }, + { + "epoch": 0.58, + "grad_norm": 1.3214303291740763, + "learning_rate": 3.90971468910503e-06, + "loss": 0.2888, + "step": 20096 + }, + { + "epoch": 0.58, + "grad_norm": 1.236819104995205, + "learning_rate": 3.909256289300419e-06, + "loss": 0.3012, + "step": 20097 + }, + { + "epoch": 0.58, + "grad_norm": 1.3121545787192719, + "learning_rate": 3.908797899121632e-06, + "loss": 0.3146, + "step": 20098 + }, + { + "epoch": 0.58, + "grad_norm": 1.3182110497190416, + "learning_rate": 3.908339518572717e-06, + "loss": 0.318, + "step": 20099 + }, + { + "epoch": 0.58, + "grad_norm": 1.3848478418649408, + "learning_rate": 3.907881147657717e-06, + "loss": 0.2715, + "step": 20100 + }, + { + "epoch": 0.58, + "grad_norm": 1.5465371416174132, + "learning_rate": 3.907422786380678e-06, + "loss": 0.3049, + "step": 20101 + }, + { + "epoch": 0.58, + "grad_norm": 1.5667283110221164, + "learning_rate": 3.906964434745646e-06, + "loss": 0.2945, + "step": 20102 + }, + { + "epoch": 0.58, + "grad_norm": 1.4065889817255128, + "learning_rate": 3.906506092756666e-06, + "loss": 0.3021, + "step": 20103 + }, + { + "epoch": 0.58, + "grad_norm": 1.262817521826851, + "learning_rate": 3.9060477604177795e-06, + "loss": 0.3059, + "step": 20104 + }, + { + "epoch": 0.58, + "grad_norm": 1.2251969464255195, + "learning_rate": 3.905589437733034e-06, + "loss": 0.3098, + "step": 20105 + }, + { + "epoch": 0.58, + "grad_norm": 1.1556750869478414, + "learning_rate": 3.905131124706474e-06, + "loss": 0.3027, + "step": 20106 + }, + { + "epoch": 0.58, + "grad_norm": 1.3035377370200567, + "learning_rate": 3.904672821342143e-06, + "loss": 0.3174, + "step": 20107 + }, + { + "epoch": 0.58, + "grad_norm": 1.4268563915114192, + "learning_rate": 3.904214527644089e-06, + "loss": 0.2925, + "step": 20108 + }, + { + "epoch": 0.58, + "grad_norm": 1.2418466515382585, + "learning_rate": 3.903756243616352e-06, + "loss": 0.2943, + "step": 20109 + }, + { + "epoch": 0.58, + "grad_norm": 1.3027935690868526, + "learning_rate": 3.903297969262979e-06, + "loss": 0.3061, + "step": 20110 + }, + { + "epoch": 0.58, + "grad_norm": 1.2783022918842035, + "learning_rate": 3.902839704588013e-06, + "loss": 0.295, + "step": 20111 + }, + { + "epoch": 0.58, + "grad_norm": 1.401141818461412, + "learning_rate": 3.902381449595501e-06, + "loss": 0.308, + "step": 20112 + }, + { + "epoch": 0.58, + "grad_norm": 1.238644339776449, + "learning_rate": 3.901923204289482e-06, + "loss": 0.3012, + "step": 20113 + }, + { + "epoch": 0.58, + "grad_norm": 1.2480735165141181, + "learning_rate": 3.901464968674003e-06, + "loss": 0.3004, + "step": 20114 + }, + { + "epoch": 0.58, + "grad_norm": 1.3611296612874777, + "learning_rate": 3.901006742753109e-06, + "loss": 0.2959, + "step": 20115 + }, + { + "epoch": 0.58, + "grad_norm": 1.291688275942131, + "learning_rate": 3.900548526530843e-06, + "loss": 0.2997, + "step": 20116 + }, + { + "epoch": 0.58, + "grad_norm": 1.7176718999916147, + "learning_rate": 3.900090320011248e-06, + "loss": 0.2951, + "step": 20117 + }, + { + "epoch": 0.58, + "grad_norm": 1.3230377527527888, + "learning_rate": 3.899632123198368e-06, + "loss": 0.2869, + "step": 20118 + }, + { + "epoch": 0.58, + "grad_norm": 1.2694242769183148, + "learning_rate": 3.899173936096247e-06, + "loss": 0.3045, + "step": 20119 + }, + { + "epoch": 0.58, + "grad_norm": 1.7888839025033878, + "learning_rate": 3.898715758708928e-06, + "loss": 0.2853, + "step": 20120 + }, + { + "epoch": 0.58, + "grad_norm": 1.32323068288644, + "learning_rate": 3.898257591040456e-06, + "loss": 0.288, + "step": 20121 + }, + { + "epoch": 0.58, + "grad_norm": 1.2539555570446888, + "learning_rate": 3.89779943309487e-06, + "loss": 0.2969, + "step": 20122 + }, + { + "epoch": 0.58, + "grad_norm": 1.157714085506715, + "learning_rate": 3.8973412848762185e-06, + "loss": 0.2777, + "step": 20123 + }, + { + "epoch": 0.58, + "grad_norm": 1.4508652388134038, + "learning_rate": 3.896883146388541e-06, + "loss": 0.3054, + "step": 20124 + }, + { + "epoch": 0.58, + "grad_norm": 1.296093122817245, + "learning_rate": 3.896425017635884e-06, + "loss": 0.3111, + "step": 20125 + }, + { + "epoch": 0.58, + "grad_norm": 1.2487945577027124, + "learning_rate": 3.895966898622287e-06, + "loss": 0.3031, + "step": 20126 + }, + { + "epoch": 0.58, + "grad_norm": 1.1484058892564752, + "learning_rate": 3.895508789351794e-06, + "loss": 0.2801, + "step": 20127 + }, + { + "epoch": 0.58, + "grad_norm": 1.3369975884414036, + "learning_rate": 3.8950506898284486e-06, + "loss": 0.3022, + "step": 20128 + }, + { + "epoch": 0.58, + "grad_norm": 1.2912811304642933, + "learning_rate": 3.894592600056294e-06, + "loss": 0.3009, + "step": 20129 + }, + { + "epoch": 0.58, + "grad_norm": 1.2121099608502852, + "learning_rate": 3.894134520039371e-06, + "loss": 0.2877, + "step": 20130 + }, + { + "epoch": 0.58, + "grad_norm": 1.4823145735784593, + "learning_rate": 3.893676449781725e-06, + "loss": 0.3071, + "step": 20131 + }, + { + "epoch": 0.58, + "grad_norm": 1.3545882632594324, + "learning_rate": 3.893218389287395e-06, + "loss": 0.3278, + "step": 20132 + }, + { + "epoch": 0.58, + "grad_norm": 1.4801761473592008, + "learning_rate": 3.892760338560424e-06, + "loss": 0.2901, + "step": 20133 + }, + { + "epoch": 0.58, + "grad_norm": 1.2823940639672715, + "learning_rate": 3.892302297604856e-06, + "loss": 0.3063, + "step": 20134 + }, + { + "epoch": 0.58, + "grad_norm": 1.239328841249346, + "learning_rate": 3.8918442664247345e-06, + "loss": 0.299, + "step": 20135 + }, + { + "epoch": 0.58, + "grad_norm": 1.4014358444757984, + "learning_rate": 3.891386245024098e-06, + "loss": 0.3019, + "step": 20136 + }, + { + "epoch": 0.58, + "grad_norm": 1.266658883691201, + "learning_rate": 3.890928233406989e-06, + "loss": 0.3077, + "step": 20137 + }, + { + "epoch": 0.58, + "grad_norm": 1.2299368977172582, + "learning_rate": 3.8904702315774515e-06, + "loss": 0.2895, + "step": 20138 + }, + { + "epoch": 0.58, + "grad_norm": 1.4086496113255549, + "learning_rate": 3.890012239539528e-06, + "loss": 0.3315, + "step": 20139 + }, + { + "epoch": 0.58, + "grad_norm": 1.318895061095738, + "learning_rate": 3.889554257297257e-06, + "loss": 0.2858, + "step": 20140 + }, + { + "epoch": 0.58, + "grad_norm": 1.2815320429170298, + "learning_rate": 3.889096284854683e-06, + "loss": 0.3171, + "step": 20141 + }, + { + "epoch": 0.58, + "grad_norm": 1.3255213899008973, + "learning_rate": 3.888638322215845e-06, + "loss": 0.3014, + "step": 20142 + }, + { + "epoch": 0.58, + "grad_norm": 1.245277963409047, + "learning_rate": 3.888180369384787e-06, + "loss": 0.2878, + "step": 20143 + }, + { + "epoch": 0.58, + "grad_norm": 1.334732399104049, + "learning_rate": 3.887722426365551e-06, + "loss": 0.2967, + "step": 20144 + }, + { + "epoch": 0.58, + "grad_norm": 1.346188640929344, + "learning_rate": 3.887264493162175e-06, + "loss": 0.3164, + "step": 20145 + }, + { + "epoch": 0.58, + "grad_norm": 1.398042351186375, + "learning_rate": 3.886806569778703e-06, + "loss": 0.3099, + "step": 20146 + }, + { + "epoch": 0.58, + "grad_norm": 1.3062488766941958, + "learning_rate": 3.8863486562191756e-06, + "loss": 0.2858, + "step": 20147 + }, + { + "epoch": 0.58, + "grad_norm": 1.3274689181195354, + "learning_rate": 3.885890752487634e-06, + "loss": 0.2992, + "step": 20148 + }, + { + "epoch": 0.58, + "grad_norm": 1.3814770011236157, + "learning_rate": 3.885432858588118e-06, + "loss": 0.286, + "step": 20149 + }, + { + "epoch": 0.58, + "grad_norm": 1.3071386165992236, + "learning_rate": 3.88497497452467e-06, + "loss": 0.3123, + "step": 20150 + }, + { + "epoch": 0.58, + "grad_norm": 1.7224999101155334, + "learning_rate": 3.884517100301329e-06, + "loss": 0.2799, + "step": 20151 + }, + { + "epoch": 0.58, + "grad_norm": 1.3418847959725442, + "learning_rate": 3.884059235922139e-06, + "loss": 0.2996, + "step": 20152 + }, + { + "epoch": 0.58, + "grad_norm": 1.2604631902695622, + "learning_rate": 3.883601381391137e-06, + "loss": 0.2818, + "step": 20153 + }, + { + "epoch": 0.58, + "grad_norm": 1.270571135886208, + "learning_rate": 3.883143536712366e-06, + "loss": 0.3021, + "step": 20154 + }, + { + "epoch": 0.58, + "grad_norm": 1.3248144108641524, + "learning_rate": 3.882685701889865e-06, + "loss": 0.2996, + "step": 20155 + }, + { + "epoch": 0.58, + "grad_norm": 1.2774361547840223, + "learning_rate": 3.882227876927675e-06, + "loss": 0.3008, + "step": 20156 + }, + { + "epoch": 0.58, + "grad_norm": 1.20582278675293, + "learning_rate": 3.881770061829838e-06, + "loss": 0.3127, + "step": 20157 + }, + { + "epoch": 0.58, + "grad_norm": 1.2275799218786045, + "learning_rate": 3.8813122566003905e-06, + "loss": 0.3138, + "step": 20158 + }, + { + "epoch": 0.58, + "grad_norm": 1.3092786756312649, + "learning_rate": 3.880854461243378e-06, + "loss": 0.3061, + "step": 20159 + }, + { + "epoch": 0.58, + "grad_norm": 1.3276770415443684, + "learning_rate": 3.8803966757628345e-06, + "loss": 0.2763, + "step": 20160 + }, + { + "epoch": 0.58, + "grad_norm": 1.026896013365169, + "learning_rate": 3.879938900162803e-06, + "loss": 0.6259, + "step": 20161 + }, + { + "epoch": 0.58, + "grad_norm": 1.446041359364295, + "learning_rate": 3.8794811344473234e-06, + "loss": 0.2869, + "step": 20162 + }, + { + "epoch": 0.58, + "grad_norm": 1.284806060570756, + "learning_rate": 3.879023378620436e-06, + "loss": 0.2732, + "step": 20163 + }, + { + "epoch": 0.58, + "grad_norm": 1.7816730278951949, + "learning_rate": 3.878565632686178e-06, + "loss": 0.3028, + "step": 20164 + }, + { + "epoch": 0.58, + "grad_norm": 1.4316603880229934, + "learning_rate": 3.878107896648592e-06, + "loss": 0.2913, + "step": 20165 + }, + { + "epoch": 0.58, + "grad_norm": 1.199203733659498, + "learning_rate": 3.877650170511716e-06, + "loss": 0.2837, + "step": 20166 + }, + { + "epoch": 0.58, + "grad_norm": 1.6723589310406655, + "learning_rate": 3.87719245427959e-06, + "loss": 0.3073, + "step": 20167 + }, + { + "epoch": 0.58, + "grad_norm": 1.6104132786792218, + "learning_rate": 3.876734747956252e-06, + "loss": 0.3112, + "step": 20168 + }, + { + "epoch": 0.58, + "grad_norm": 1.6617873616658945, + "learning_rate": 3.876277051545743e-06, + "loss": 0.2898, + "step": 20169 + }, + { + "epoch": 0.59, + "grad_norm": 1.296461696832956, + "learning_rate": 3.875819365052101e-06, + "loss": 0.2956, + "step": 20170 + }, + { + "epoch": 0.59, + "grad_norm": 1.4658330249617086, + "learning_rate": 3.875361688479367e-06, + "loss": 0.3152, + "step": 20171 + }, + { + "epoch": 0.59, + "grad_norm": 1.4551629513557567, + "learning_rate": 3.874904021831577e-06, + "loss": 0.304, + "step": 20172 + }, + { + "epoch": 0.59, + "grad_norm": 1.1906271937731374, + "learning_rate": 3.874446365112772e-06, + "loss": 0.3125, + "step": 20173 + }, + { + "epoch": 0.59, + "grad_norm": 1.2569493622545278, + "learning_rate": 3.87398871832699e-06, + "loss": 0.2928, + "step": 20174 + }, + { + "epoch": 0.59, + "grad_norm": 1.3527167606749848, + "learning_rate": 3.873531081478272e-06, + "loss": 0.3388, + "step": 20175 + }, + { + "epoch": 0.59, + "grad_norm": 0.9627761519073716, + "learning_rate": 3.873073454570653e-06, + "loss": 0.5713, + "step": 20176 + }, + { + "epoch": 0.59, + "grad_norm": 1.204611450794819, + "learning_rate": 3.872615837608173e-06, + "loss": 0.2935, + "step": 20177 + }, + { + "epoch": 0.59, + "grad_norm": 1.3131601800517891, + "learning_rate": 3.872158230594871e-06, + "loss": 0.2898, + "step": 20178 + }, + { + "epoch": 0.59, + "grad_norm": 1.4403254403103656, + "learning_rate": 3.871700633534784e-06, + "loss": 0.2697, + "step": 20179 + }, + { + "epoch": 0.59, + "grad_norm": 0.905070205371866, + "learning_rate": 3.871243046431954e-06, + "loss": 0.5512, + "step": 20180 + }, + { + "epoch": 0.59, + "grad_norm": 1.3192502433483742, + "learning_rate": 3.870785469290415e-06, + "loss": 0.2952, + "step": 20181 + }, + { + "epoch": 0.59, + "grad_norm": 1.284151034493876, + "learning_rate": 3.870327902114207e-06, + "loss": 0.2942, + "step": 20182 + }, + { + "epoch": 0.59, + "grad_norm": 1.2946597298731537, + "learning_rate": 3.869870344907368e-06, + "loss": 0.2882, + "step": 20183 + }, + { + "epoch": 0.59, + "grad_norm": 1.3622226631652332, + "learning_rate": 3.869412797673936e-06, + "loss": 0.3403, + "step": 20184 + }, + { + "epoch": 0.59, + "grad_norm": 1.2989590353309466, + "learning_rate": 3.868955260417948e-06, + "loss": 0.3031, + "step": 20185 + }, + { + "epoch": 0.59, + "grad_norm": 1.3050424973731785, + "learning_rate": 3.868497733143443e-06, + "loss": 0.3137, + "step": 20186 + }, + { + "epoch": 0.59, + "grad_norm": 2.83875758387636, + "learning_rate": 3.86804021585446e-06, + "loss": 0.287, + "step": 20187 + }, + { + "epoch": 0.59, + "grad_norm": 1.3157957673765586, + "learning_rate": 3.8675827085550315e-06, + "loss": 0.2928, + "step": 20188 + }, + { + "epoch": 0.59, + "grad_norm": 1.4149222590131498, + "learning_rate": 3.867125211249199e-06, + "loss": 0.2724, + "step": 20189 + }, + { + "epoch": 0.59, + "grad_norm": 1.2302837970963718, + "learning_rate": 3.866667723940999e-06, + "loss": 0.2972, + "step": 20190 + }, + { + "epoch": 0.59, + "grad_norm": 1.2730647071336847, + "learning_rate": 3.866210246634469e-06, + "loss": 0.309, + "step": 20191 + }, + { + "epoch": 0.59, + "grad_norm": 1.4356957382167077, + "learning_rate": 3.865752779333646e-06, + "loss": 0.3101, + "step": 20192 + }, + { + "epoch": 0.59, + "grad_norm": 1.3669977668883844, + "learning_rate": 3.8652953220425675e-06, + "loss": 0.2813, + "step": 20193 + }, + { + "epoch": 0.59, + "grad_norm": 1.2928197253806046, + "learning_rate": 3.864837874765271e-06, + "loss": 0.2861, + "step": 20194 + }, + { + "epoch": 0.59, + "grad_norm": 1.2766129653385168, + "learning_rate": 3.864380437505791e-06, + "loss": 0.2876, + "step": 20195 + }, + { + "epoch": 0.59, + "grad_norm": 0.9421455271102958, + "learning_rate": 3.863923010268168e-06, + "loss": 0.597, + "step": 20196 + }, + { + "epoch": 0.59, + "grad_norm": 1.2996192540187654, + "learning_rate": 3.863465593056436e-06, + "loss": 0.2881, + "step": 20197 + }, + { + "epoch": 0.59, + "grad_norm": 1.2310444806850773, + "learning_rate": 3.863008185874632e-06, + "loss": 0.3057, + "step": 20198 + }, + { + "epoch": 0.59, + "grad_norm": 1.3284008954506363, + "learning_rate": 3.8625507887267965e-06, + "loss": 0.2983, + "step": 20199 + }, + { + "epoch": 0.59, + "grad_norm": 1.461332776276499, + "learning_rate": 3.86209340161696e-06, + "loss": 0.3102, + "step": 20200 + }, + { + "epoch": 0.59, + "grad_norm": 1.1915237319042307, + "learning_rate": 3.861636024549163e-06, + "loss": 0.2909, + "step": 20201 + }, + { + "epoch": 0.59, + "grad_norm": 1.3976239289266836, + "learning_rate": 3.86117865752744e-06, + "loss": 0.2858, + "step": 20202 + }, + { + "epoch": 0.59, + "grad_norm": 1.1869619429319362, + "learning_rate": 3.86072130055583e-06, + "loss": 0.3001, + "step": 20203 + }, + { + "epoch": 0.59, + "grad_norm": 1.4822934159706083, + "learning_rate": 3.860263953638365e-06, + "loss": 0.3174, + "step": 20204 + }, + { + "epoch": 0.59, + "grad_norm": 1.4012816508966375, + "learning_rate": 3.859806616779084e-06, + "loss": 0.335, + "step": 20205 + }, + { + "epoch": 0.59, + "grad_norm": 1.5254504375305626, + "learning_rate": 3.859349289982022e-06, + "loss": 0.3019, + "step": 20206 + }, + { + "epoch": 0.59, + "grad_norm": 1.3044021908427403, + "learning_rate": 3.858891973251216e-06, + "loss": 0.296, + "step": 20207 + }, + { + "epoch": 0.59, + "grad_norm": 1.3533047599138577, + "learning_rate": 3.8584346665907e-06, + "loss": 0.2916, + "step": 20208 + }, + { + "epoch": 0.59, + "grad_norm": 1.2048869784947447, + "learning_rate": 3.8579773700045105e-06, + "loss": 0.3309, + "step": 20209 + }, + { + "epoch": 0.59, + "grad_norm": 1.2184030537108672, + "learning_rate": 3.857520083496683e-06, + "loss": 0.299, + "step": 20210 + }, + { + "epoch": 0.59, + "grad_norm": 1.6631379500523482, + "learning_rate": 3.857062807071256e-06, + "loss": 0.2895, + "step": 20211 + }, + { + "epoch": 0.59, + "grad_norm": 1.1914569165975788, + "learning_rate": 3.856605540732261e-06, + "loss": 0.3005, + "step": 20212 + }, + { + "epoch": 0.59, + "grad_norm": 1.2805448004792495, + "learning_rate": 3.856148284483734e-06, + "loss": 0.2901, + "step": 20213 + }, + { + "epoch": 0.59, + "grad_norm": 1.3249588357906263, + "learning_rate": 3.855691038329711e-06, + "loss": 0.287, + "step": 20214 + }, + { + "epoch": 0.59, + "grad_norm": 0.9287037180922966, + "learning_rate": 3.85523380227423e-06, + "loss": 0.6088, + "step": 20215 + }, + { + "epoch": 0.59, + "grad_norm": 1.247214823080007, + "learning_rate": 3.854776576321322e-06, + "loss": 0.2904, + "step": 20216 + }, + { + "epoch": 0.59, + "grad_norm": 1.2825688601977112, + "learning_rate": 3.854319360475022e-06, + "loss": 0.3219, + "step": 20217 + }, + { + "epoch": 0.59, + "grad_norm": 1.1859210788275516, + "learning_rate": 3.853862154739368e-06, + "loss": 0.2813, + "step": 20218 + }, + { + "epoch": 0.59, + "grad_norm": 1.578018623938614, + "learning_rate": 3.853404959118392e-06, + "loss": 0.324, + "step": 20219 + }, + { + "epoch": 0.59, + "grad_norm": 1.2538628475927809, + "learning_rate": 3.85294777361613e-06, + "loss": 0.2905, + "step": 20220 + }, + { + "epoch": 0.59, + "grad_norm": 1.4109606265305354, + "learning_rate": 3.852490598236616e-06, + "loss": 0.2874, + "step": 20221 + }, + { + "epoch": 0.59, + "grad_norm": 1.2072186569883652, + "learning_rate": 3.852033432983887e-06, + "loss": 0.3046, + "step": 20222 + }, + { + "epoch": 0.59, + "grad_norm": 1.2713643728054713, + "learning_rate": 3.8515762778619745e-06, + "loss": 0.2923, + "step": 20223 + }, + { + "epoch": 0.59, + "grad_norm": 1.4436536214304125, + "learning_rate": 3.8511191328749145e-06, + "loss": 0.3301, + "step": 20224 + }, + { + "epoch": 0.59, + "grad_norm": 1.364147042410391, + "learning_rate": 3.85066199802674e-06, + "loss": 0.2869, + "step": 20225 + }, + { + "epoch": 0.59, + "grad_norm": 1.4942964710944495, + "learning_rate": 3.850204873321488e-06, + "loss": 0.3006, + "step": 20226 + }, + { + "epoch": 0.59, + "grad_norm": 1.252746386049245, + "learning_rate": 3.849747758763189e-06, + "loss": 0.2991, + "step": 20227 + }, + { + "epoch": 0.59, + "grad_norm": 1.3366429426980768, + "learning_rate": 3.849290654355879e-06, + "loss": 0.3102, + "step": 20228 + }, + { + "epoch": 0.59, + "grad_norm": 1.502462095971946, + "learning_rate": 3.848833560103592e-06, + "loss": 0.2926, + "step": 20229 + }, + { + "epoch": 0.59, + "grad_norm": 1.476659439050792, + "learning_rate": 3.848376476010363e-06, + "loss": 0.3053, + "step": 20230 + }, + { + "epoch": 0.59, + "grad_norm": 1.261602708012713, + "learning_rate": 3.847919402080222e-06, + "loss": 0.3271, + "step": 20231 + }, + { + "epoch": 0.59, + "grad_norm": 1.2385264881003875, + "learning_rate": 3.847462338317207e-06, + "loss": 0.2923, + "step": 20232 + }, + { + "epoch": 0.59, + "grad_norm": 1.252258058297813, + "learning_rate": 3.847005284725348e-06, + "loss": 0.2825, + "step": 20233 + }, + { + "epoch": 0.59, + "grad_norm": 0.9926368797494723, + "learning_rate": 3.846548241308681e-06, + "loss": 0.5939, + "step": 20234 + }, + { + "epoch": 0.59, + "grad_norm": 1.2023994527364261, + "learning_rate": 3.8460912080712395e-06, + "loss": 0.2928, + "step": 20235 + }, + { + "epoch": 0.59, + "grad_norm": 1.7667177420258988, + "learning_rate": 3.845634185017054e-06, + "loss": 0.3081, + "step": 20236 + }, + { + "epoch": 0.59, + "grad_norm": 1.2705037835846316, + "learning_rate": 3.845177172150161e-06, + "loss": 0.3029, + "step": 20237 + }, + { + "epoch": 0.59, + "grad_norm": 1.28310873226012, + "learning_rate": 3.844720169474591e-06, + "loss": 0.2926, + "step": 20238 + }, + { + "epoch": 0.59, + "grad_norm": 1.4782391120123415, + "learning_rate": 3.84426317699438e-06, + "loss": 0.3047, + "step": 20239 + }, + { + "epoch": 0.59, + "grad_norm": 1.4277928470243249, + "learning_rate": 3.843806194713559e-06, + "loss": 0.3105, + "step": 20240 + }, + { + "epoch": 0.59, + "grad_norm": 1.4613692040397512, + "learning_rate": 3.84334922263616e-06, + "loss": 0.3297, + "step": 20241 + }, + { + "epoch": 0.59, + "grad_norm": 1.6168666965389331, + "learning_rate": 3.842892260766217e-06, + "loss": 0.2768, + "step": 20242 + }, + { + "epoch": 0.59, + "grad_norm": 1.4316880794834193, + "learning_rate": 3.8424353091077645e-06, + "loss": 0.3135, + "step": 20243 + }, + { + "epoch": 0.59, + "grad_norm": 1.2117898650643653, + "learning_rate": 3.8419783676648326e-06, + "loss": 0.2814, + "step": 20244 + }, + { + "epoch": 0.59, + "grad_norm": 1.6008993853946833, + "learning_rate": 3.841521436441455e-06, + "loss": 0.2886, + "step": 20245 + }, + { + "epoch": 0.59, + "grad_norm": 1.2193751041761498, + "learning_rate": 3.841064515441661e-06, + "loss": 0.2866, + "step": 20246 + }, + { + "epoch": 0.59, + "grad_norm": 1.1517810912969455, + "learning_rate": 3.840607604669487e-06, + "loss": 0.2702, + "step": 20247 + }, + { + "epoch": 0.59, + "grad_norm": 1.1764845563702953, + "learning_rate": 3.840150704128963e-06, + "loss": 0.284, + "step": 20248 + }, + { + "epoch": 0.59, + "grad_norm": 1.2340446453865055, + "learning_rate": 3.839693813824123e-06, + "loss": 0.3021, + "step": 20249 + }, + { + "epoch": 0.59, + "grad_norm": 1.3227889542817715, + "learning_rate": 3.839236933758998e-06, + "loss": 0.2868, + "step": 20250 + }, + { + "epoch": 0.59, + "grad_norm": 1.2276889153640391, + "learning_rate": 3.838780063937619e-06, + "loss": 0.2824, + "step": 20251 + }, + { + "epoch": 0.59, + "grad_norm": 1.6718145545779457, + "learning_rate": 3.838323204364019e-06, + "loss": 0.3188, + "step": 20252 + }, + { + "epoch": 0.59, + "grad_norm": 1.1662566101953582, + "learning_rate": 3.83786635504223e-06, + "loss": 0.2756, + "step": 20253 + }, + { + "epoch": 0.59, + "grad_norm": 1.3511831304810782, + "learning_rate": 3.837409515976284e-06, + "loss": 0.2977, + "step": 20254 + }, + { + "epoch": 0.59, + "grad_norm": 1.309029598365446, + "learning_rate": 3.83695268717021e-06, + "loss": 0.2884, + "step": 20255 + }, + { + "epoch": 0.59, + "grad_norm": 2.0478852980134046, + "learning_rate": 3.836495868628042e-06, + "loss": 0.3137, + "step": 20256 + }, + { + "epoch": 0.59, + "grad_norm": 1.3927942660108905, + "learning_rate": 3.836039060353812e-06, + "loss": 0.3244, + "step": 20257 + }, + { + "epoch": 0.59, + "grad_norm": 1.3380296674154866, + "learning_rate": 3.83558226235155e-06, + "loss": 0.2935, + "step": 20258 + }, + { + "epoch": 0.59, + "grad_norm": 1.4323307943405408, + "learning_rate": 3.835125474625286e-06, + "loss": 0.3013, + "step": 20259 + }, + { + "epoch": 0.59, + "grad_norm": 1.2255416187219401, + "learning_rate": 3.8346686971790545e-06, + "loss": 0.2934, + "step": 20260 + }, + { + "epoch": 0.59, + "grad_norm": 1.3714565025985863, + "learning_rate": 3.834211930016883e-06, + "loss": 0.3101, + "step": 20261 + }, + { + "epoch": 0.59, + "grad_norm": 1.3975653284353877, + "learning_rate": 3.833755173142806e-06, + "loss": 0.3284, + "step": 20262 + }, + { + "epoch": 0.59, + "grad_norm": 0.9279669553560018, + "learning_rate": 3.833298426560851e-06, + "loss": 0.5781, + "step": 20263 + }, + { + "epoch": 0.59, + "grad_norm": 1.2253523166268712, + "learning_rate": 3.832841690275051e-06, + "loss": 0.2911, + "step": 20264 + }, + { + "epoch": 0.59, + "grad_norm": 0.9538035459090048, + "learning_rate": 3.832384964289435e-06, + "loss": 0.6166, + "step": 20265 + }, + { + "epoch": 0.59, + "grad_norm": 1.3127752048419459, + "learning_rate": 3.831928248608036e-06, + "loss": 0.2974, + "step": 20266 + }, + { + "epoch": 0.59, + "grad_norm": 1.7929311798191256, + "learning_rate": 3.831471543234883e-06, + "loss": 0.3027, + "step": 20267 + }, + { + "epoch": 0.59, + "grad_norm": 1.3742022764595325, + "learning_rate": 3.831014848174006e-06, + "loss": 0.2822, + "step": 20268 + }, + { + "epoch": 0.59, + "grad_norm": 1.182323130073015, + "learning_rate": 3.830558163429436e-06, + "loss": 0.2838, + "step": 20269 + }, + { + "epoch": 0.59, + "grad_norm": 1.3129575207333204, + "learning_rate": 3.830101489005203e-06, + "loss": 0.2992, + "step": 20270 + }, + { + "epoch": 0.59, + "grad_norm": 1.3770020974320247, + "learning_rate": 3.829644824905339e-06, + "loss": 0.3079, + "step": 20271 + }, + { + "epoch": 0.59, + "grad_norm": 1.3044472953198438, + "learning_rate": 3.829188171133873e-06, + "loss": 0.2675, + "step": 20272 + }, + { + "epoch": 0.59, + "grad_norm": 1.188412464055325, + "learning_rate": 3.828731527694831e-06, + "loss": 0.2741, + "step": 20273 + }, + { + "epoch": 0.59, + "grad_norm": 1.1688792807227633, + "learning_rate": 3.828274894592248e-06, + "loss": 0.3074, + "step": 20274 + }, + { + "epoch": 0.59, + "grad_norm": 0.9146298461330944, + "learning_rate": 3.8278182718301515e-06, + "loss": 0.5795, + "step": 20275 + }, + { + "epoch": 0.59, + "grad_norm": 1.6353725691012386, + "learning_rate": 3.827361659412572e-06, + "loss": 0.2914, + "step": 20276 + }, + { + "epoch": 0.59, + "grad_norm": 1.3375152840225148, + "learning_rate": 3.826905057343539e-06, + "loss": 0.3257, + "step": 20277 + }, + { + "epoch": 0.59, + "grad_norm": 1.3454788803566708, + "learning_rate": 3.8264484656270825e-06, + "loss": 0.3246, + "step": 20278 + }, + { + "epoch": 0.59, + "grad_norm": 1.179098675895963, + "learning_rate": 3.82599188426723e-06, + "loss": 0.2681, + "step": 20279 + }, + { + "epoch": 0.59, + "grad_norm": 1.3164929065369642, + "learning_rate": 3.825535313268013e-06, + "loss": 0.3062, + "step": 20280 + }, + { + "epoch": 0.59, + "grad_norm": 1.3839077988764092, + "learning_rate": 3.8250787526334595e-06, + "loss": 0.2927, + "step": 20281 + }, + { + "epoch": 0.59, + "grad_norm": 1.4012863006485086, + "learning_rate": 3.824622202367599e-06, + "loss": 0.2941, + "step": 20282 + }, + { + "epoch": 0.59, + "grad_norm": 1.3509400111726582, + "learning_rate": 3.824165662474461e-06, + "loss": 0.3117, + "step": 20283 + }, + { + "epoch": 0.59, + "grad_norm": 5.390319478405501, + "learning_rate": 3.823709132958073e-06, + "loss": 0.3223, + "step": 20284 + }, + { + "epoch": 0.59, + "grad_norm": 1.2401058817767185, + "learning_rate": 3.823252613822467e-06, + "loss": 0.2942, + "step": 20285 + }, + { + "epoch": 0.59, + "grad_norm": 1.2656983478366801, + "learning_rate": 3.822796105071668e-06, + "loss": 0.2914, + "step": 20286 + }, + { + "epoch": 0.59, + "grad_norm": 1.7383605029317157, + "learning_rate": 3.8223396067097074e-06, + "loss": 0.2867, + "step": 20287 + }, + { + "epoch": 0.59, + "grad_norm": 1.463450948780554, + "learning_rate": 3.821883118740612e-06, + "loss": 0.3166, + "step": 20288 + }, + { + "epoch": 0.59, + "grad_norm": 1.393994852957325, + "learning_rate": 3.821426641168411e-06, + "loss": 0.2889, + "step": 20289 + }, + { + "epoch": 0.59, + "grad_norm": 1.4780755873686244, + "learning_rate": 3.8209701739971344e-06, + "loss": 0.3069, + "step": 20290 + }, + { + "epoch": 0.59, + "grad_norm": 1.4884497679608741, + "learning_rate": 3.820513717230809e-06, + "loss": 0.3249, + "step": 20291 + }, + { + "epoch": 0.59, + "grad_norm": 1.147805635885765, + "learning_rate": 3.8200572708734615e-06, + "loss": 0.2903, + "step": 20292 + }, + { + "epoch": 0.59, + "grad_norm": 1.2217194312409114, + "learning_rate": 3.8196008349291215e-06, + "loss": 0.2946, + "step": 20293 + }, + { + "epoch": 0.59, + "grad_norm": 1.2495235141937064, + "learning_rate": 3.819144409401819e-06, + "loss": 0.2919, + "step": 20294 + }, + { + "epoch": 0.59, + "grad_norm": 0.9234158365543439, + "learning_rate": 3.81868799429558e-06, + "loss": 0.5348, + "step": 20295 + }, + { + "epoch": 0.59, + "grad_norm": 1.3879866671997874, + "learning_rate": 3.818231589614432e-06, + "loss": 0.2777, + "step": 20296 + }, + { + "epoch": 0.59, + "grad_norm": 1.4721588483595136, + "learning_rate": 3.817775195362402e-06, + "loss": 0.3008, + "step": 20297 + }, + { + "epoch": 0.59, + "grad_norm": 1.1788058726901294, + "learning_rate": 3.817318811543522e-06, + "loss": 0.296, + "step": 20298 + }, + { + "epoch": 0.59, + "grad_norm": 1.215096970785496, + "learning_rate": 3.816862438161814e-06, + "loss": 0.2884, + "step": 20299 + }, + { + "epoch": 0.59, + "grad_norm": 1.3603060903861635, + "learning_rate": 3.816406075221311e-06, + "loss": 0.2905, + "step": 20300 + }, + { + "epoch": 0.59, + "grad_norm": 1.1583089448796804, + "learning_rate": 3.815949722726036e-06, + "loss": 0.2849, + "step": 20301 + }, + { + "epoch": 0.59, + "grad_norm": 1.2432775756666028, + "learning_rate": 3.815493380680016e-06, + "loss": 0.2799, + "step": 20302 + }, + { + "epoch": 0.59, + "grad_norm": 1.566485994134433, + "learning_rate": 3.815037049087282e-06, + "loss": 0.3077, + "step": 20303 + }, + { + "epoch": 0.59, + "grad_norm": 1.4320084264043929, + "learning_rate": 3.814580727951859e-06, + "loss": 0.2785, + "step": 20304 + }, + { + "epoch": 0.59, + "grad_norm": 1.6181581265115739, + "learning_rate": 3.8141244172777743e-06, + "loss": 0.3456, + "step": 20305 + }, + { + "epoch": 0.59, + "grad_norm": 1.2717551522966872, + "learning_rate": 3.813668117069054e-06, + "loss": 0.2875, + "step": 20306 + }, + { + "epoch": 0.59, + "grad_norm": 1.2868907624858368, + "learning_rate": 3.8132118273297258e-06, + "loss": 0.3096, + "step": 20307 + }, + { + "epoch": 0.59, + "grad_norm": 1.2236241143062563, + "learning_rate": 3.8127555480638174e-06, + "loss": 0.2931, + "step": 20308 + }, + { + "epoch": 0.59, + "grad_norm": 1.778028901499522, + "learning_rate": 3.8122992792753534e-06, + "loss": 0.3369, + "step": 20309 + }, + { + "epoch": 0.59, + "grad_norm": 1.424621333023832, + "learning_rate": 3.8118430209683614e-06, + "loss": 0.3046, + "step": 20310 + }, + { + "epoch": 0.59, + "grad_norm": 1.4976487519805268, + "learning_rate": 3.8113867731468677e-06, + "loss": 0.2879, + "step": 20311 + }, + { + "epoch": 0.59, + "grad_norm": 1.4045523537879137, + "learning_rate": 3.8109305358148996e-06, + "loss": 0.2819, + "step": 20312 + }, + { + "epoch": 0.59, + "grad_norm": 1.332641611536086, + "learning_rate": 3.8104743089764835e-06, + "loss": 0.3141, + "step": 20313 + }, + { + "epoch": 0.59, + "grad_norm": 1.5573858799414666, + "learning_rate": 3.8100180926356435e-06, + "loss": 0.3161, + "step": 20314 + }, + { + "epoch": 0.59, + "grad_norm": 1.211417409883731, + "learning_rate": 3.8095618867964074e-06, + "loss": 0.28, + "step": 20315 + }, + { + "epoch": 0.59, + "grad_norm": 1.3706944868195439, + "learning_rate": 3.8091056914628007e-06, + "loss": 0.3203, + "step": 20316 + }, + { + "epoch": 0.59, + "grad_norm": 1.542600499121541, + "learning_rate": 3.80864950663885e-06, + "loss": 0.2853, + "step": 20317 + }, + { + "epoch": 0.59, + "grad_norm": 1.3109282755771674, + "learning_rate": 3.8081933323285804e-06, + "loss": 0.2786, + "step": 20318 + }, + { + "epoch": 0.59, + "grad_norm": 1.2427013957503226, + "learning_rate": 3.8077371685360174e-06, + "loss": 0.2863, + "step": 20319 + }, + { + "epoch": 0.59, + "grad_norm": 1.3210339264637785, + "learning_rate": 3.807281015265187e-06, + "loss": 0.3067, + "step": 20320 + }, + { + "epoch": 0.59, + "grad_norm": 1.5219969519763938, + "learning_rate": 3.806824872520116e-06, + "loss": 0.3086, + "step": 20321 + }, + { + "epoch": 0.59, + "grad_norm": 1.7323237397995248, + "learning_rate": 3.8063687403048283e-06, + "loss": 0.2865, + "step": 20322 + }, + { + "epoch": 0.59, + "grad_norm": 1.4504460817868532, + "learning_rate": 3.805912618623349e-06, + "loss": 0.3072, + "step": 20323 + }, + { + "epoch": 0.59, + "grad_norm": 1.6495118699473754, + "learning_rate": 3.8054565074797045e-06, + "loss": 0.3021, + "step": 20324 + }, + { + "epoch": 0.59, + "grad_norm": 1.2173552201204274, + "learning_rate": 3.8050004068779196e-06, + "loss": 0.2712, + "step": 20325 + }, + { + "epoch": 0.59, + "grad_norm": 1.3042862319836301, + "learning_rate": 3.8045443168220205e-06, + "loss": 0.3081, + "step": 20326 + }, + { + "epoch": 0.59, + "grad_norm": 1.2941194006419565, + "learning_rate": 3.8040882373160294e-06, + "loss": 0.2986, + "step": 20327 + }, + { + "epoch": 0.59, + "grad_norm": 1.2805262289173405, + "learning_rate": 3.8036321683639752e-06, + "loss": 0.2701, + "step": 20328 + }, + { + "epoch": 0.59, + "grad_norm": 1.2770773839741132, + "learning_rate": 3.8031761099698785e-06, + "loss": 0.3209, + "step": 20329 + }, + { + "epoch": 0.59, + "grad_norm": 1.258401871125642, + "learning_rate": 3.802720062137766e-06, + "loss": 0.306, + "step": 20330 + }, + { + "epoch": 0.59, + "grad_norm": 1.298456044201901, + "learning_rate": 3.802264024871662e-06, + "loss": 0.2893, + "step": 20331 + }, + { + "epoch": 0.59, + "grad_norm": 1.572861073148735, + "learning_rate": 3.8018079981755922e-06, + "loss": 0.3171, + "step": 20332 + }, + { + "epoch": 0.59, + "grad_norm": 1.2716633998859468, + "learning_rate": 3.8013519820535793e-06, + "loss": 0.3083, + "step": 20333 + }, + { + "epoch": 0.59, + "grad_norm": 1.185698992744724, + "learning_rate": 3.800895976509648e-06, + "loss": 0.2806, + "step": 20334 + }, + { + "epoch": 0.59, + "grad_norm": 1.2417464250806967, + "learning_rate": 3.800439981547824e-06, + "loss": 0.3025, + "step": 20335 + }, + { + "epoch": 0.59, + "grad_norm": 1.2223008816432162, + "learning_rate": 3.799983997172131e-06, + "loss": 0.2872, + "step": 20336 + }, + { + "epoch": 0.59, + "grad_norm": 1.3116912605567554, + "learning_rate": 3.7995280233865912e-06, + "loss": 0.301, + "step": 20337 + }, + { + "epoch": 0.59, + "grad_norm": 1.1605411583125054, + "learning_rate": 3.7990720601952297e-06, + "loss": 0.3278, + "step": 20338 + }, + { + "epoch": 0.59, + "grad_norm": 1.4050073563989847, + "learning_rate": 3.798616107602071e-06, + "loss": 0.3069, + "step": 20339 + }, + { + "epoch": 0.59, + "grad_norm": 1.4444211633235429, + "learning_rate": 3.79816016561114e-06, + "loss": 0.2987, + "step": 20340 + }, + { + "epoch": 0.59, + "grad_norm": 1.3508103118554897, + "learning_rate": 3.7977042342264568e-06, + "loss": 0.2946, + "step": 20341 + }, + { + "epoch": 0.59, + "grad_norm": 1.3829228246817864, + "learning_rate": 3.7972483134520476e-06, + "loss": 0.2743, + "step": 20342 + }, + { + "epoch": 0.59, + "grad_norm": 1.2778303751158777, + "learning_rate": 3.7967924032919357e-06, + "loss": 0.2909, + "step": 20343 + }, + { + "epoch": 0.59, + "grad_norm": 1.4580333659299742, + "learning_rate": 3.796336503750145e-06, + "loss": 0.3034, + "step": 20344 + }, + { + "epoch": 0.59, + "grad_norm": 1.2167478344596827, + "learning_rate": 3.795880614830697e-06, + "loss": 0.3002, + "step": 20345 + }, + { + "epoch": 0.59, + "grad_norm": 1.3634599144241653, + "learning_rate": 3.7954247365376157e-06, + "loss": 0.3028, + "step": 20346 + }, + { + "epoch": 0.59, + "grad_norm": 1.13650504799856, + "learning_rate": 3.7949688688749243e-06, + "loss": 0.2904, + "step": 20347 + }, + { + "epoch": 0.59, + "grad_norm": 1.2525602757577796, + "learning_rate": 3.794513011846647e-06, + "loss": 0.319, + "step": 20348 + }, + { + "epoch": 0.59, + "grad_norm": 1.4230651527470897, + "learning_rate": 3.7940571654568055e-06, + "loss": 0.3032, + "step": 20349 + }, + { + "epoch": 0.59, + "grad_norm": 1.2780745150683082, + "learning_rate": 3.793601329709422e-06, + "loss": 0.2848, + "step": 20350 + }, + { + "epoch": 0.59, + "grad_norm": 1.4016013965769194, + "learning_rate": 3.793145504608521e-06, + "loss": 0.2932, + "step": 20351 + }, + { + "epoch": 0.59, + "grad_norm": 1.2381853481485032, + "learning_rate": 3.792689690158124e-06, + "loss": 0.2745, + "step": 20352 + }, + { + "epoch": 0.59, + "grad_norm": 1.5302667086345914, + "learning_rate": 3.792233886362254e-06, + "loss": 0.3011, + "step": 20353 + }, + { + "epoch": 0.59, + "grad_norm": 1.419363553583497, + "learning_rate": 3.7917780932249337e-06, + "loss": 0.3011, + "step": 20354 + }, + { + "epoch": 0.59, + "grad_norm": 1.3855935276227649, + "learning_rate": 3.7913223107501847e-06, + "loss": 0.2923, + "step": 20355 + }, + { + "epoch": 0.59, + "grad_norm": 1.3277612180119005, + "learning_rate": 3.790866538942032e-06, + "loss": 0.2921, + "step": 20356 + }, + { + "epoch": 0.59, + "grad_norm": 1.2270435898840382, + "learning_rate": 3.790410777804493e-06, + "loss": 0.2908, + "step": 20357 + }, + { + "epoch": 0.59, + "grad_norm": 1.9238650802506196, + "learning_rate": 3.789955027341592e-06, + "loss": 0.3194, + "step": 20358 + }, + { + "epoch": 0.59, + "grad_norm": 1.2463990536849994, + "learning_rate": 3.7894992875573527e-06, + "loss": 0.2639, + "step": 20359 + }, + { + "epoch": 0.59, + "grad_norm": 1.4516042027978828, + "learning_rate": 3.7890435584557942e-06, + "loss": 0.3083, + "step": 20360 + }, + { + "epoch": 0.59, + "grad_norm": 1.4770488662455439, + "learning_rate": 3.7885878400409405e-06, + "loss": 0.3081, + "step": 20361 + }, + { + "epoch": 0.59, + "grad_norm": 1.2751615965470013, + "learning_rate": 3.788132132316812e-06, + "loss": 0.2921, + "step": 20362 + }, + { + "epoch": 0.59, + "grad_norm": 1.854008608608124, + "learning_rate": 3.7876764352874325e-06, + "loss": 0.2904, + "step": 20363 + }, + { + "epoch": 0.59, + "grad_norm": 1.556498135398876, + "learning_rate": 3.78722074895682e-06, + "loss": 0.2866, + "step": 20364 + }, + { + "epoch": 0.59, + "grad_norm": 1.2601581295125446, + "learning_rate": 3.7867650733289983e-06, + "loss": 0.3193, + "step": 20365 + }, + { + "epoch": 0.59, + "grad_norm": 1.8085762293460064, + "learning_rate": 3.786309408407989e-06, + "loss": 0.3062, + "step": 20366 + }, + { + "epoch": 0.59, + "grad_norm": 1.2907127803755847, + "learning_rate": 3.785853754197811e-06, + "loss": 0.2965, + "step": 20367 + }, + { + "epoch": 0.59, + "grad_norm": 1.55789639233711, + "learning_rate": 3.7853981107024897e-06, + "loss": 0.307, + "step": 20368 + }, + { + "epoch": 0.59, + "grad_norm": 1.3243207463332012, + "learning_rate": 3.7849424779260414e-06, + "loss": 0.2929, + "step": 20369 + }, + { + "epoch": 0.59, + "grad_norm": 0.9632475554070898, + "learning_rate": 3.78448685587249e-06, + "loss": 0.5824, + "step": 20370 + }, + { + "epoch": 0.59, + "grad_norm": 0.9385640004811187, + "learning_rate": 3.784031244545855e-06, + "loss": 0.5607, + "step": 20371 + }, + { + "epoch": 0.59, + "grad_norm": 1.3250040736953834, + "learning_rate": 3.783575643950159e-06, + "loss": 0.2998, + "step": 20372 + }, + { + "epoch": 0.59, + "grad_norm": 1.6623770977902854, + "learning_rate": 3.7831200540894208e-06, + "loss": 0.3011, + "step": 20373 + }, + { + "epoch": 0.59, + "grad_norm": 1.3861620488994861, + "learning_rate": 3.782664474967661e-06, + "loss": 0.2839, + "step": 20374 + }, + { + "epoch": 0.59, + "grad_norm": 1.4421693473737391, + "learning_rate": 3.7822089065889012e-06, + "loss": 0.2933, + "step": 20375 + }, + { + "epoch": 0.59, + "grad_norm": 1.2781770022828312, + "learning_rate": 3.781753348957162e-06, + "loss": 0.2888, + "step": 20376 + }, + { + "epoch": 0.59, + "grad_norm": 1.4957509579462793, + "learning_rate": 3.7812978020764625e-06, + "loss": 0.3422, + "step": 20377 + }, + { + "epoch": 0.59, + "grad_norm": 1.3540741510822063, + "learning_rate": 3.7808422659508226e-06, + "loss": 0.2843, + "step": 20378 + }, + { + "epoch": 0.59, + "grad_norm": 1.3405416304915383, + "learning_rate": 3.780386740584264e-06, + "loss": 0.2916, + "step": 20379 + }, + { + "epoch": 0.59, + "grad_norm": 1.3110919628105524, + "learning_rate": 3.7799312259808062e-06, + "loss": 0.2858, + "step": 20380 + }, + { + "epoch": 0.59, + "grad_norm": 1.8106094224437184, + "learning_rate": 3.7794757221444685e-06, + "loss": 0.2944, + "step": 20381 + }, + { + "epoch": 0.59, + "grad_norm": 1.3526955315038471, + "learning_rate": 3.7790202290792706e-06, + "loss": 0.2837, + "step": 20382 + }, + { + "epoch": 0.59, + "grad_norm": 1.2458827744816439, + "learning_rate": 3.7785647467892326e-06, + "loss": 0.3141, + "step": 20383 + }, + { + "epoch": 0.59, + "grad_norm": 1.5592381891847695, + "learning_rate": 3.778109275278377e-06, + "loss": 0.2848, + "step": 20384 + }, + { + "epoch": 0.59, + "grad_norm": 1.7172512941591396, + "learning_rate": 3.777653814550718e-06, + "loss": 0.3014, + "step": 20385 + }, + { + "epoch": 0.59, + "grad_norm": 1.3925918332929306, + "learning_rate": 3.7771983646102784e-06, + "loss": 0.2729, + "step": 20386 + }, + { + "epoch": 0.59, + "grad_norm": 1.3662079930413957, + "learning_rate": 3.7767429254610772e-06, + "loss": 0.3122, + "step": 20387 + }, + { + "epoch": 0.59, + "grad_norm": 1.311638280161633, + "learning_rate": 3.7762874971071323e-06, + "loss": 0.2584, + "step": 20388 + }, + { + "epoch": 0.59, + "grad_norm": 0.9620574289920877, + "learning_rate": 3.775832079552464e-06, + "loss": 0.601, + "step": 20389 + }, + { + "epoch": 0.59, + "grad_norm": 1.2170314223005336, + "learning_rate": 3.775376672801091e-06, + "loss": 0.2882, + "step": 20390 + }, + { + "epoch": 0.59, + "grad_norm": 1.3443422339084898, + "learning_rate": 3.774921276857033e-06, + "loss": 0.2946, + "step": 20391 + }, + { + "epoch": 0.59, + "grad_norm": 1.8272107056909865, + "learning_rate": 3.774465891724308e-06, + "loss": 0.2942, + "step": 20392 + }, + { + "epoch": 0.59, + "grad_norm": 3.3868170332954755, + "learning_rate": 3.7740105174069347e-06, + "loss": 0.2864, + "step": 20393 + }, + { + "epoch": 0.59, + "grad_norm": 1.2054906561520866, + "learning_rate": 3.773555153908933e-06, + "loss": 0.2805, + "step": 20394 + }, + { + "epoch": 0.59, + "grad_norm": 1.3168671112822836, + "learning_rate": 3.7730998012343206e-06, + "loss": 0.3034, + "step": 20395 + }, + { + "epoch": 0.59, + "grad_norm": 1.2818073578945361, + "learning_rate": 3.7726444593871154e-06, + "loss": 0.2942, + "step": 20396 + }, + { + "epoch": 0.59, + "grad_norm": 1.2240076708340015, + "learning_rate": 3.7721891283713363e-06, + "loss": 0.2872, + "step": 20397 + }, + { + "epoch": 0.59, + "grad_norm": 1.3413204674665546, + "learning_rate": 3.7717338081910026e-06, + "loss": 0.2981, + "step": 20398 + }, + { + "epoch": 0.59, + "grad_norm": 1.2472672110349836, + "learning_rate": 3.7712784988501317e-06, + "loss": 0.3052, + "step": 20399 + }, + { + "epoch": 0.59, + "grad_norm": 1.171187810371937, + "learning_rate": 3.770823200352741e-06, + "loss": 0.2747, + "step": 20400 + }, + { + "epoch": 0.59, + "grad_norm": 1.2681372562087385, + "learning_rate": 3.7703679127028497e-06, + "loss": 0.2861, + "step": 20401 + }, + { + "epoch": 0.59, + "grad_norm": 2.4473298399942505, + "learning_rate": 3.769912635904475e-06, + "loss": 0.3003, + "step": 20402 + }, + { + "epoch": 0.59, + "grad_norm": 2.0716753786547906, + "learning_rate": 3.7694573699616343e-06, + "loss": 0.3031, + "step": 20403 + }, + { + "epoch": 0.59, + "grad_norm": 1.2663705014478774, + "learning_rate": 3.7690021148783474e-06, + "loss": 0.2842, + "step": 20404 + }, + { + "epoch": 0.59, + "grad_norm": 1.2164640069497181, + "learning_rate": 3.768546870658629e-06, + "loss": 0.3008, + "step": 20405 + }, + { + "epoch": 0.59, + "grad_norm": 1.020079985496612, + "learning_rate": 3.768091637306499e-06, + "loss": 0.6102, + "step": 20406 + }, + { + "epoch": 0.59, + "grad_norm": 1.3056476654680604, + "learning_rate": 3.7676364148259735e-06, + "loss": 0.311, + "step": 20407 + }, + { + "epoch": 0.59, + "grad_norm": 1.9022185941849512, + "learning_rate": 3.7671812032210716e-06, + "loss": 0.3062, + "step": 20408 + }, + { + "epoch": 0.59, + "grad_norm": 1.3659126789482898, + "learning_rate": 3.766726002495808e-06, + "loss": 0.2623, + "step": 20409 + }, + { + "epoch": 0.59, + "grad_norm": 1.2592886646093762, + "learning_rate": 3.7662708126542015e-06, + "loss": 0.2745, + "step": 20410 + }, + { + "epoch": 0.59, + "grad_norm": 2.4255218716221716, + "learning_rate": 3.7658156337002683e-06, + "loss": 0.34, + "step": 20411 + }, + { + "epoch": 0.59, + "grad_norm": 1.2717105585615303, + "learning_rate": 3.7653604656380276e-06, + "loss": 0.3143, + "step": 20412 + }, + { + "epoch": 0.59, + "grad_norm": 1.201988137842925, + "learning_rate": 3.764905308471493e-06, + "loss": 0.2852, + "step": 20413 + }, + { + "epoch": 0.59, + "grad_norm": 1.3041881919126441, + "learning_rate": 3.7644501622046838e-06, + "loss": 0.2754, + "step": 20414 + }, + { + "epoch": 0.59, + "grad_norm": 1.534945357261202, + "learning_rate": 3.7639950268416143e-06, + "loss": 0.2915, + "step": 20415 + }, + { + "epoch": 0.59, + "grad_norm": 1.3494596577533207, + "learning_rate": 3.7635399023863022e-06, + "loss": 0.2861, + "step": 20416 + }, + { + "epoch": 0.59, + "grad_norm": 1.303317070047731, + "learning_rate": 3.7630847888427646e-06, + "loss": 0.3092, + "step": 20417 + }, + { + "epoch": 0.59, + "grad_norm": 1.3365937347465375, + "learning_rate": 3.7626296862150186e-06, + "loss": 0.2662, + "step": 20418 + }, + { + "epoch": 0.59, + "grad_norm": 1.3578617834676407, + "learning_rate": 3.7621745945070777e-06, + "loss": 0.2885, + "step": 20419 + }, + { + "epoch": 0.59, + "grad_norm": 1.2355215775628703, + "learning_rate": 3.7617195137229603e-06, + "loss": 0.3008, + "step": 20420 + }, + { + "epoch": 0.59, + "grad_norm": 1.320005760411763, + "learning_rate": 3.7612644438666818e-06, + "loss": 0.2984, + "step": 20421 + }, + { + "epoch": 0.59, + "grad_norm": 1.5250335234162409, + "learning_rate": 3.7608093849422583e-06, + "loss": 0.2919, + "step": 20422 + }, + { + "epoch": 0.59, + "grad_norm": 1.337424913319179, + "learning_rate": 3.7603543369537066e-06, + "loss": 0.2871, + "step": 20423 + }, + { + "epoch": 0.59, + "grad_norm": 1.2393524308564554, + "learning_rate": 3.759899299905041e-06, + "loss": 0.3016, + "step": 20424 + }, + { + "epoch": 0.59, + "grad_norm": 1.3872072994144493, + "learning_rate": 3.7594442738002775e-06, + "loss": 0.2903, + "step": 20425 + }, + { + "epoch": 0.59, + "grad_norm": 1.2873540850024365, + "learning_rate": 3.7589892586434318e-06, + "loss": 0.2831, + "step": 20426 + }, + { + "epoch": 0.59, + "grad_norm": 1.2806291445972282, + "learning_rate": 3.7585342544385213e-06, + "loss": 0.2975, + "step": 20427 + }, + { + "epoch": 0.59, + "grad_norm": 1.2041900379627977, + "learning_rate": 3.7580792611895585e-06, + "loss": 0.2695, + "step": 20428 + }, + { + "epoch": 0.59, + "grad_norm": 1.319298044093831, + "learning_rate": 3.7576242789005595e-06, + "loss": 0.3275, + "step": 20429 + }, + { + "epoch": 0.59, + "grad_norm": 1.329046144596645, + "learning_rate": 3.7571693075755406e-06, + "loss": 0.3015, + "step": 20430 + }, + { + "epoch": 0.59, + "grad_norm": 1.784547657592079, + "learning_rate": 3.7567143472185176e-06, + "loss": 0.2739, + "step": 20431 + }, + { + "epoch": 0.59, + "grad_norm": 1.2562387551369258, + "learning_rate": 3.756259397833503e-06, + "loss": 0.2991, + "step": 20432 + }, + { + "epoch": 0.59, + "grad_norm": 1.3912423736114035, + "learning_rate": 3.7558044594245125e-06, + "loss": 0.2944, + "step": 20433 + }, + { + "epoch": 0.59, + "grad_norm": 1.451302334997049, + "learning_rate": 3.755349531995562e-06, + "loss": 0.296, + "step": 20434 + }, + { + "epoch": 0.59, + "grad_norm": 1.6631794707326972, + "learning_rate": 3.754894615550666e-06, + "loss": 0.2804, + "step": 20435 + }, + { + "epoch": 0.59, + "grad_norm": 1.283911747492745, + "learning_rate": 3.7544397100938384e-06, + "loss": 0.2904, + "step": 20436 + }, + { + "epoch": 0.59, + "grad_norm": 1.232577539741754, + "learning_rate": 3.7539848156290944e-06, + "loss": 0.2958, + "step": 20437 + }, + { + "epoch": 0.59, + "grad_norm": 1.1974944844595137, + "learning_rate": 3.753529932160448e-06, + "loss": 0.2795, + "step": 20438 + }, + { + "epoch": 0.59, + "grad_norm": 1.4693391375673697, + "learning_rate": 3.7530750596919136e-06, + "loss": 0.3117, + "step": 20439 + }, + { + "epoch": 0.59, + "grad_norm": 1.2353048762838004, + "learning_rate": 3.752620198227508e-06, + "loss": 0.2858, + "step": 20440 + }, + { + "epoch": 0.59, + "grad_norm": 1.1669395622896575, + "learning_rate": 3.7521653477712417e-06, + "loss": 0.2886, + "step": 20441 + }, + { + "epoch": 0.59, + "grad_norm": 1.3828776843305295, + "learning_rate": 3.751710508327129e-06, + "loss": 0.2923, + "step": 20442 + }, + { + "epoch": 0.59, + "grad_norm": 1.2959954852628879, + "learning_rate": 3.751255679899185e-06, + "loss": 0.2912, + "step": 20443 + }, + { + "epoch": 0.59, + "grad_norm": 1.370002644276581, + "learning_rate": 3.7508008624914236e-06, + "loss": 0.2973, + "step": 20444 + }, + { + "epoch": 0.59, + "grad_norm": 1.2704767547767843, + "learning_rate": 3.7503460561078587e-06, + "loss": 0.2723, + "step": 20445 + }, + { + "epoch": 0.59, + "grad_norm": 1.302329206611128, + "learning_rate": 3.749891260752504e-06, + "loss": 0.2752, + "step": 20446 + }, + { + "epoch": 0.59, + "grad_norm": 1.4467499479030579, + "learning_rate": 3.7494364764293722e-06, + "loss": 0.3157, + "step": 20447 + }, + { + "epoch": 0.59, + "grad_norm": 1.2195691607875998, + "learning_rate": 3.7489817031424767e-06, + "loss": 0.2887, + "step": 20448 + }, + { + "epoch": 0.59, + "grad_norm": 1.597980744951308, + "learning_rate": 3.7485269408958323e-06, + "loss": 0.2785, + "step": 20449 + }, + { + "epoch": 0.59, + "grad_norm": 1.3800208202482862, + "learning_rate": 3.748072189693453e-06, + "loss": 0.315, + "step": 20450 + }, + { + "epoch": 0.59, + "grad_norm": 1.315688302219665, + "learning_rate": 3.747617449539349e-06, + "loss": 0.2822, + "step": 20451 + }, + { + "epoch": 0.59, + "grad_norm": 1.3362325666281387, + "learning_rate": 3.747162720437535e-06, + "loss": 0.2997, + "step": 20452 + }, + { + "epoch": 0.59, + "grad_norm": 1.8030649586168173, + "learning_rate": 3.7467080023920234e-06, + "loss": 0.2764, + "step": 20453 + }, + { + "epoch": 0.59, + "grad_norm": 1.248669862888152, + "learning_rate": 3.7462532954068288e-06, + "loss": 0.292, + "step": 20454 + }, + { + "epoch": 0.59, + "grad_norm": 1.6026438705212085, + "learning_rate": 3.7457985994859617e-06, + "loss": 0.3031, + "step": 20455 + }, + { + "epoch": 0.59, + "grad_norm": 1.3928211232927135, + "learning_rate": 3.745343914633436e-06, + "loss": 0.2967, + "step": 20456 + }, + { + "epoch": 0.59, + "grad_norm": 1.4425142658787502, + "learning_rate": 3.7448892408532644e-06, + "loss": 0.2766, + "step": 20457 + }, + { + "epoch": 0.59, + "grad_norm": 0.9824644210480722, + "learning_rate": 3.744434578149459e-06, + "loss": 0.6713, + "step": 20458 + }, + { + "epoch": 0.59, + "grad_norm": 1.508461202078661, + "learning_rate": 3.7439799265260336e-06, + "loss": 0.3325, + "step": 20459 + }, + { + "epoch": 0.59, + "grad_norm": 1.2166375156542661, + "learning_rate": 3.7435252859869975e-06, + "loss": 0.2884, + "step": 20460 + }, + { + "epoch": 0.59, + "grad_norm": 1.2987216544683646, + "learning_rate": 3.743070656536365e-06, + "loss": 0.2869, + "step": 20461 + }, + { + "epoch": 0.59, + "grad_norm": 1.5532432127153353, + "learning_rate": 3.7426160381781483e-06, + "loss": 0.3013, + "step": 20462 + }, + { + "epoch": 0.59, + "grad_norm": 1.5788088362654, + "learning_rate": 3.74216143091636e-06, + "loss": 0.3133, + "step": 20463 + }, + { + "epoch": 0.59, + "grad_norm": 1.2885148873617662, + "learning_rate": 3.74170683475501e-06, + "loss": 0.2942, + "step": 20464 + }, + { + "epoch": 0.59, + "grad_norm": 1.2737646272648069, + "learning_rate": 3.7412522496981106e-06, + "loss": 0.2891, + "step": 20465 + }, + { + "epoch": 0.59, + "grad_norm": 1.3255580167334826, + "learning_rate": 3.7407976757496745e-06, + "loss": 0.2724, + "step": 20466 + }, + { + "epoch": 0.59, + "grad_norm": 1.2536077591860229, + "learning_rate": 3.740343112913714e-06, + "loss": 0.299, + "step": 20467 + }, + { + "epoch": 0.59, + "grad_norm": 1.2548899013086798, + "learning_rate": 3.7398885611942394e-06, + "loss": 0.3004, + "step": 20468 + }, + { + "epoch": 0.59, + "grad_norm": 1.296358435363787, + "learning_rate": 3.739434020595263e-06, + "loss": 0.2905, + "step": 20469 + }, + { + "epoch": 0.59, + "grad_norm": 1.3660990813785991, + "learning_rate": 3.738979491120793e-06, + "loss": 0.2863, + "step": 20470 + }, + { + "epoch": 0.59, + "grad_norm": 1.282887512726405, + "learning_rate": 3.7385249727748435e-06, + "loss": 0.2953, + "step": 20471 + }, + { + "epoch": 0.59, + "grad_norm": 1.3451132002404194, + "learning_rate": 3.7380704655614255e-06, + "loss": 0.2836, + "step": 20472 + }, + { + "epoch": 0.59, + "grad_norm": 1.3165091396898534, + "learning_rate": 3.7376159694845497e-06, + "loss": 0.3056, + "step": 20473 + }, + { + "epoch": 0.59, + "grad_norm": 1.364640549219642, + "learning_rate": 3.737161484548227e-06, + "loss": 0.2896, + "step": 20474 + }, + { + "epoch": 0.59, + "grad_norm": 1.3981860729057456, + "learning_rate": 3.7367070107564673e-06, + "loss": 0.3282, + "step": 20475 + }, + { + "epoch": 0.59, + "grad_norm": 1.2388422622474593, + "learning_rate": 3.7362525481132826e-06, + "loss": 0.281, + "step": 20476 + }, + { + "epoch": 0.59, + "grad_norm": 1.3346035533685776, + "learning_rate": 3.7357980966226846e-06, + "loss": 0.2971, + "step": 20477 + }, + { + "epoch": 0.59, + "grad_norm": 1.4623576966494798, + "learning_rate": 3.7353436562886816e-06, + "loss": 0.2916, + "step": 20478 + }, + { + "epoch": 0.59, + "grad_norm": 1.2242186771890853, + "learning_rate": 3.7348892271152837e-06, + "loss": 0.288, + "step": 20479 + }, + { + "epoch": 0.59, + "grad_norm": 1.456548632866406, + "learning_rate": 3.734434809106503e-06, + "loss": 0.2903, + "step": 20480 + }, + { + "epoch": 0.59, + "grad_norm": 1.3501234023469755, + "learning_rate": 3.7339804022663495e-06, + "loss": 0.3158, + "step": 20481 + }, + { + "epoch": 0.59, + "grad_norm": 1.3422460062779722, + "learning_rate": 3.7335260065988333e-06, + "loss": 0.2957, + "step": 20482 + }, + { + "epoch": 0.59, + "grad_norm": 1.6549227922702177, + "learning_rate": 3.7330716221079637e-06, + "loss": 0.2924, + "step": 20483 + }, + { + "epoch": 0.59, + "grad_norm": 1.2907049477917327, + "learning_rate": 3.732617248797751e-06, + "loss": 0.3044, + "step": 20484 + }, + { + "epoch": 0.59, + "grad_norm": 1.3225133821466177, + "learning_rate": 3.7321628866722048e-06, + "loss": 0.3018, + "step": 20485 + }, + { + "epoch": 0.59, + "grad_norm": 1.2244238373496728, + "learning_rate": 3.7317085357353368e-06, + "loss": 0.298, + "step": 20486 + }, + { + "epoch": 0.59, + "grad_norm": 1.4205212063889556, + "learning_rate": 3.7312541959911536e-06, + "loss": 0.2929, + "step": 20487 + }, + { + "epoch": 0.59, + "grad_norm": 1.2338621549568258, + "learning_rate": 3.730799867443666e-06, + "loss": 0.2883, + "step": 20488 + }, + { + "epoch": 0.59, + "grad_norm": 1.2190522995100008, + "learning_rate": 3.7303455500968844e-06, + "loss": 0.288, + "step": 20489 + }, + { + "epoch": 0.59, + "grad_norm": 1.2670561909560847, + "learning_rate": 3.729891243954818e-06, + "loss": 0.2946, + "step": 20490 + }, + { + "epoch": 0.59, + "grad_norm": 1.2483645956459497, + "learning_rate": 3.7294369490214743e-06, + "loss": 0.3092, + "step": 20491 + }, + { + "epoch": 0.59, + "grad_norm": 0.9312382201277093, + "learning_rate": 3.728982665300864e-06, + "loss": 0.6061, + "step": 20492 + }, + { + "epoch": 0.59, + "grad_norm": 1.7520428543068522, + "learning_rate": 3.728528392796995e-06, + "loss": 0.2814, + "step": 20493 + }, + { + "epoch": 0.59, + "grad_norm": 1.407746997814198, + "learning_rate": 3.728074131513878e-06, + "loss": 0.3187, + "step": 20494 + }, + { + "epoch": 0.59, + "grad_norm": 1.3037183997165016, + "learning_rate": 3.727619881455521e-06, + "loss": 0.3033, + "step": 20495 + }, + { + "epoch": 0.59, + "grad_norm": 1.3959712942857814, + "learning_rate": 3.7271656426259346e-06, + "loss": 0.3007, + "step": 20496 + }, + { + "epoch": 0.59, + "grad_norm": 1.2768558184626242, + "learning_rate": 3.726711415029123e-06, + "loss": 0.2943, + "step": 20497 + }, + { + "epoch": 0.59, + "grad_norm": 1.227182509193489, + "learning_rate": 3.726257198669097e-06, + "loss": 0.3091, + "step": 20498 + }, + { + "epoch": 0.59, + "grad_norm": 1.3039459773250857, + "learning_rate": 3.7258029935498655e-06, + "loss": 0.3047, + "step": 20499 + }, + { + "epoch": 0.59, + "grad_norm": 1.7969231422700207, + "learning_rate": 3.7253487996754374e-06, + "loss": 0.2931, + "step": 20500 + }, + { + "epoch": 0.59, + "grad_norm": 1.3350137074504314, + "learning_rate": 3.7248946170498207e-06, + "loss": 0.3115, + "step": 20501 + }, + { + "epoch": 0.59, + "grad_norm": 1.30733367950907, + "learning_rate": 3.7244404456770224e-06, + "loss": 0.3036, + "step": 20502 + }, + { + "epoch": 0.59, + "grad_norm": 1.4471173152396266, + "learning_rate": 3.723986285561051e-06, + "loss": 0.2961, + "step": 20503 + }, + { + "epoch": 0.59, + "grad_norm": 1.6059130875779282, + "learning_rate": 3.723532136705915e-06, + "loss": 0.3178, + "step": 20504 + }, + { + "epoch": 0.59, + "grad_norm": 1.4258226955247406, + "learning_rate": 3.723077999115623e-06, + "loss": 0.2985, + "step": 20505 + }, + { + "epoch": 0.59, + "grad_norm": 1.688335864898217, + "learning_rate": 3.7226238727941806e-06, + "loss": 0.3139, + "step": 20506 + }, + { + "epoch": 0.59, + "grad_norm": 1.293170351816274, + "learning_rate": 3.722169757745597e-06, + "loss": 0.3067, + "step": 20507 + }, + { + "epoch": 0.59, + "grad_norm": 1.230299726027857, + "learning_rate": 3.7217156539738793e-06, + "loss": 0.3109, + "step": 20508 + }, + { + "epoch": 0.59, + "grad_norm": 1.2303905692670665, + "learning_rate": 3.7212615614830363e-06, + "loss": 0.307, + "step": 20509 + }, + { + "epoch": 0.59, + "grad_norm": 1.3115723792512384, + "learning_rate": 3.7208074802770734e-06, + "loss": 0.315, + "step": 20510 + }, + { + "epoch": 0.59, + "grad_norm": 2.5101861110165338, + "learning_rate": 3.720353410359998e-06, + "loss": 0.3124, + "step": 20511 + }, + { + "epoch": 0.59, + "grad_norm": 1.2802149335173956, + "learning_rate": 3.7198993517358184e-06, + "loss": 0.2854, + "step": 20512 + }, + { + "epoch": 0.59, + "grad_norm": 1.2178691266805797, + "learning_rate": 3.719445304408542e-06, + "loss": 0.2981, + "step": 20513 + }, + { + "epoch": 0.6, + "grad_norm": 1.2540393141078636, + "learning_rate": 3.718991268382174e-06, + "loss": 0.3109, + "step": 20514 + }, + { + "epoch": 0.6, + "grad_norm": 1.2963693039520021, + "learning_rate": 3.7185372436607227e-06, + "loss": 0.2953, + "step": 20515 + }, + { + "epoch": 0.6, + "grad_norm": 1.2531573679604389, + "learning_rate": 3.7180832302481938e-06, + "loss": 0.3025, + "step": 20516 + }, + { + "epoch": 0.6, + "grad_norm": 1.3388180470986704, + "learning_rate": 3.717629228148595e-06, + "loss": 0.3081, + "step": 20517 + }, + { + "epoch": 0.6, + "grad_norm": 1.2642485783810251, + "learning_rate": 3.7171752373659335e-06, + "loss": 0.2927, + "step": 20518 + }, + { + "epoch": 0.6, + "grad_norm": 1.2863183213945288, + "learning_rate": 3.716721257904214e-06, + "loss": 0.2962, + "step": 20519 + }, + { + "epoch": 0.6, + "grad_norm": 1.3067771541189996, + "learning_rate": 3.7162672897674433e-06, + "loss": 0.2915, + "step": 20520 + }, + { + "epoch": 0.6, + "grad_norm": 1.4355181440046265, + "learning_rate": 3.715813332959628e-06, + "loss": 0.2913, + "step": 20521 + }, + { + "epoch": 0.6, + "grad_norm": 1.2715035543564, + "learning_rate": 3.715359387484775e-06, + "loss": 0.2909, + "step": 20522 + }, + { + "epoch": 0.6, + "grad_norm": 1.346883137683357, + "learning_rate": 3.714905453346889e-06, + "loss": 0.3055, + "step": 20523 + }, + { + "epoch": 0.6, + "grad_norm": 1.3054839612172742, + "learning_rate": 3.7144515305499767e-06, + "loss": 0.2975, + "step": 20524 + }, + { + "epoch": 0.6, + "grad_norm": 1.5403808928344962, + "learning_rate": 3.7139976190980465e-06, + "loss": 0.292, + "step": 20525 + }, + { + "epoch": 0.6, + "grad_norm": 1.3500117725669598, + "learning_rate": 3.7135437189950986e-06, + "loss": 0.277, + "step": 20526 + }, + { + "epoch": 0.6, + "grad_norm": 1.3763879444481955, + "learning_rate": 3.7130898302451417e-06, + "loss": 0.3081, + "step": 20527 + }, + { + "epoch": 0.6, + "grad_norm": 1.3048441718073134, + "learning_rate": 3.7126359528521827e-06, + "loss": 0.2857, + "step": 20528 + }, + { + "epoch": 0.6, + "grad_norm": 1.583357940268377, + "learning_rate": 3.7121820868202246e-06, + "loss": 0.3157, + "step": 20529 + }, + { + "epoch": 0.6, + "grad_norm": 2.0771715682895135, + "learning_rate": 3.711728232153274e-06, + "loss": 0.2874, + "step": 20530 + }, + { + "epoch": 0.6, + "grad_norm": 1.7412230705105425, + "learning_rate": 3.7112743888553358e-06, + "loss": 0.2806, + "step": 20531 + }, + { + "epoch": 0.6, + "grad_norm": 1.7248483112185857, + "learning_rate": 3.7108205569304167e-06, + "loss": 0.2791, + "step": 20532 + }, + { + "epoch": 0.6, + "grad_norm": 1.5041470019134344, + "learning_rate": 3.710366736382519e-06, + "loss": 0.2903, + "step": 20533 + }, + { + "epoch": 0.6, + "grad_norm": 1.6622298048002886, + "learning_rate": 3.7099129272156498e-06, + "loss": 0.3041, + "step": 20534 + }, + { + "epoch": 0.6, + "grad_norm": 1.6262946080168892, + "learning_rate": 3.7094591294338127e-06, + "loss": 0.2867, + "step": 20535 + }, + { + "epoch": 0.6, + "grad_norm": 0.9964505604193696, + "learning_rate": 3.7090053430410133e-06, + "loss": 0.6358, + "step": 20536 + }, + { + "epoch": 0.6, + "grad_norm": 1.4929787862661459, + "learning_rate": 3.7085515680412577e-06, + "loss": 0.2929, + "step": 20537 + }, + { + "epoch": 0.6, + "grad_norm": 1.8185303192086415, + "learning_rate": 3.708097804438547e-06, + "loss": 0.289, + "step": 20538 + }, + { + "epoch": 0.6, + "grad_norm": 1.4081752397736826, + "learning_rate": 3.707644052236887e-06, + "loss": 0.295, + "step": 20539 + }, + { + "epoch": 0.6, + "grad_norm": 1.2820197873519628, + "learning_rate": 3.707190311440284e-06, + "loss": 0.3181, + "step": 20540 + }, + { + "epoch": 0.6, + "grad_norm": 0.9485254915104673, + "learning_rate": 3.7067365820527413e-06, + "loss": 0.6395, + "step": 20541 + }, + { + "epoch": 0.6, + "grad_norm": 1.4385087230365827, + "learning_rate": 3.7062828640782613e-06, + "loss": 0.2827, + "step": 20542 + }, + { + "epoch": 0.6, + "grad_norm": 1.3092360668517053, + "learning_rate": 3.7058291575208494e-06, + "loss": 0.3153, + "step": 20543 + }, + { + "epoch": 0.6, + "grad_norm": 1.3880232448493512, + "learning_rate": 3.70537546238451e-06, + "loss": 0.2857, + "step": 20544 + }, + { + "epoch": 0.6, + "grad_norm": 1.517657345474924, + "learning_rate": 3.704921778673247e-06, + "loss": 0.3062, + "step": 20545 + }, + { + "epoch": 0.6, + "grad_norm": 1.4142869496047223, + "learning_rate": 3.7044681063910632e-06, + "loss": 0.2887, + "step": 20546 + }, + { + "epoch": 0.6, + "grad_norm": 1.300466085779789, + "learning_rate": 3.7040144455419624e-06, + "loss": 0.3022, + "step": 20547 + }, + { + "epoch": 0.6, + "grad_norm": 1.2482715467288936, + "learning_rate": 3.7035607961299485e-06, + "loss": 0.2901, + "step": 20548 + }, + { + "epoch": 0.6, + "grad_norm": 1.418385246524061, + "learning_rate": 3.7031071581590263e-06, + "loss": 0.2913, + "step": 20549 + }, + { + "epoch": 0.6, + "grad_norm": 1.3991309740920248, + "learning_rate": 3.7026535316331963e-06, + "loss": 0.2914, + "step": 20550 + }, + { + "epoch": 0.6, + "grad_norm": 1.7048128582642397, + "learning_rate": 3.702199916556464e-06, + "loss": 0.303, + "step": 20551 + }, + { + "epoch": 0.6, + "grad_norm": 1.3672517517604386, + "learning_rate": 3.7017463129328314e-06, + "loss": 0.2865, + "step": 20552 + }, + { + "epoch": 0.6, + "grad_norm": 1.3508557795581828, + "learning_rate": 3.7012927207663034e-06, + "loss": 0.3069, + "step": 20553 + }, + { + "epoch": 0.6, + "grad_norm": 1.6261472243202348, + "learning_rate": 3.70083914006088e-06, + "loss": 0.296, + "step": 20554 + }, + { + "epoch": 0.6, + "grad_norm": 1.3064731985665436, + "learning_rate": 3.700385570820566e-06, + "loss": 0.2868, + "step": 20555 + }, + { + "epoch": 0.6, + "grad_norm": 1.2880895334545388, + "learning_rate": 3.699932013049364e-06, + "loss": 0.299, + "step": 20556 + }, + { + "epoch": 0.6, + "grad_norm": 1.2794218799644443, + "learning_rate": 3.6994784667512762e-06, + "loss": 0.2971, + "step": 20557 + }, + { + "epoch": 0.6, + "grad_norm": 1.3805039023298455, + "learning_rate": 3.699024931930305e-06, + "loss": 0.3133, + "step": 20558 + }, + { + "epoch": 0.6, + "grad_norm": 1.5602576301688469, + "learning_rate": 3.698571408590453e-06, + "loss": 0.2811, + "step": 20559 + }, + { + "epoch": 0.6, + "grad_norm": 1.2103874497862004, + "learning_rate": 3.698117896735724e-06, + "loss": 0.2988, + "step": 20560 + }, + { + "epoch": 0.6, + "grad_norm": 1.3049067138805757, + "learning_rate": 3.697664396370118e-06, + "loss": 0.2953, + "step": 20561 + }, + { + "epoch": 0.6, + "grad_norm": 1.461857381555799, + "learning_rate": 3.697210907497638e-06, + "loss": 0.2936, + "step": 20562 + }, + { + "epoch": 0.6, + "grad_norm": 1.4438108592824717, + "learning_rate": 3.6967574301222864e-06, + "loss": 0.3085, + "step": 20563 + }, + { + "epoch": 0.6, + "grad_norm": 1.6791271184409677, + "learning_rate": 3.696303964248066e-06, + "loss": 0.2934, + "step": 20564 + }, + { + "epoch": 0.6, + "grad_norm": 1.6292243024987683, + "learning_rate": 3.6958505098789767e-06, + "loss": 0.2831, + "step": 20565 + }, + { + "epoch": 0.6, + "grad_norm": 1.4194524113133573, + "learning_rate": 3.6953970670190204e-06, + "loss": 0.3021, + "step": 20566 + }, + { + "epoch": 0.6, + "grad_norm": 1.274804753139616, + "learning_rate": 3.6949436356722002e-06, + "loss": 0.3169, + "step": 20567 + }, + { + "epoch": 0.6, + "grad_norm": 1.2621781845280273, + "learning_rate": 3.694490215842517e-06, + "loss": 0.2866, + "step": 20568 + }, + { + "epoch": 0.6, + "grad_norm": 1.2682002149170974, + "learning_rate": 3.6940368075339716e-06, + "loss": 0.2806, + "step": 20569 + }, + { + "epoch": 0.6, + "grad_norm": 1.2386598548148016, + "learning_rate": 3.6935834107505663e-06, + "loss": 0.291, + "step": 20570 + }, + { + "epoch": 0.6, + "grad_norm": 1.3951808060410475, + "learning_rate": 3.6931300254963014e-06, + "loss": 0.3161, + "step": 20571 + }, + { + "epoch": 0.6, + "grad_norm": 1.2640762571014676, + "learning_rate": 3.692676651775179e-06, + "loss": 0.3167, + "step": 20572 + }, + { + "epoch": 0.6, + "grad_norm": 1.237280618658344, + "learning_rate": 3.6922232895912003e-06, + "loss": 0.288, + "step": 20573 + }, + { + "epoch": 0.6, + "grad_norm": 1.3359139526122452, + "learning_rate": 3.6917699389483643e-06, + "loss": 0.3095, + "step": 20574 + }, + { + "epoch": 0.6, + "grad_norm": 1.7077957775689692, + "learning_rate": 3.691316599850674e-06, + "loss": 0.2989, + "step": 20575 + }, + { + "epoch": 0.6, + "grad_norm": 2.2469341928276774, + "learning_rate": 3.690863272302128e-06, + "loss": 0.2849, + "step": 20576 + }, + { + "epoch": 0.6, + "grad_norm": 1.477700217210414, + "learning_rate": 3.6904099563067297e-06, + "loss": 0.2882, + "step": 20577 + }, + { + "epoch": 0.6, + "grad_norm": 1.471530002550376, + "learning_rate": 3.6899566518684775e-06, + "loss": 0.2984, + "step": 20578 + }, + { + "epoch": 0.6, + "grad_norm": 1.232345029146178, + "learning_rate": 3.689503358991372e-06, + "loss": 0.28, + "step": 20579 + }, + { + "epoch": 0.6, + "grad_norm": 2.0004792951257127, + "learning_rate": 3.6890500776794136e-06, + "loss": 0.2957, + "step": 20580 + }, + { + "epoch": 0.6, + "grad_norm": 1.364045674362361, + "learning_rate": 3.688596807936605e-06, + "loss": 0.2977, + "step": 20581 + }, + { + "epoch": 0.6, + "grad_norm": 1.6909270469094841, + "learning_rate": 3.688143549766943e-06, + "loss": 0.2798, + "step": 20582 + }, + { + "epoch": 0.6, + "grad_norm": 1.503462291383209, + "learning_rate": 3.6876903031744286e-06, + "loss": 0.3034, + "step": 20583 + }, + { + "epoch": 0.6, + "grad_norm": 1.4695628259874414, + "learning_rate": 3.6872370681630613e-06, + "loss": 0.2998, + "step": 20584 + }, + { + "epoch": 0.6, + "grad_norm": 1.3148183017720656, + "learning_rate": 3.6867838447368414e-06, + "loss": 0.3021, + "step": 20585 + }, + { + "epoch": 0.6, + "grad_norm": 1.666490659929298, + "learning_rate": 3.6863306328997683e-06, + "loss": 0.2983, + "step": 20586 + }, + { + "epoch": 0.6, + "grad_norm": 2.0405062598480685, + "learning_rate": 3.6858774326558432e-06, + "loss": 0.2867, + "step": 20587 + }, + { + "epoch": 0.6, + "grad_norm": 1.3103214561574548, + "learning_rate": 3.685424244009064e-06, + "loss": 0.2926, + "step": 20588 + }, + { + "epoch": 0.6, + "grad_norm": 1.3632981952276624, + "learning_rate": 3.68497106696343e-06, + "loss": 0.3284, + "step": 20589 + }, + { + "epoch": 0.6, + "grad_norm": 1.4046549617460873, + "learning_rate": 3.6845179015229404e-06, + "loss": 0.2829, + "step": 20590 + }, + { + "epoch": 0.6, + "grad_norm": 1.2300783328077942, + "learning_rate": 3.684064747691595e-06, + "loss": 0.2858, + "step": 20591 + }, + { + "epoch": 0.6, + "grad_norm": 1.373230875774539, + "learning_rate": 3.683611605473394e-06, + "loss": 0.3004, + "step": 20592 + }, + { + "epoch": 0.6, + "grad_norm": 1.2643956929488305, + "learning_rate": 3.6831584748723343e-06, + "loss": 0.2903, + "step": 20593 + }, + { + "epoch": 0.6, + "grad_norm": 0.9924579468353286, + "learning_rate": 3.6827053558924152e-06, + "loss": 0.5985, + "step": 20594 + }, + { + "epoch": 0.6, + "grad_norm": 1.140867168139003, + "learning_rate": 3.6822522485376367e-06, + "loss": 0.3072, + "step": 20595 + }, + { + "epoch": 0.6, + "grad_norm": 1.3901477782179994, + "learning_rate": 3.6817991528119975e-06, + "loss": 0.2878, + "step": 20596 + }, + { + "epoch": 0.6, + "grad_norm": 1.3539685528444725, + "learning_rate": 3.681346068719494e-06, + "loss": 0.2915, + "step": 20597 + }, + { + "epoch": 0.6, + "grad_norm": 1.4227995645419027, + "learning_rate": 3.680892996264126e-06, + "loss": 0.2998, + "step": 20598 + }, + { + "epoch": 0.6, + "grad_norm": 1.2811654541614301, + "learning_rate": 3.680439935449892e-06, + "loss": 0.3092, + "step": 20599 + }, + { + "epoch": 0.6, + "grad_norm": 1.251276092221638, + "learning_rate": 3.679986886280792e-06, + "loss": 0.2945, + "step": 20600 + }, + { + "epoch": 0.6, + "grad_norm": 1.3171765158225757, + "learning_rate": 3.679533848760821e-06, + "loss": 0.2858, + "step": 20601 + }, + { + "epoch": 0.6, + "grad_norm": 1.3514945058534362, + "learning_rate": 3.679080822893978e-06, + "loss": 0.3003, + "step": 20602 + }, + { + "epoch": 0.6, + "grad_norm": 1.7837441518833472, + "learning_rate": 3.678627808684262e-06, + "loss": 0.3275, + "step": 20603 + }, + { + "epoch": 0.6, + "grad_norm": 1.2665780228000085, + "learning_rate": 3.67817480613567e-06, + "loss": 0.2965, + "step": 20604 + }, + { + "epoch": 0.6, + "grad_norm": 1.3595355399787117, + "learning_rate": 3.677721815252201e-06, + "loss": 0.286, + "step": 20605 + }, + { + "epoch": 0.6, + "grad_norm": 1.2840945459172017, + "learning_rate": 3.67726883603785e-06, + "loss": 0.2861, + "step": 20606 + }, + { + "epoch": 0.6, + "grad_norm": 1.53215356035218, + "learning_rate": 3.676815868496617e-06, + "loss": 0.2958, + "step": 20607 + }, + { + "epoch": 0.6, + "grad_norm": 1.5957707293985575, + "learning_rate": 3.6763629126324983e-06, + "loss": 0.2914, + "step": 20608 + }, + { + "epoch": 0.6, + "grad_norm": 1.498850281311405, + "learning_rate": 3.6759099684494937e-06, + "loss": 0.2781, + "step": 20609 + }, + { + "epoch": 0.6, + "grad_norm": 1.3876232479026713, + "learning_rate": 3.6754570359515974e-06, + "loss": 0.2737, + "step": 20610 + }, + { + "epoch": 0.6, + "grad_norm": 1.3774696856787265, + "learning_rate": 3.675004115142806e-06, + "loss": 0.3378, + "step": 20611 + }, + { + "epoch": 0.6, + "grad_norm": 1.2929257180669402, + "learning_rate": 3.6745512060271184e-06, + "loss": 0.2848, + "step": 20612 + }, + { + "epoch": 0.6, + "grad_norm": 1.686982925889722, + "learning_rate": 3.6740983086085312e-06, + "loss": 0.3271, + "step": 20613 + }, + { + "epoch": 0.6, + "grad_norm": 5.0971951381769305, + "learning_rate": 3.673645422891041e-06, + "loss": 0.2803, + "step": 20614 + }, + { + "epoch": 0.6, + "grad_norm": 1.350095370584115, + "learning_rate": 3.673192548878646e-06, + "loss": 0.3231, + "step": 20615 + }, + { + "epoch": 0.6, + "grad_norm": 1.2393697234490664, + "learning_rate": 3.6727396865753397e-06, + "loss": 0.2825, + "step": 20616 + }, + { + "epoch": 0.6, + "grad_norm": 1.8936643201308512, + "learning_rate": 3.6722868359851207e-06, + "loss": 0.265, + "step": 20617 + }, + { + "epoch": 0.6, + "grad_norm": 1.2108810572205733, + "learning_rate": 3.6718339971119855e-06, + "loss": 0.2822, + "step": 20618 + }, + { + "epoch": 0.6, + "grad_norm": 1.3064440778877877, + "learning_rate": 3.6713811699599305e-06, + "loss": 0.3139, + "step": 20619 + }, + { + "epoch": 0.6, + "grad_norm": 1.3405420742233098, + "learning_rate": 3.6709283545329506e-06, + "loss": 0.2742, + "step": 20620 + }, + { + "epoch": 0.6, + "grad_norm": 1.4259726294422013, + "learning_rate": 3.6704755508350426e-06, + "loss": 0.3128, + "step": 20621 + }, + { + "epoch": 0.6, + "grad_norm": 1.3158270605914213, + "learning_rate": 3.6700227588702033e-06, + "loss": 0.3237, + "step": 20622 + }, + { + "epoch": 0.6, + "grad_norm": 1.2744477612784113, + "learning_rate": 3.6695699786424276e-06, + "loss": 0.3016, + "step": 20623 + }, + { + "epoch": 0.6, + "grad_norm": 1.2360079256483185, + "learning_rate": 3.6691172101557115e-06, + "loss": 0.2907, + "step": 20624 + }, + { + "epoch": 0.6, + "grad_norm": 1.300908707236652, + "learning_rate": 3.6686644534140507e-06, + "loss": 0.3257, + "step": 20625 + }, + { + "epoch": 0.6, + "grad_norm": 1.378256312119553, + "learning_rate": 3.668211708421441e-06, + "loss": 0.3196, + "step": 20626 + }, + { + "epoch": 0.6, + "grad_norm": 1.21367855888247, + "learning_rate": 3.6677589751818777e-06, + "loss": 0.2984, + "step": 20627 + }, + { + "epoch": 0.6, + "grad_norm": 1.4393225487631296, + "learning_rate": 3.6673062536993563e-06, + "loss": 0.3015, + "step": 20628 + }, + { + "epoch": 0.6, + "grad_norm": 1.1979246405880386, + "learning_rate": 3.6668535439778717e-06, + "loss": 0.2725, + "step": 20629 + }, + { + "epoch": 0.6, + "grad_norm": 1.3256350158789343, + "learning_rate": 3.6664008460214194e-06, + "loss": 0.3, + "step": 20630 + }, + { + "epoch": 0.6, + "grad_norm": 0.9548316217231066, + "learning_rate": 3.6659481598339952e-06, + "loss": 0.5874, + "step": 20631 + }, + { + "epoch": 0.6, + "grad_norm": 1.5095692678379462, + "learning_rate": 3.6654954854195934e-06, + "loss": 0.2864, + "step": 20632 + }, + { + "epoch": 0.6, + "grad_norm": 1.2442917714003212, + "learning_rate": 3.6650428227822076e-06, + "loss": 0.2756, + "step": 20633 + }, + { + "epoch": 0.6, + "grad_norm": 1.285542501416898, + "learning_rate": 3.664590171925834e-06, + "loss": 0.2975, + "step": 20634 + }, + { + "epoch": 0.6, + "grad_norm": 1.2245718975004298, + "learning_rate": 3.664137532854467e-06, + "loss": 0.2913, + "step": 20635 + }, + { + "epoch": 0.6, + "grad_norm": 1.2928124939385026, + "learning_rate": 3.6636849055721025e-06, + "loss": 0.3042, + "step": 20636 + }, + { + "epoch": 0.6, + "grad_norm": 0.9166245806417412, + "learning_rate": 3.6632322900827326e-06, + "loss": 0.5818, + "step": 20637 + }, + { + "epoch": 0.6, + "grad_norm": 1.3043262830392601, + "learning_rate": 3.662779686390353e-06, + "loss": 0.2943, + "step": 20638 + }, + { + "epoch": 0.6, + "grad_norm": 1.2069829719504004, + "learning_rate": 3.662327094498957e-06, + "loss": 0.2935, + "step": 20639 + }, + { + "epoch": 0.6, + "grad_norm": 1.3531927766106364, + "learning_rate": 3.661874514412539e-06, + "loss": 0.3094, + "step": 20640 + }, + { + "epoch": 0.6, + "grad_norm": 1.4779351108664445, + "learning_rate": 3.6614219461350924e-06, + "loss": 0.2928, + "step": 20641 + }, + { + "epoch": 0.6, + "grad_norm": 1.2215612685148731, + "learning_rate": 3.6609693896706138e-06, + "loss": 0.2917, + "step": 20642 + }, + { + "epoch": 0.6, + "grad_norm": 1.4788215669475986, + "learning_rate": 3.6605168450230943e-06, + "loss": 0.2987, + "step": 20643 + }, + { + "epoch": 0.6, + "grad_norm": 1.4138360750088144, + "learning_rate": 3.6600643121965278e-06, + "loss": 0.2778, + "step": 20644 + }, + { + "epoch": 0.6, + "grad_norm": 1.9152441044415132, + "learning_rate": 3.6596117911949092e-06, + "loss": 0.3123, + "step": 20645 + }, + { + "epoch": 0.6, + "grad_norm": 1.3820520892391082, + "learning_rate": 3.659159282022232e-06, + "loss": 0.3108, + "step": 20646 + }, + { + "epoch": 0.6, + "grad_norm": 1.3074342898525078, + "learning_rate": 3.6587067846824886e-06, + "loss": 0.2898, + "step": 20647 + }, + { + "epoch": 0.6, + "grad_norm": 1.3587933017169274, + "learning_rate": 3.6582542991796723e-06, + "loss": 0.2964, + "step": 20648 + }, + { + "epoch": 0.6, + "grad_norm": 1.4695325136363298, + "learning_rate": 3.657801825517776e-06, + "loss": 0.298, + "step": 20649 + }, + { + "epoch": 0.6, + "grad_norm": 1.2670481993561802, + "learning_rate": 3.6573493637007945e-06, + "loss": 0.2941, + "step": 20650 + }, + { + "epoch": 0.6, + "grad_norm": 1.2704898777308082, + "learning_rate": 3.6568969137327206e-06, + "loss": 0.2868, + "step": 20651 + }, + { + "epoch": 0.6, + "grad_norm": 1.8752669737059016, + "learning_rate": 3.6564444756175454e-06, + "loss": 0.3026, + "step": 20652 + }, + { + "epoch": 0.6, + "grad_norm": 1.2395759015845387, + "learning_rate": 3.655992049359262e-06, + "loss": 0.2799, + "step": 20653 + }, + { + "epoch": 0.6, + "grad_norm": 1.5361330161022317, + "learning_rate": 3.6555396349618643e-06, + "loss": 0.2809, + "step": 20654 + }, + { + "epoch": 0.6, + "grad_norm": 1.3386388604107609, + "learning_rate": 3.655087232429345e-06, + "loss": 0.2961, + "step": 20655 + }, + { + "epoch": 0.6, + "grad_norm": 1.3112430513586628, + "learning_rate": 3.6546348417656947e-06, + "loss": 0.2802, + "step": 20656 + }, + { + "epoch": 0.6, + "grad_norm": 1.3289040840117126, + "learning_rate": 3.6541824629749068e-06, + "loss": 0.2947, + "step": 20657 + }, + { + "epoch": 0.6, + "grad_norm": 1.2464805351139914, + "learning_rate": 3.653730096060974e-06, + "loss": 0.3104, + "step": 20658 + }, + { + "epoch": 0.6, + "grad_norm": 1.2858851214274238, + "learning_rate": 3.6532777410278884e-06, + "loss": 0.283, + "step": 20659 + }, + { + "epoch": 0.6, + "grad_norm": 1.3591103701699698, + "learning_rate": 3.6528253978796413e-06, + "loss": 0.2963, + "step": 20660 + }, + { + "epoch": 0.6, + "grad_norm": 1.2692631410786495, + "learning_rate": 3.652373066620225e-06, + "loss": 0.2753, + "step": 20661 + }, + { + "epoch": 0.6, + "grad_norm": 1.2793628160185133, + "learning_rate": 3.651920747253631e-06, + "loss": 0.3074, + "step": 20662 + }, + { + "epoch": 0.6, + "grad_norm": 1.5205227008731934, + "learning_rate": 3.651468439783851e-06, + "loss": 0.305, + "step": 20663 + }, + { + "epoch": 0.6, + "grad_norm": 1.1264660498238996, + "learning_rate": 3.6510161442148783e-06, + "loss": 0.2856, + "step": 20664 + }, + { + "epoch": 0.6, + "grad_norm": 1.4390645266009279, + "learning_rate": 3.650563860550705e-06, + "loss": 0.2866, + "step": 20665 + }, + { + "epoch": 0.6, + "grad_norm": 1.3683738254634887, + "learning_rate": 3.6501115887953175e-06, + "loss": 0.3016, + "step": 20666 + }, + { + "epoch": 0.6, + "grad_norm": 1.553099529667807, + "learning_rate": 3.6496593289527103e-06, + "loss": 0.297, + "step": 20667 + }, + { + "epoch": 0.6, + "grad_norm": 1.3152937797874111, + "learning_rate": 3.6492070810268754e-06, + "loss": 0.2889, + "step": 20668 + }, + { + "epoch": 0.6, + "grad_norm": 1.487031909111657, + "learning_rate": 3.648754845021802e-06, + "loss": 0.3118, + "step": 20669 + }, + { + "epoch": 0.6, + "grad_norm": 1.3157288402685448, + "learning_rate": 3.6483026209414835e-06, + "loss": 0.3283, + "step": 20670 + }, + { + "epoch": 0.6, + "grad_norm": 1.58525194266439, + "learning_rate": 3.647850408789908e-06, + "loss": 0.31, + "step": 20671 + }, + { + "epoch": 0.6, + "grad_norm": 1.9332236378136032, + "learning_rate": 3.6473982085710678e-06, + "loss": 0.2916, + "step": 20672 + }, + { + "epoch": 0.6, + "grad_norm": 1.6472247767268056, + "learning_rate": 3.646946020288954e-06, + "loss": 0.2871, + "step": 20673 + }, + { + "epoch": 0.6, + "grad_norm": 1.2961605569367631, + "learning_rate": 3.646493843947557e-06, + "loss": 0.3019, + "step": 20674 + }, + { + "epoch": 0.6, + "grad_norm": 1.2739068821504318, + "learning_rate": 3.6460416795508658e-06, + "loss": 0.2885, + "step": 20675 + }, + { + "epoch": 0.6, + "grad_norm": 1.3206245530463248, + "learning_rate": 3.645589527102872e-06, + "loss": 0.2918, + "step": 20676 + }, + { + "epoch": 0.6, + "grad_norm": 1.4115861592026115, + "learning_rate": 3.6451373866075657e-06, + "loss": 0.2878, + "step": 20677 + }, + { + "epoch": 0.6, + "grad_norm": 1.7004112655241057, + "learning_rate": 3.644685258068938e-06, + "loss": 0.3364, + "step": 20678 + }, + { + "epoch": 0.6, + "grad_norm": 1.2320587891742305, + "learning_rate": 3.644233141490977e-06, + "loss": 0.2756, + "step": 20679 + }, + { + "epoch": 0.6, + "grad_norm": 1.2747916046744863, + "learning_rate": 3.6437810368776727e-06, + "loss": 0.2769, + "step": 20680 + }, + { + "epoch": 0.6, + "grad_norm": 1.3211160265207926, + "learning_rate": 3.6433289442330165e-06, + "loss": 0.3083, + "step": 20681 + }, + { + "epoch": 0.6, + "grad_norm": 1.2396606305557394, + "learning_rate": 3.642876863560999e-06, + "loss": 0.2926, + "step": 20682 + }, + { + "epoch": 0.6, + "grad_norm": 0.9076433768596194, + "learning_rate": 3.6424247948656067e-06, + "loss": 0.5803, + "step": 20683 + }, + { + "epoch": 0.6, + "grad_norm": 1.4573810741903048, + "learning_rate": 3.6419727381508308e-06, + "loss": 0.2872, + "step": 20684 + }, + { + "epoch": 0.6, + "grad_norm": 1.3256344357112384, + "learning_rate": 3.6415206934206596e-06, + "loss": 0.2928, + "step": 20685 + }, + { + "epoch": 0.6, + "grad_norm": 1.3995262747536201, + "learning_rate": 3.6410686606790845e-06, + "loss": 0.2892, + "step": 20686 + }, + { + "epoch": 0.6, + "grad_norm": 1.7150666731196407, + "learning_rate": 3.640616639930094e-06, + "loss": 0.3169, + "step": 20687 + }, + { + "epoch": 0.6, + "grad_norm": 2.303777511391723, + "learning_rate": 3.640164631177676e-06, + "loss": 0.2775, + "step": 20688 + }, + { + "epoch": 0.6, + "grad_norm": 1.336859586954962, + "learning_rate": 3.63971263442582e-06, + "loss": 0.3061, + "step": 20689 + }, + { + "epoch": 0.6, + "grad_norm": 1.3989703195660579, + "learning_rate": 3.6392606496785157e-06, + "loss": 0.2965, + "step": 20690 + }, + { + "epoch": 0.6, + "grad_norm": 1.544654374118664, + "learning_rate": 3.638808676939751e-06, + "loss": 0.2666, + "step": 20691 + }, + { + "epoch": 0.6, + "grad_norm": 1.2618949484402298, + "learning_rate": 3.638356716213515e-06, + "loss": 0.2968, + "step": 20692 + }, + { + "epoch": 0.6, + "grad_norm": 1.4030451191700941, + "learning_rate": 3.637904767503798e-06, + "loss": 0.3082, + "step": 20693 + }, + { + "epoch": 0.6, + "grad_norm": 1.1577413807025334, + "learning_rate": 3.637452830814584e-06, + "loss": 0.2852, + "step": 20694 + }, + { + "epoch": 0.6, + "grad_norm": 1.3727589476333457, + "learning_rate": 3.6370009061498636e-06, + "loss": 0.2995, + "step": 20695 + }, + { + "epoch": 0.6, + "grad_norm": 1.7571431792138177, + "learning_rate": 3.6365489935136255e-06, + "loss": 0.3022, + "step": 20696 + }, + { + "epoch": 0.6, + "grad_norm": 1.9872274651325645, + "learning_rate": 3.6360970929098593e-06, + "loss": 0.3001, + "step": 20697 + }, + { + "epoch": 0.6, + "grad_norm": 1.2062574549775367, + "learning_rate": 3.6356452043425503e-06, + "loss": 0.2927, + "step": 20698 + }, + { + "epoch": 0.6, + "grad_norm": 1.2831346243614006, + "learning_rate": 3.635193327815687e-06, + "loss": 0.2863, + "step": 20699 + }, + { + "epoch": 0.6, + "grad_norm": 1.2948509071385998, + "learning_rate": 3.6347414633332578e-06, + "loss": 0.2791, + "step": 20700 + }, + { + "epoch": 0.6, + "grad_norm": 1.242417066978764, + "learning_rate": 3.634289610899252e-06, + "loss": 0.2711, + "step": 20701 + }, + { + "epoch": 0.6, + "grad_norm": 1.1541722570933381, + "learning_rate": 3.6338377705176534e-06, + "loss": 0.2775, + "step": 20702 + }, + { + "epoch": 0.6, + "grad_norm": 2.514222952606456, + "learning_rate": 3.633385942192452e-06, + "loss": 0.2944, + "step": 20703 + }, + { + "epoch": 0.6, + "grad_norm": 1.3038461888494755, + "learning_rate": 3.6329341259276358e-06, + "loss": 0.2696, + "step": 20704 + }, + { + "epoch": 0.6, + "grad_norm": 1.2669660552240216, + "learning_rate": 3.6324823217271897e-06, + "loss": 0.2884, + "step": 20705 + }, + { + "epoch": 0.6, + "grad_norm": 1.3647582216262641, + "learning_rate": 3.632030529595104e-06, + "loss": 0.2912, + "step": 20706 + }, + { + "epoch": 0.6, + "grad_norm": 1.2427197948063953, + "learning_rate": 3.6315787495353638e-06, + "loss": 0.2883, + "step": 20707 + }, + { + "epoch": 0.6, + "grad_norm": 1.1949904890160556, + "learning_rate": 3.6311269815519557e-06, + "loss": 0.2882, + "step": 20708 + }, + { + "epoch": 0.6, + "grad_norm": 1.468176993128722, + "learning_rate": 3.6306752256488662e-06, + "loss": 0.2905, + "step": 20709 + }, + { + "epoch": 0.6, + "grad_norm": 1.3198045010450083, + "learning_rate": 3.630223481830085e-06, + "loss": 0.3033, + "step": 20710 + }, + { + "epoch": 0.6, + "grad_norm": 1.3243287113033677, + "learning_rate": 3.6297717500995956e-06, + "loss": 0.2969, + "step": 20711 + }, + { + "epoch": 0.6, + "grad_norm": 1.2902871524877533, + "learning_rate": 3.629320030461386e-06, + "loss": 0.3212, + "step": 20712 + }, + { + "epoch": 0.6, + "grad_norm": 1.3223020062469357, + "learning_rate": 3.628868322919442e-06, + "loss": 0.279, + "step": 20713 + }, + { + "epoch": 0.6, + "grad_norm": 1.7030263549019318, + "learning_rate": 3.628416627477751e-06, + "loss": 0.3068, + "step": 20714 + }, + { + "epoch": 0.6, + "grad_norm": 1.7343999390327085, + "learning_rate": 3.627964944140298e-06, + "loss": 0.2883, + "step": 20715 + }, + { + "epoch": 0.6, + "grad_norm": 1.353951926146751, + "learning_rate": 3.6275132729110698e-06, + "loss": 0.2966, + "step": 20716 + }, + { + "epoch": 0.6, + "grad_norm": 1.3676544852282686, + "learning_rate": 3.627061613794052e-06, + "loss": 0.3146, + "step": 20717 + }, + { + "epoch": 0.6, + "grad_norm": 1.3588646323180136, + "learning_rate": 3.6266099667932308e-06, + "loss": 0.2993, + "step": 20718 + }, + { + "epoch": 0.6, + "grad_norm": 1.2394380582440192, + "learning_rate": 3.6261583319125914e-06, + "loss": 0.2839, + "step": 20719 + }, + { + "epoch": 0.6, + "grad_norm": 2.7113559254011546, + "learning_rate": 3.6257067091561204e-06, + "loss": 0.311, + "step": 20720 + }, + { + "epoch": 0.6, + "grad_norm": 1.633856193722984, + "learning_rate": 3.6252550985278047e-06, + "loss": 0.2944, + "step": 20721 + }, + { + "epoch": 0.6, + "grad_norm": 1.3185564091438637, + "learning_rate": 3.624803500031626e-06, + "loss": 0.2849, + "step": 20722 + }, + { + "epoch": 0.6, + "grad_norm": 1.4674236719076261, + "learning_rate": 3.624351913671571e-06, + "loss": 0.3249, + "step": 20723 + }, + { + "epoch": 0.6, + "grad_norm": 1.3591247060037974, + "learning_rate": 3.623900339451627e-06, + "loss": 0.3063, + "step": 20724 + }, + { + "epoch": 0.6, + "grad_norm": 0.9258685851902932, + "learning_rate": 3.6234487773757774e-06, + "loss": 0.5926, + "step": 20725 + }, + { + "epoch": 0.6, + "grad_norm": 1.1853404771262703, + "learning_rate": 3.6229972274480072e-06, + "loss": 0.2894, + "step": 20726 + }, + { + "epoch": 0.6, + "grad_norm": 1.3248624311486996, + "learning_rate": 3.622545689672302e-06, + "loss": 0.3016, + "step": 20727 + }, + { + "epoch": 0.6, + "grad_norm": 1.39856567699815, + "learning_rate": 3.6220941640526465e-06, + "loss": 0.2944, + "step": 20728 + }, + { + "epoch": 0.6, + "grad_norm": 1.3836024181576096, + "learning_rate": 3.6216426505930257e-06, + "loss": 0.3039, + "step": 20729 + }, + { + "epoch": 0.6, + "grad_norm": 0.9875735992663697, + "learning_rate": 3.6211911492974226e-06, + "loss": 0.6663, + "step": 20730 + }, + { + "epoch": 0.6, + "grad_norm": 1.6234453354976652, + "learning_rate": 3.6207396601698235e-06, + "loss": 0.2764, + "step": 20731 + }, + { + "epoch": 0.6, + "grad_norm": 1.433595526126242, + "learning_rate": 3.6202881832142124e-06, + "loss": 0.3222, + "step": 20732 + }, + { + "epoch": 0.6, + "grad_norm": 1.2528889334998587, + "learning_rate": 3.619836718434574e-06, + "loss": 0.3088, + "step": 20733 + }, + { + "epoch": 0.6, + "grad_norm": 1.2494159715951516, + "learning_rate": 3.6193852658348906e-06, + "loss": 0.2777, + "step": 20734 + }, + { + "epoch": 0.6, + "grad_norm": 1.3002381761829396, + "learning_rate": 3.6189338254191475e-06, + "loss": 0.2853, + "step": 20735 + }, + { + "epoch": 0.6, + "grad_norm": 1.2919352804962496, + "learning_rate": 3.6184823971913292e-06, + "loss": 0.2788, + "step": 20736 + }, + { + "epoch": 0.6, + "grad_norm": 1.2439629719216914, + "learning_rate": 3.61803098115542e-06, + "loss": 0.2935, + "step": 20737 + }, + { + "epoch": 0.6, + "grad_norm": 1.2034087620504044, + "learning_rate": 3.6175795773154014e-06, + "loss": 0.2866, + "step": 20738 + }, + { + "epoch": 0.6, + "grad_norm": 1.4830362623349338, + "learning_rate": 3.6171281856752586e-06, + "loss": 0.2906, + "step": 20739 + }, + { + "epoch": 0.6, + "grad_norm": 1.3834625768809126, + "learning_rate": 3.6166768062389746e-06, + "loss": 0.3042, + "step": 20740 + }, + { + "epoch": 0.6, + "grad_norm": 2.0853838410147416, + "learning_rate": 3.6162254390105335e-06, + "loss": 0.3, + "step": 20741 + }, + { + "epoch": 0.6, + "grad_norm": 1.3807030027419815, + "learning_rate": 3.6157740839939193e-06, + "loss": 0.2977, + "step": 20742 + }, + { + "epoch": 0.6, + "grad_norm": 1.7349082650080105, + "learning_rate": 3.6153227411931135e-06, + "loss": 0.2769, + "step": 20743 + }, + { + "epoch": 0.6, + "grad_norm": 1.2675173577094743, + "learning_rate": 3.614871410612099e-06, + "loss": 0.291, + "step": 20744 + }, + { + "epoch": 0.6, + "grad_norm": 1.2411661043477353, + "learning_rate": 3.61442009225486e-06, + "loss": 0.2918, + "step": 20745 + }, + { + "epoch": 0.6, + "grad_norm": 1.3040830251925386, + "learning_rate": 3.6139687861253805e-06, + "loss": 0.2835, + "step": 20746 + }, + { + "epoch": 0.6, + "grad_norm": 1.3896882330094524, + "learning_rate": 3.6135174922276407e-06, + "loss": 0.3017, + "step": 20747 + }, + { + "epoch": 0.6, + "grad_norm": 1.59655275172373, + "learning_rate": 3.6130662105656234e-06, + "loss": 0.2773, + "step": 20748 + }, + { + "epoch": 0.6, + "grad_norm": 1.502134935717072, + "learning_rate": 3.6126149411433153e-06, + "loss": 0.2948, + "step": 20749 + }, + { + "epoch": 0.6, + "grad_norm": 2.0075697648311577, + "learning_rate": 3.6121636839646934e-06, + "loss": 0.3262, + "step": 20750 + }, + { + "epoch": 0.6, + "grad_norm": 0.9353761190398796, + "learning_rate": 3.611712439033742e-06, + "loss": 0.5656, + "step": 20751 + }, + { + "epoch": 0.6, + "grad_norm": 1.2465928823927892, + "learning_rate": 3.6112612063544457e-06, + "loss": 0.3201, + "step": 20752 + }, + { + "epoch": 0.6, + "grad_norm": 1.3853826370443072, + "learning_rate": 3.6108099859307832e-06, + "loss": 0.2951, + "step": 20753 + }, + { + "epoch": 0.6, + "grad_norm": 1.3590326445359238, + "learning_rate": 3.6103587777667377e-06, + "loss": 0.2803, + "step": 20754 + }, + { + "epoch": 0.6, + "grad_norm": 1.3086927357279052, + "learning_rate": 3.609907581866291e-06, + "loss": 0.2871, + "step": 20755 + }, + { + "epoch": 0.6, + "grad_norm": 1.3597451145367903, + "learning_rate": 3.6094563982334274e-06, + "loss": 0.3198, + "step": 20756 + }, + { + "epoch": 0.6, + "grad_norm": 1.3473912615515784, + "learning_rate": 3.609005226872125e-06, + "loss": 0.3041, + "step": 20757 + }, + { + "epoch": 0.6, + "grad_norm": 1.637831913266684, + "learning_rate": 3.6085540677863667e-06, + "loss": 0.3354, + "step": 20758 + }, + { + "epoch": 0.6, + "grad_norm": 1.4179830267996323, + "learning_rate": 3.6081029209801342e-06, + "loss": 0.298, + "step": 20759 + }, + { + "epoch": 0.6, + "grad_norm": 1.261903202695819, + "learning_rate": 3.607651786457409e-06, + "loss": 0.2868, + "step": 20760 + }, + { + "epoch": 0.6, + "grad_norm": 1.6376357561679764, + "learning_rate": 3.6072006642221736e-06, + "loss": 0.2904, + "step": 20761 + }, + { + "epoch": 0.6, + "grad_norm": 0.9998909249161201, + "learning_rate": 3.606749554278406e-06, + "loss": 0.6194, + "step": 20762 + }, + { + "epoch": 0.6, + "grad_norm": 1.28844274672145, + "learning_rate": 3.6062984566300896e-06, + "loss": 0.2751, + "step": 20763 + }, + { + "epoch": 0.6, + "grad_norm": 1.2158584305618274, + "learning_rate": 3.6058473712812046e-06, + "loss": 0.2817, + "step": 20764 + }, + { + "epoch": 0.6, + "grad_norm": 1.4052292698031141, + "learning_rate": 3.6053962982357332e-06, + "loss": 0.2923, + "step": 20765 + }, + { + "epoch": 0.6, + "grad_norm": 1.3389155971700093, + "learning_rate": 3.604945237497654e-06, + "loss": 0.2858, + "step": 20766 + }, + { + "epoch": 0.6, + "grad_norm": 1.2690258148218856, + "learning_rate": 3.6044941890709483e-06, + "loss": 0.2978, + "step": 20767 + }, + { + "epoch": 0.6, + "grad_norm": 1.3627199831776284, + "learning_rate": 3.6040431529595964e-06, + "loss": 0.3293, + "step": 20768 + }, + { + "epoch": 0.6, + "grad_norm": 1.2161374315417968, + "learning_rate": 3.6035921291675815e-06, + "loss": 0.2699, + "step": 20769 + }, + { + "epoch": 0.6, + "grad_norm": 1.3454007008391227, + "learning_rate": 3.6031411176988795e-06, + "loss": 0.2881, + "step": 20770 + }, + { + "epoch": 0.6, + "grad_norm": 1.2428557680063457, + "learning_rate": 3.6026901185574724e-06, + "loss": 0.2716, + "step": 20771 + }, + { + "epoch": 0.6, + "grad_norm": 1.2112333581586687, + "learning_rate": 3.6022391317473416e-06, + "loss": 0.2857, + "step": 20772 + }, + { + "epoch": 0.6, + "grad_norm": 1.3659497001575394, + "learning_rate": 3.601788157272466e-06, + "loss": 0.2922, + "step": 20773 + }, + { + "epoch": 0.6, + "grad_norm": 1.5933851509202974, + "learning_rate": 3.6013371951368247e-06, + "loss": 0.2966, + "step": 20774 + }, + { + "epoch": 0.6, + "grad_norm": 1.9241560033169325, + "learning_rate": 3.600886245344398e-06, + "loss": 0.2806, + "step": 20775 + }, + { + "epoch": 0.6, + "grad_norm": 0.9147615090384679, + "learning_rate": 3.600435307899166e-06, + "loss": 0.5737, + "step": 20776 + }, + { + "epoch": 0.6, + "grad_norm": 1.2949401960195313, + "learning_rate": 3.5999843828051084e-06, + "loss": 0.2912, + "step": 20777 + }, + { + "epoch": 0.6, + "grad_norm": 1.311002995879389, + "learning_rate": 3.5995334700662055e-06, + "loss": 0.2963, + "step": 20778 + }, + { + "epoch": 0.6, + "grad_norm": 1.255784193525268, + "learning_rate": 3.5990825696864344e-06, + "loss": 0.2791, + "step": 20779 + }, + { + "epoch": 0.6, + "grad_norm": 1.4585971799884618, + "learning_rate": 3.598631681669774e-06, + "loss": 0.3061, + "step": 20780 + }, + { + "epoch": 0.6, + "grad_norm": 0.9535948500505426, + "learning_rate": 3.598180806020205e-06, + "loss": 0.6069, + "step": 20781 + }, + { + "epoch": 0.6, + "grad_norm": 1.2830964242560632, + "learning_rate": 3.5977299427417055e-06, + "loss": 0.2998, + "step": 20782 + }, + { + "epoch": 0.6, + "grad_norm": 1.3958818880615127, + "learning_rate": 3.597279091838255e-06, + "loss": 0.3091, + "step": 20783 + }, + { + "epoch": 0.6, + "grad_norm": 1.2085574042987923, + "learning_rate": 3.5968282533138332e-06, + "loss": 0.2967, + "step": 20784 + }, + { + "epoch": 0.6, + "grad_norm": 1.3034343499283652, + "learning_rate": 3.596377427172416e-06, + "loss": 0.3089, + "step": 20785 + }, + { + "epoch": 0.6, + "grad_norm": 1.4693806855652962, + "learning_rate": 3.595926613417985e-06, + "loss": 0.341, + "step": 20786 + }, + { + "epoch": 0.6, + "grad_norm": 2.845140190067724, + "learning_rate": 3.595475812054516e-06, + "loss": 0.2977, + "step": 20787 + }, + { + "epoch": 0.6, + "grad_norm": 1.2260426952016505, + "learning_rate": 3.5950250230859896e-06, + "loss": 0.2847, + "step": 20788 + }, + { + "epoch": 0.6, + "grad_norm": 1.3368666049766111, + "learning_rate": 3.5945742465163824e-06, + "loss": 0.2815, + "step": 20789 + }, + { + "epoch": 0.6, + "grad_norm": 2.384278365958395, + "learning_rate": 3.594123482349673e-06, + "loss": 0.2951, + "step": 20790 + }, + { + "epoch": 0.6, + "grad_norm": 1.8418203390481676, + "learning_rate": 3.593672730589839e-06, + "loss": 0.2909, + "step": 20791 + }, + { + "epoch": 0.6, + "grad_norm": 1.337542149896952, + "learning_rate": 3.59322199124086e-06, + "loss": 0.2879, + "step": 20792 + }, + { + "epoch": 0.6, + "grad_norm": 1.3205920352015317, + "learning_rate": 3.5927712643067116e-06, + "loss": 0.2829, + "step": 20793 + }, + { + "epoch": 0.6, + "grad_norm": 1.696652958761933, + "learning_rate": 3.592320549791372e-06, + "loss": 0.3204, + "step": 20794 + }, + { + "epoch": 0.6, + "grad_norm": 1.2343156454318065, + "learning_rate": 3.5918698476988202e-06, + "loss": 0.2741, + "step": 20795 + }, + { + "epoch": 0.6, + "grad_norm": 1.2659172499291749, + "learning_rate": 3.5914191580330316e-06, + "loss": 0.2983, + "step": 20796 + }, + { + "epoch": 0.6, + "grad_norm": 1.3833641331101967, + "learning_rate": 3.590968480797986e-06, + "loss": 0.291, + "step": 20797 + }, + { + "epoch": 0.6, + "grad_norm": 1.2635204705221512, + "learning_rate": 3.590517815997658e-06, + "loss": 0.3031, + "step": 20798 + }, + { + "epoch": 0.6, + "grad_norm": 21.68253817310177, + "learning_rate": 3.590067163636026e-06, + "loss": 0.298, + "step": 20799 + }, + { + "epoch": 0.6, + "grad_norm": 1.2138046666900153, + "learning_rate": 3.5896165237170666e-06, + "loss": 0.2798, + "step": 20800 + }, + { + "epoch": 0.6, + "grad_norm": 1.2883520732934783, + "learning_rate": 3.5891658962447585e-06, + "loss": 0.2883, + "step": 20801 + }, + { + "epoch": 0.6, + "grad_norm": 2.1667634577832615, + "learning_rate": 3.588715281223076e-06, + "loss": 0.3047, + "step": 20802 + }, + { + "epoch": 0.6, + "grad_norm": 2.1072104414232244, + "learning_rate": 3.5882646786559965e-06, + "loss": 0.3179, + "step": 20803 + }, + { + "epoch": 0.6, + "grad_norm": 1.2687865887507095, + "learning_rate": 3.587814088547497e-06, + "loss": 0.3074, + "step": 20804 + }, + { + "epoch": 0.6, + "grad_norm": 1.321007476922686, + "learning_rate": 3.587363510901555e-06, + "loss": 0.3137, + "step": 20805 + }, + { + "epoch": 0.6, + "grad_norm": 1.3741586804013852, + "learning_rate": 3.5869129457221457e-06, + "loss": 0.3048, + "step": 20806 + }, + { + "epoch": 0.6, + "grad_norm": 1.3549764257497612, + "learning_rate": 3.586462393013245e-06, + "loss": 0.2832, + "step": 20807 + }, + { + "epoch": 0.6, + "grad_norm": 1.4221046757755507, + "learning_rate": 3.586011852778829e-06, + "loss": 0.3001, + "step": 20808 + }, + { + "epoch": 0.6, + "grad_norm": 1.4760541763400805, + "learning_rate": 3.585561325022874e-06, + "loss": 0.28, + "step": 20809 + }, + { + "epoch": 0.6, + "grad_norm": 1.260156466792384, + "learning_rate": 3.5851108097493547e-06, + "loss": 0.2877, + "step": 20810 + }, + { + "epoch": 0.6, + "grad_norm": 1.3276938530988651, + "learning_rate": 3.5846603069622506e-06, + "loss": 0.2789, + "step": 20811 + }, + { + "epoch": 0.6, + "grad_norm": 1.3651676546544498, + "learning_rate": 3.5842098166655336e-06, + "loss": 0.2735, + "step": 20812 + }, + { + "epoch": 0.6, + "grad_norm": 1.3926753663056777, + "learning_rate": 3.5837593388631808e-06, + "loss": 0.2867, + "step": 20813 + }, + { + "epoch": 0.6, + "grad_norm": 1.3626914775752534, + "learning_rate": 3.583308873559167e-06, + "loss": 0.2953, + "step": 20814 + }, + { + "epoch": 0.6, + "grad_norm": 1.2608234901621003, + "learning_rate": 3.5828584207574698e-06, + "loss": 0.2964, + "step": 20815 + }, + { + "epoch": 0.6, + "grad_norm": 1.4835002060634401, + "learning_rate": 3.5824079804620615e-06, + "loss": 0.3046, + "step": 20816 + }, + { + "epoch": 0.6, + "grad_norm": 2.6056345468934645, + "learning_rate": 3.5819575526769178e-06, + "loss": 0.3044, + "step": 20817 + }, + { + "epoch": 0.6, + "grad_norm": 1.24225387950433, + "learning_rate": 3.581507137406015e-06, + "loss": 0.2699, + "step": 20818 + }, + { + "epoch": 0.6, + "grad_norm": 1.5250950509406715, + "learning_rate": 3.5810567346533274e-06, + "loss": 0.2841, + "step": 20819 + }, + { + "epoch": 0.6, + "grad_norm": 1.567029598529576, + "learning_rate": 3.580606344422831e-06, + "loss": 0.2862, + "step": 20820 + }, + { + "epoch": 0.6, + "grad_norm": 1.3967500943944267, + "learning_rate": 3.580155966718499e-06, + "loss": 0.2932, + "step": 20821 + }, + { + "epoch": 0.6, + "grad_norm": 1.3207683025082493, + "learning_rate": 3.5797056015443056e-06, + "loss": 0.2827, + "step": 20822 + }, + { + "epoch": 0.6, + "grad_norm": 1.4911167361396218, + "learning_rate": 3.5792552489042255e-06, + "loss": 0.2838, + "step": 20823 + }, + { + "epoch": 0.6, + "grad_norm": 1.4657316438396257, + "learning_rate": 3.5788049088022354e-06, + "loss": 0.2997, + "step": 20824 + }, + { + "epoch": 0.6, + "grad_norm": 1.2286840836217447, + "learning_rate": 3.5783545812423067e-06, + "loss": 0.2852, + "step": 20825 + }, + { + "epoch": 0.6, + "grad_norm": 1.3504522574517315, + "learning_rate": 3.577904266228414e-06, + "loss": 0.3038, + "step": 20826 + }, + { + "epoch": 0.6, + "grad_norm": 1.3063700137061836, + "learning_rate": 3.5774539637645324e-06, + "loss": 0.2708, + "step": 20827 + }, + { + "epoch": 0.6, + "grad_norm": 1.514135343797653, + "learning_rate": 3.5770036738546367e-06, + "loss": 0.2955, + "step": 20828 + }, + { + "epoch": 0.6, + "grad_norm": 1.2712736938819564, + "learning_rate": 3.5765533965026977e-06, + "loss": 0.2819, + "step": 20829 + }, + { + "epoch": 0.6, + "grad_norm": 1.3268309615651894, + "learning_rate": 3.576103131712691e-06, + "loss": 0.2866, + "step": 20830 + }, + { + "epoch": 0.6, + "grad_norm": 1.3358594786908775, + "learning_rate": 3.5756528794885904e-06, + "loss": 0.2807, + "step": 20831 + }, + { + "epoch": 0.6, + "grad_norm": 1.5646297055635146, + "learning_rate": 3.575202639834368e-06, + "loss": 0.2765, + "step": 20832 + }, + { + "epoch": 0.6, + "grad_norm": 1.376042726189118, + "learning_rate": 3.574752412754e-06, + "loss": 0.3171, + "step": 20833 + }, + { + "epoch": 0.6, + "grad_norm": 2.1791960325220177, + "learning_rate": 3.574302198251458e-06, + "loss": 0.3035, + "step": 20834 + }, + { + "epoch": 0.6, + "grad_norm": 1.3597483676577882, + "learning_rate": 3.573851996330713e-06, + "loss": 0.2993, + "step": 20835 + }, + { + "epoch": 0.6, + "grad_norm": 1.3712345470295817, + "learning_rate": 3.573401806995741e-06, + "loss": 0.3076, + "step": 20836 + }, + { + "epoch": 0.6, + "grad_norm": 1.6831638232025532, + "learning_rate": 3.572951630250513e-06, + "loss": 0.2761, + "step": 20837 + }, + { + "epoch": 0.6, + "grad_norm": 1.4839736380252764, + "learning_rate": 3.5725014660990026e-06, + "loss": 0.3121, + "step": 20838 + }, + { + "epoch": 0.6, + "grad_norm": 2.2973191515811964, + "learning_rate": 3.572051314545184e-06, + "loss": 0.296, + "step": 20839 + }, + { + "epoch": 0.6, + "grad_norm": 1.3104410259546617, + "learning_rate": 3.5716011755930268e-06, + "loss": 0.2924, + "step": 20840 + }, + { + "epoch": 0.6, + "grad_norm": 1.392086399697323, + "learning_rate": 3.5711510492465056e-06, + "loss": 0.3383, + "step": 20841 + }, + { + "epoch": 0.6, + "grad_norm": 1.5136438602844475, + "learning_rate": 3.5707009355095925e-06, + "loss": 0.2841, + "step": 20842 + }, + { + "epoch": 0.6, + "grad_norm": 1.556319160103078, + "learning_rate": 3.57025083438626e-06, + "loss": 0.2916, + "step": 20843 + }, + { + "epoch": 0.6, + "grad_norm": 1.562637737449352, + "learning_rate": 3.5698007458804786e-06, + "loss": 0.2888, + "step": 20844 + }, + { + "epoch": 0.6, + "grad_norm": 1.3505652866701088, + "learning_rate": 3.5693506699962223e-06, + "loss": 0.2917, + "step": 20845 + }, + { + "epoch": 0.6, + "grad_norm": 0.9537281484228943, + "learning_rate": 3.568900606737461e-06, + "loss": 0.5957, + "step": 20846 + }, + { + "epoch": 0.6, + "grad_norm": 1.414097832958407, + "learning_rate": 3.5684505561081695e-06, + "loss": 0.287, + "step": 20847 + }, + { + "epoch": 0.6, + "grad_norm": 1.4020941704522283, + "learning_rate": 3.568000518112317e-06, + "loss": 0.3248, + "step": 20848 + }, + { + "epoch": 0.6, + "grad_norm": 1.338513494143535, + "learning_rate": 3.567550492753875e-06, + "loss": 0.2915, + "step": 20849 + }, + { + "epoch": 0.6, + "grad_norm": 1.2394752078648708, + "learning_rate": 3.5671004800368165e-06, + "loss": 0.2913, + "step": 20850 + }, + { + "epoch": 0.6, + "grad_norm": 1.2998239837213668, + "learning_rate": 3.5666504799651124e-06, + "loss": 0.3026, + "step": 20851 + }, + { + "epoch": 0.6, + "grad_norm": 1.263354961677767, + "learning_rate": 3.566200492542734e-06, + "loss": 0.2961, + "step": 20852 + }, + { + "epoch": 0.6, + "grad_norm": 1.286099898035641, + "learning_rate": 3.565750517773651e-06, + "loss": 0.2746, + "step": 20853 + }, + { + "epoch": 0.6, + "grad_norm": 1.3580991829045619, + "learning_rate": 3.565300555661836e-06, + "loss": 0.285, + "step": 20854 + }, + { + "epoch": 0.6, + "grad_norm": 1.2484138611811184, + "learning_rate": 3.564850606211259e-06, + "loss": 0.289, + "step": 20855 + }, + { + "epoch": 0.6, + "grad_norm": 1.4593258602073513, + "learning_rate": 3.564400669425893e-06, + "loss": 0.3068, + "step": 20856 + }, + { + "epoch": 0.6, + "grad_norm": 1.258494440973769, + "learning_rate": 3.563950745309706e-06, + "loss": 0.2835, + "step": 20857 + }, + { + "epoch": 0.6, + "grad_norm": 1.3552929585707152, + "learning_rate": 3.5635008338666695e-06, + "loss": 0.3042, + "step": 20858 + }, + { + "epoch": 0.61, + "grad_norm": 1.301457436838761, + "learning_rate": 3.5630509351007535e-06, + "loss": 0.3097, + "step": 20859 + }, + { + "epoch": 0.61, + "grad_norm": 1.2910358444459744, + "learning_rate": 3.562601049015931e-06, + "loss": 0.2861, + "step": 20860 + }, + { + "epoch": 0.61, + "grad_norm": 1.6597013406252203, + "learning_rate": 3.5621511756161686e-06, + "loss": 0.2946, + "step": 20861 + }, + { + "epoch": 0.61, + "grad_norm": 1.3126525156578421, + "learning_rate": 3.56170131490544e-06, + "loss": 0.2814, + "step": 20862 + }, + { + "epoch": 0.61, + "grad_norm": 1.3288879897333084, + "learning_rate": 3.561251466887711e-06, + "loss": 0.2937, + "step": 20863 + }, + { + "epoch": 0.61, + "grad_norm": 1.6712913022123799, + "learning_rate": 3.5608016315669543e-06, + "loss": 0.306, + "step": 20864 + }, + { + "epoch": 0.61, + "grad_norm": 1.436284794014697, + "learning_rate": 3.560351808947139e-06, + "loss": 0.319, + "step": 20865 + }, + { + "epoch": 0.61, + "grad_norm": 1.2750459743088989, + "learning_rate": 3.5599019990322357e-06, + "loss": 0.2925, + "step": 20866 + }, + { + "epoch": 0.61, + "grad_norm": 1.2743853469791944, + "learning_rate": 3.559452201826212e-06, + "loss": 0.2927, + "step": 20867 + }, + { + "epoch": 0.61, + "grad_norm": 1.353099929380178, + "learning_rate": 3.5590024173330397e-06, + "loss": 0.2924, + "step": 20868 + }, + { + "epoch": 0.61, + "grad_norm": 1.3058692732141688, + "learning_rate": 3.558552645556686e-06, + "loss": 0.2899, + "step": 20869 + }, + { + "epoch": 0.61, + "grad_norm": 1.713004672573045, + "learning_rate": 3.5581028865011224e-06, + "loss": 0.2881, + "step": 20870 + }, + { + "epoch": 0.61, + "grad_norm": 1.2200891243183174, + "learning_rate": 3.5576531401703164e-06, + "loss": 0.2665, + "step": 20871 + }, + { + "epoch": 0.61, + "grad_norm": 1.3526242226978489, + "learning_rate": 3.557203406568237e-06, + "loss": 0.3201, + "step": 20872 + }, + { + "epoch": 0.61, + "grad_norm": 1.681034608262926, + "learning_rate": 3.5567536856988534e-06, + "loss": 0.2797, + "step": 20873 + }, + { + "epoch": 0.61, + "grad_norm": 1.295479912185608, + "learning_rate": 3.5563039775661345e-06, + "loss": 0.304, + "step": 20874 + }, + { + "epoch": 0.61, + "grad_norm": 1.3964171613243153, + "learning_rate": 3.5558542821740505e-06, + "loss": 0.279, + "step": 20875 + }, + { + "epoch": 0.61, + "grad_norm": 1.4043101389310302, + "learning_rate": 3.5554045995265666e-06, + "loss": 0.2843, + "step": 20876 + }, + { + "epoch": 0.61, + "grad_norm": 1.3914882915256543, + "learning_rate": 3.554954929627654e-06, + "loss": 0.2972, + "step": 20877 + }, + { + "epoch": 0.61, + "grad_norm": 1.4636017755571278, + "learning_rate": 3.55450527248128e-06, + "loss": 0.2762, + "step": 20878 + }, + { + "epoch": 0.61, + "grad_norm": 2.062628315111769, + "learning_rate": 3.554055628091414e-06, + "loss": 0.2836, + "step": 20879 + }, + { + "epoch": 0.61, + "grad_norm": 1.9131964137393436, + "learning_rate": 3.5536059964620216e-06, + "loss": 0.298, + "step": 20880 + }, + { + "epoch": 0.61, + "grad_norm": 1.2940721109512139, + "learning_rate": 3.553156377597073e-06, + "loss": 0.2944, + "step": 20881 + }, + { + "epoch": 0.61, + "grad_norm": 1.5912291790054636, + "learning_rate": 3.5527067715005346e-06, + "loss": 0.3337, + "step": 20882 + }, + { + "epoch": 0.61, + "grad_norm": 1.5120384976018346, + "learning_rate": 3.552257178176377e-06, + "loss": 0.2951, + "step": 20883 + }, + { + "epoch": 0.61, + "grad_norm": 1.2851343392760932, + "learning_rate": 3.551807597628565e-06, + "loss": 0.2829, + "step": 20884 + }, + { + "epoch": 0.61, + "grad_norm": 1.3510812765578422, + "learning_rate": 3.5513580298610663e-06, + "loss": 0.2868, + "step": 20885 + }, + { + "epoch": 0.61, + "grad_norm": 1.3455144211930374, + "learning_rate": 3.5509084748778486e-06, + "loss": 0.2727, + "step": 20886 + }, + { + "epoch": 0.61, + "grad_norm": 1.2996532853422795, + "learning_rate": 3.5504589326828816e-06, + "loss": 0.2862, + "step": 20887 + }, + { + "epoch": 0.61, + "grad_norm": 1.4224107331985971, + "learning_rate": 3.5500094032801293e-06, + "loss": 0.2934, + "step": 20888 + }, + { + "epoch": 0.61, + "grad_norm": 1.4983397761083772, + "learning_rate": 3.5495598866735604e-06, + "loss": 0.2977, + "step": 20889 + }, + { + "epoch": 0.61, + "grad_norm": 1.2938136505758862, + "learning_rate": 3.549110382867143e-06, + "loss": 0.2815, + "step": 20890 + }, + { + "epoch": 0.61, + "grad_norm": 1.6895660276247126, + "learning_rate": 3.5486608918648407e-06, + "loss": 0.3194, + "step": 20891 + }, + { + "epoch": 0.61, + "grad_norm": 1.4044122665279324, + "learning_rate": 3.5482114136706226e-06, + "loss": 0.3158, + "step": 20892 + }, + { + "epoch": 0.61, + "grad_norm": 1.6625348526250707, + "learning_rate": 3.547761948288454e-06, + "loss": 0.2946, + "step": 20893 + }, + { + "epoch": 0.61, + "grad_norm": 1.245537463331179, + "learning_rate": 3.5473124957223044e-06, + "loss": 0.2687, + "step": 20894 + }, + { + "epoch": 0.61, + "grad_norm": 1.276642921978246, + "learning_rate": 3.5468630559761362e-06, + "loss": 0.3055, + "step": 20895 + }, + { + "epoch": 0.61, + "grad_norm": 1.484996101147358, + "learning_rate": 3.5464136290539186e-06, + "loss": 0.2952, + "step": 20896 + }, + { + "epoch": 0.61, + "grad_norm": 1.4439162665286396, + "learning_rate": 3.5459642149596163e-06, + "loss": 0.3172, + "step": 20897 + }, + { + "epoch": 0.61, + "grad_norm": 1.3473118532025976, + "learning_rate": 3.5455148136971962e-06, + "loss": 0.295, + "step": 20898 + }, + { + "epoch": 0.61, + "grad_norm": 1.2701689693527645, + "learning_rate": 3.545065425270624e-06, + "loss": 0.2903, + "step": 20899 + }, + { + "epoch": 0.61, + "grad_norm": 1.3654320724209352, + "learning_rate": 3.5446160496838656e-06, + "loss": 0.2739, + "step": 20900 + }, + { + "epoch": 0.61, + "grad_norm": 1.436867811474705, + "learning_rate": 3.5441666869408863e-06, + "loss": 0.2951, + "step": 20901 + }, + { + "epoch": 0.61, + "grad_norm": 1.1663413930697877, + "learning_rate": 3.5437173370456535e-06, + "loss": 0.2979, + "step": 20902 + }, + { + "epoch": 0.61, + "grad_norm": 1.2798176606905376, + "learning_rate": 3.54326800000213e-06, + "loss": 0.3026, + "step": 20903 + }, + { + "epoch": 0.61, + "grad_norm": 1.2588180242561744, + "learning_rate": 3.5428186758142823e-06, + "loss": 0.3005, + "step": 20904 + }, + { + "epoch": 0.61, + "grad_norm": 1.3623623410150971, + "learning_rate": 3.5423693644860766e-06, + "loss": 0.2907, + "step": 20905 + }, + { + "epoch": 0.61, + "grad_norm": 1.8217002363058006, + "learning_rate": 3.5419200660214783e-06, + "loss": 0.298, + "step": 20906 + }, + { + "epoch": 0.61, + "grad_norm": 1.264016686327418, + "learning_rate": 3.54147078042445e-06, + "loss": 0.2822, + "step": 20907 + }, + { + "epoch": 0.61, + "grad_norm": 1.278532675776531, + "learning_rate": 3.5410215076989583e-06, + "loss": 0.3186, + "step": 20908 + }, + { + "epoch": 0.61, + "grad_norm": 1.287953081226619, + "learning_rate": 3.540572247848969e-06, + "loss": 0.2883, + "step": 20909 + }, + { + "epoch": 0.61, + "grad_norm": 1.3167372799729475, + "learning_rate": 3.540123000878445e-06, + "loss": 0.3206, + "step": 20910 + }, + { + "epoch": 0.61, + "grad_norm": 1.341950692873926, + "learning_rate": 3.539673766791353e-06, + "loss": 0.3387, + "step": 20911 + }, + { + "epoch": 0.61, + "grad_norm": 1.5369562388371472, + "learning_rate": 3.539224545591655e-06, + "loss": 0.3026, + "step": 20912 + }, + { + "epoch": 0.61, + "grad_norm": 1.2273457651651418, + "learning_rate": 3.5387753372833165e-06, + "loss": 0.2845, + "step": 20913 + }, + { + "epoch": 0.61, + "grad_norm": 1.4286934630432881, + "learning_rate": 3.5383261418703017e-06, + "loss": 0.3149, + "step": 20914 + }, + { + "epoch": 0.61, + "grad_norm": 1.2174350669952019, + "learning_rate": 3.5378769593565763e-06, + "loss": 0.287, + "step": 20915 + }, + { + "epoch": 0.61, + "grad_norm": 1.334452237627559, + "learning_rate": 3.5374277897461023e-06, + "loss": 0.2912, + "step": 20916 + }, + { + "epoch": 0.61, + "grad_norm": 1.3769705256954803, + "learning_rate": 3.5369786330428434e-06, + "loss": 0.2909, + "step": 20917 + }, + { + "epoch": 0.61, + "grad_norm": 1.4510401123637593, + "learning_rate": 3.536529489250767e-06, + "loss": 0.304, + "step": 20918 + }, + { + "epoch": 0.61, + "grad_norm": 1.277362417415915, + "learning_rate": 3.5360803583738325e-06, + "loss": 0.3041, + "step": 20919 + }, + { + "epoch": 0.61, + "grad_norm": 1.448664582245498, + "learning_rate": 3.535631240416004e-06, + "loss": 0.3087, + "step": 20920 + }, + { + "epoch": 0.61, + "grad_norm": 1.3595914186702176, + "learning_rate": 3.5351821353812476e-06, + "loss": 0.3169, + "step": 20921 + }, + { + "epoch": 0.61, + "grad_norm": 1.5531434890662854, + "learning_rate": 3.534733043273524e-06, + "loss": 0.3246, + "step": 20922 + }, + { + "epoch": 0.61, + "grad_norm": 1.3182511162797923, + "learning_rate": 3.5342839640967973e-06, + "loss": 0.3001, + "step": 20923 + }, + { + "epoch": 0.61, + "grad_norm": 1.290108412172171, + "learning_rate": 3.5338348978550313e-06, + "loss": 0.3226, + "step": 20924 + }, + { + "epoch": 0.61, + "grad_norm": 1.2100025966178767, + "learning_rate": 3.5333858445521895e-06, + "loss": 0.2898, + "step": 20925 + }, + { + "epoch": 0.61, + "grad_norm": 1.938354701960084, + "learning_rate": 3.532936804192233e-06, + "loss": 0.3002, + "step": 20926 + }, + { + "epoch": 0.61, + "grad_norm": 1.3114699060838444, + "learning_rate": 3.5324877767791257e-06, + "loss": 0.3121, + "step": 20927 + }, + { + "epoch": 0.61, + "grad_norm": 1.699585044294186, + "learning_rate": 3.532038762316829e-06, + "loss": 0.2905, + "step": 20928 + }, + { + "epoch": 0.61, + "grad_norm": 1.3544165580326095, + "learning_rate": 3.5315897608093077e-06, + "loss": 0.3006, + "step": 20929 + }, + { + "epoch": 0.61, + "grad_norm": 1.5760284408531389, + "learning_rate": 3.531140772260523e-06, + "loss": 0.2854, + "step": 20930 + }, + { + "epoch": 0.61, + "grad_norm": 2.8377868865522564, + "learning_rate": 3.530691796674437e-06, + "loss": 0.302, + "step": 20931 + }, + { + "epoch": 0.61, + "grad_norm": 1.7325996189202, + "learning_rate": 3.5302428340550117e-06, + "loss": 0.3122, + "step": 20932 + }, + { + "epoch": 0.61, + "grad_norm": 1.5495314109584644, + "learning_rate": 3.52979388440621e-06, + "loss": 0.2779, + "step": 20933 + }, + { + "epoch": 0.61, + "grad_norm": 1.3892350162981273, + "learning_rate": 3.529344947731994e-06, + "loss": 0.2862, + "step": 20934 + }, + { + "epoch": 0.61, + "grad_norm": 1.2527882165396906, + "learning_rate": 3.528896024036325e-06, + "loss": 0.2766, + "step": 20935 + }, + { + "epoch": 0.61, + "grad_norm": 1.3236906531908696, + "learning_rate": 3.5284471133231648e-06, + "loss": 0.2934, + "step": 20936 + }, + { + "epoch": 0.61, + "grad_norm": 0.8999818228045289, + "learning_rate": 3.527998215596475e-06, + "loss": 0.5838, + "step": 20937 + }, + { + "epoch": 0.61, + "grad_norm": 1.2979230440627874, + "learning_rate": 3.5275493308602184e-06, + "loss": 0.292, + "step": 20938 + }, + { + "epoch": 0.61, + "grad_norm": 1.2757379993876545, + "learning_rate": 3.527100459118354e-06, + "loss": 0.3095, + "step": 20939 + }, + { + "epoch": 0.61, + "grad_norm": 1.6046259604759883, + "learning_rate": 3.526651600374844e-06, + "loss": 0.3041, + "step": 20940 + }, + { + "epoch": 0.61, + "grad_norm": 1.5027462945102616, + "learning_rate": 3.5262027546336508e-06, + "loss": 0.3007, + "step": 20941 + }, + { + "epoch": 0.61, + "grad_norm": 1.3484149137472077, + "learning_rate": 3.5257539218987352e-06, + "loss": 0.3505, + "step": 20942 + }, + { + "epoch": 0.61, + "grad_norm": 1.4734883170602424, + "learning_rate": 3.525305102174057e-06, + "loss": 0.2818, + "step": 20943 + }, + { + "epoch": 0.61, + "grad_norm": 1.539573632260016, + "learning_rate": 3.5248562954635768e-06, + "loss": 0.2878, + "step": 20944 + }, + { + "epoch": 0.61, + "grad_norm": 1.3345566064865184, + "learning_rate": 3.524407501771257e-06, + "loss": 0.2784, + "step": 20945 + }, + { + "epoch": 0.61, + "grad_norm": 0.9037204780698899, + "learning_rate": 3.5239587211010596e-06, + "loss": 0.5157, + "step": 20946 + }, + { + "epoch": 0.61, + "grad_norm": 1.4106933351559325, + "learning_rate": 3.5235099534569394e-06, + "loss": 0.2938, + "step": 20947 + }, + { + "epoch": 0.61, + "grad_norm": 1.3710775661313686, + "learning_rate": 3.5230611988428624e-06, + "loss": 0.2985, + "step": 20948 + }, + { + "epoch": 0.61, + "grad_norm": 1.8918278026610145, + "learning_rate": 3.5226124572627858e-06, + "loss": 0.3065, + "step": 20949 + }, + { + "epoch": 0.61, + "grad_norm": 1.742433888488868, + "learning_rate": 3.52216372872067e-06, + "loss": 0.2867, + "step": 20950 + }, + { + "epoch": 0.61, + "grad_norm": 1.3816131100013096, + "learning_rate": 3.5217150132204757e-06, + "loss": 0.2767, + "step": 20951 + }, + { + "epoch": 0.61, + "grad_norm": 1.664920346437638, + "learning_rate": 3.5212663107661638e-06, + "loss": 0.2951, + "step": 20952 + }, + { + "epoch": 0.61, + "grad_norm": 1.3665170355687826, + "learning_rate": 3.520817621361693e-06, + "loss": 0.314, + "step": 20953 + }, + { + "epoch": 0.61, + "grad_norm": 1.3936848374578998, + "learning_rate": 3.520368945011023e-06, + "loss": 0.2986, + "step": 20954 + }, + { + "epoch": 0.61, + "grad_norm": 1.3437457906819608, + "learning_rate": 3.5199202817181126e-06, + "loss": 0.2973, + "step": 20955 + }, + { + "epoch": 0.61, + "grad_norm": 1.9755364123381423, + "learning_rate": 3.5194716314869224e-06, + "loss": 0.3227, + "step": 20956 + }, + { + "epoch": 0.61, + "grad_norm": 1.2783266226372398, + "learning_rate": 3.519022994321413e-06, + "loss": 0.2961, + "step": 20957 + }, + { + "epoch": 0.61, + "grad_norm": 1.4955311192951624, + "learning_rate": 3.5185743702255404e-06, + "loss": 0.3187, + "step": 20958 + }, + { + "epoch": 0.61, + "grad_norm": 1.3781301480172834, + "learning_rate": 3.5181257592032657e-06, + "loss": 0.2995, + "step": 20959 + }, + { + "epoch": 0.61, + "grad_norm": 1.2846184846921471, + "learning_rate": 3.517677161258548e-06, + "loss": 0.2954, + "step": 20960 + }, + { + "epoch": 0.61, + "grad_norm": 1.4186990658665597, + "learning_rate": 3.517228576395346e-06, + "loss": 0.3339, + "step": 20961 + }, + { + "epoch": 0.61, + "grad_norm": 1.3161390955974985, + "learning_rate": 3.516780004617618e-06, + "loss": 0.3003, + "step": 20962 + }, + { + "epoch": 0.61, + "grad_norm": 1.360196332123642, + "learning_rate": 3.5163314459293228e-06, + "loss": 0.2992, + "step": 20963 + }, + { + "epoch": 0.61, + "grad_norm": 1.4524089771585715, + "learning_rate": 3.5158829003344187e-06, + "loss": 0.2841, + "step": 20964 + }, + { + "epoch": 0.61, + "grad_norm": 1.415917520514018, + "learning_rate": 3.515434367836865e-06, + "loss": 0.2881, + "step": 20965 + }, + { + "epoch": 0.61, + "grad_norm": 1.2603260165629198, + "learning_rate": 3.5149858484406195e-06, + "loss": 0.3005, + "step": 20966 + }, + { + "epoch": 0.61, + "grad_norm": 1.565854483816967, + "learning_rate": 3.51453734214964e-06, + "loss": 0.2915, + "step": 20967 + }, + { + "epoch": 0.61, + "grad_norm": 0.8968645794491736, + "learning_rate": 3.514088848967885e-06, + "loss": 0.5754, + "step": 20968 + }, + { + "epoch": 0.61, + "grad_norm": 1.2498174627391037, + "learning_rate": 3.5136403688993127e-06, + "loss": 0.2878, + "step": 20969 + }, + { + "epoch": 0.61, + "grad_norm": 1.901166983701708, + "learning_rate": 3.513191901947881e-06, + "loss": 0.3172, + "step": 20970 + }, + { + "epoch": 0.61, + "grad_norm": 1.8785867174412483, + "learning_rate": 3.5127434481175465e-06, + "loss": 0.2981, + "step": 20971 + }, + { + "epoch": 0.61, + "grad_norm": 1.3304966560961167, + "learning_rate": 3.5122950074122675e-06, + "loss": 0.2856, + "step": 20972 + }, + { + "epoch": 0.61, + "grad_norm": 1.6231442609108373, + "learning_rate": 3.5118465798360013e-06, + "loss": 0.3139, + "step": 20973 + }, + { + "epoch": 0.61, + "grad_norm": 1.068283954775126, + "learning_rate": 3.511398165392708e-06, + "loss": 0.5653, + "step": 20974 + }, + { + "epoch": 0.61, + "grad_norm": 1.2266245550242376, + "learning_rate": 3.5109497640863404e-06, + "loss": 0.2729, + "step": 20975 + }, + { + "epoch": 0.61, + "grad_norm": 1.249674054817045, + "learning_rate": 3.5105013759208583e-06, + "loss": 0.2928, + "step": 20976 + }, + { + "epoch": 0.61, + "grad_norm": 1.2162712245356517, + "learning_rate": 3.5100530009002176e-06, + "loss": 0.2992, + "step": 20977 + }, + { + "epoch": 0.61, + "grad_norm": 2.1459116347541856, + "learning_rate": 3.5096046390283754e-06, + "loss": 0.2926, + "step": 20978 + }, + { + "epoch": 0.61, + "grad_norm": 1.6478542601295076, + "learning_rate": 3.5091562903092893e-06, + "loss": 0.3183, + "step": 20979 + }, + { + "epoch": 0.61, + "grad_norm": 1.2324576545410526, + "learning_rate": 3.5087079547469156e-06, + "loss": 0.2955, + "step": 20980 + }, + { + "epoch": 0.61, + "grad_norm": 1.3313652540991112, + "learning_rate": 3.50825963234521e-06, + "loss": 0.3037, + "step": 20981 + }, + { + "epoch": 0.61, + "grad_norm": 1.4723578412586515, + "learning_rate": 3.5078113231081303e-06, + "loss": 0.2724, + "step": 20982 + }, + { + "epoch": 0.61, + "grad_norm": 1.290765813055242, + "learning_rate": 3.507363027039631e-06, + "loss": 0.2981, + "step": 20983 + }, + { + "epoch": 0.61, + "grad_norm": 1.2160236736793624, + "learning_rate": 3.506914744143671e-06, + "loss": 0.2911, + "step": 20984 + }, + { + "epoch": 0.61, + "grad_norm": 1.330394044681248, + "learning_rate": 3.506466474424205e-06, + "loss": 0.2975, + "step": 20985 + }, + { + "epoch": 0.61, + "grad_norm": 1.266118120451719, + "learning_rate": 3.5060182178851874e-06, + "loss": 0.2806, + "step": 20986 + }, + { + "epoch": 0.61, + "grad_norm": 1.9834771353950524, + "learning_rate": 3.5055699745305773e-06, + "loss": 0.3219, + "step": 20987 + }, + { + "epoch": 0.61, + "grad_norm": 0.9873463123642254, + "learning_rate": 3.505121744364327e-06, + "loss": 0.5629, + "step": 20988 + }, + { + "epoch": 0.61, + "grad_norm": 1.5415452749737353, + "learning_rate": 3.504673527390396e-06, + "loss": 0.3162, + "step": 20989 + }, + { + "epoch": 0.61, + "grad_norm": 1.3459578911608483, + "learning_rate": 3.5042253236127368e-06, + "loss": 0.303, + "step": 20990 + }, + { + "epoch": 0.61, + "grad_norm": 1.3099835212773943, + "learning_rate": 3.503777133035305e-06, + "loss": 0.305, + "step": 20991 + }, + { + "epoch": 0.61, + "grad_norm": 1.4077726025339308, + "learning_rate": 3.5033289556620563e-06, + "loss": 0.2872, + "step": 20992 + }, + { + "epoch": 0.61, + "grad_norm": 1.5622897642201048, + "learning_rate": 3.502880791496948e-06, + "loss": 0.2831, + "step": 20993 + }, + { + "epoch": 0.61, + "grad_norm": 1.5500210387003086, + "learning_rate": 3.5024326405439323e-06, + "loss": 0.2621, + "step": 20994 + }, + { + "epoch": 0.61, + "grad_norm": 1.3469311622170195, + "learning_rate": 3.501984502806965e-06, + "loss": 0.2957, + "step": 20995 + }, + { + "epoch": 0.61, + "grad_norm": 1.1986708195470175, + "learning_rate": 3.5015363782900015e-06, + "loss": 0.2918, + "step": 20996 + }, + { + "epoch": 0.61, + "grad_norm": 1.7935591121897978, + "learning_rate": 3.5010882669969957e-06, + "loss": 0.3199, + "step": 20997 + }, + { + "epoch": 0.61, + "grad_norm": 1.4345648374895654, + "learning_rate": 3.500640168931903e-06, + "loss": 0.3077, + "step": 20998 + }, + { + "epoch": 0.61, + "grad_norm": 10.863238043479974, + "learning_rate": 3.500192084098677e-06, + "loss": 0.2815, + "step": 20999 + }, + { + "epoch": 0.61, + "grad_norm": 1.2589203355586533, + "learning_rate": 3.4997440125012725e-06, + "loss": 0.2914, + "step": 21000 + }, + { + "epoch": 0.61, + "grad_norm": 1.2679288946553942, + "learning_rate": 3.4992959541436433e-06, + "loss": 0.2877, + "step": 21001 + }, + { + "epoch": 0.61, + "grad_norm": 1.6657701072409974, + "learning_rate": 3.498847909029747e-06, + "loss": 0.303, + "step": 21002 + }, + { + "epoch": 0.61, + "grad_norm": 1.2582192055947623, + "learning_rate": 3.4983998771635325e-06, + "loss": 0.2972, + "step": 21003 + }, + { + "epoch": 0.61, + "grad_norm": 1.4210299255800605, + "learning_rate": 3.497951858548955e-06, + "loss": 0.3089, + "step": 21004 + }, + { + "epoch": 0.61, + "grad_norm": 1.7652191762610394, + "learning_rate": 3.4975038531899695e-06, + "loss": 0.3159, + "step": 21005 + }, + { + "epoch": 0.61, + "grad_norm": 1.2803122867040746, + "learning_rate": 3.4970558610905286e-06, + "loss": 0.2906, + "step": 21006 + }, + { + "epoch": 0.61, + "grad_norm": 1.4176818976858876, + "learning_rate": 3.496607882254587e-06, + "loss": 0.3036, + "step": 21007 + }, + { + "epoch": 0.61, + "grad_norm": 1.3086333972201483, + "learning_rate": 3.4961599166860975e-06, + "loss": 0.2805, + "step": 21008 + }, + { + "epoch": 0.61, + "grad_norm": 1.3250459802938537, + "learning_rate": 3.4957119643890135e-06, + "loss": 0.2983, + "step": 21009 + }, + { + "epoch": 0.61, + "grad_norm": 1.3277498899323246, + "learning_rate": 3.4952640253672875e-06, + "loss": 0.2896, + "step": 21010 + }, + { + "epoch": 0.61, + "grad_norm": 1.2318792934235414, + "learning_rate": 3.494816099624873e-06, + "loss": 0.3004, + "step": 21011 + }, + { + "epoch": 0.61, + "grad_norm": 2.1330724369712124, + "learning_rate": 3.4943681871657244e-06, + "loss": 0.2858, + "step": 21012 + }, + { + "epoch": 0.61, + "grad_norm": 1.3859080315582286, + "learning_rate": 3.493920287993793e-06, + "loss": 0.2897, + "step": 21013 + }, + { + "epoch": 0.61, + "grad_norm": 0.9506249051865251, + "learning_rate": 3.4934724021130308e-06, + "loss": 0.548, + "step": 21014 + }, + { + "epoch": 0.61, + "grad_norm": 1.3581353698624778, + "learning_rate": 3.4930245295273913e-06, + "loss": 0.2842, + "step": 21015 + }, + { + "epoch": 0.61, + "grad_norm": 1.6526273727062002, + "learning_rate": 3.492576670240828e-06, + "loss": 0.2838, + "step": 21016 + }, + { + "epoch": 0.61, + "grad_norm": 1.4090694533328059, + "learning_rate": 3.492128824257292e-06, + "loss": 0.2771, + "step": 21017 + }, + { + "epoch": 0.61, + "grad_norm": 1.3457689566220148, + "learning_rate": 3.4916809915807352e-06, + "loss": 0.2989, + "step": 21018 + }, + { + "epoch": 0.61, + "grad_norm": 1.3830785025625245, + "learning_rate": 3.491233172215111e-06, + "loss": 0.2924, + "step": 21019 + }, + { + "epoch": 0.61, + "grad_norm": 1.7854597992579908, + "learning_rate": 3.4907853661643716e-06, + "loss": 0.2953, + "step": 21020 + }, + { + "epoch": 0.61, + "grad_norm": 1.4593257225135825, + "learning_rate": 3.4903375734324667e-06, + "loss": 0.282, + "step": 21021 + }, + { + "epoch": 0.61, + "grad_norm": 1.4358837915793365, + "learning_rate": 3.48988979402335e-06, + "loss": 0.2939, + "step": 21022 + }, + { + "epoch": 0.61, + "grad_norm": 1.3147396074160436, + "learning_rate": 3.4894420279409724e-06, + "loss": 0.2746, + "step": 21023 + }, + { + "epoch": 0.61, + "grad_norm": 1.2598916967371903, + "learning_rate": 3.4889942751892852e-06, + "loss": 0.2968, + "step": 21024 + }, + { + "epoch": 0.61, + "grad_norm": 1.3824465372758838, + "learning_rate": 3.488546535772242e-06, + "loss": 0.2932, + "step": 21025 + }, + { + "epoch": 0.61, + "grad_norm": 1.2929209076860093, + "learning_rate": 3.4880988096937906e-06, + "loss": 0.2972, + "step": 21026 + }, + { + "epoch": 0.61, + "grad_norm": 2.9386232259069622, + "learning_rate": 3.4876510969578847e-06, + "loss": 0.2969, + "step": 21027 + }, + { + "epoch": 0.61, + "grad_norm": 1.3561421173705324, + "learning_rate": 3.4872033975684745e-06, + "loss": 0.2902, + "step": 21028 + }, + { + "epoch": 0.61, + "grad_norm": 1.4490663906208445, + "learning_rate": 3.486755711529512e-06, + "loss": 0.3031, + "step": 21029 + }, + { + "epoch": 0.61, + "grad_norm": 1.44699944001436, + "learning_rate": 3.4863080388449473e-06, + "loss": 0.2953, + "step": 21030 + }, + { + "epoch": 0.61, + "grad_norm": 1.2780606611131542, + "learning_rate": 3.4858603795187305e-06, + "loss": 0.2811, + "step": 21031 + }, + { + "epoch": 0.61, + "grad_norm": 1.37944869672506, + "learning_rate": 3.485412733554812e-06, + "loss": 0.3003, + "step": 21032 + }, + { + "epoch": 0.61, + "grad_norm": 1.720139378233942, + "learning_rate": 3.4849651009571427e-06, + "loss": 0.301, + "step": 21033 + }, + { + "epoch": 0.61, + "grad_norm": 1.4105526067420617, + "learning_rate": 3.4845174817296736e-06, + "loss": 0.3029, + "step": 21034 + }, + { + "epoch": 0.61, + "grad_norm": 1.3431123367990783, + "learning_rate": 3.4840698758763554e-06, + "loss": 0.2905, + "step": 21035 + }, + { + "epoch": 0.61, + "grad_norm": 1.4152170303501164, + "learning_rate": 3.483622283401136e-06, + "loss": 0.3158, + "step": 21036 + }, + { + "epoch": 0.61, + "grad_norm": 1.434782703958799, + "learning_rate": 3.4831747043079673e-06, + "loss": 0.3096, + "step": 21037 + }, + { + "epoch": 0.61, + "grad_norm": 1.3707181545590408, + "learning_rate": 3.4827271386007985e-06, + "loss": 0.3112, + "step": 21038 + }, + { + "epoch": 0.61, + "grad_norm": 1.2973255462763253, + "learning_rate": 3.4822795862835802e-06, + "loss": 0.2798, + "step": 21039 + }, + { + "epoch": 0.61, + "grad_norm": 1.7978507384491684, + "learning_rate": 3.4818320473602606e-06, + "loss": 0.2873, + "step": 21040 + }, + { + "epoch": 0.61, + "grad_norm": 1.247876944538614, + "learning_rate": 3.48138452183479e-06, + "loss": 0.2709, + "step": 21041 + }, + { + "epoch": 0.61, + "grad_norm": 1.7651577637820783, + "learning_rate": 3.480937009711118e-06, + "loss": 0.2989, + "step": 21042 + }, + { + "epoch": 0.61, + "grad_norm": 1.3421866927830266, + "learning_rate": 3.4804895109931934e-06, + "loss": 0.2817, + "step": 21043 + }, + { + "epoch": 0.61, + "grad_norm": 1.331856181474626, + "learning_rate": 3.480042025684967e-06, + "loss": 0.2978, + "step": 21044 + }, + { + "epoch": 0.61, + "grad_norm": 1.443925255299491, + "learning_rate": 3.4795945537903852e-06, + "loss": 0.2948, + "step": 21045 + }, + { + "epoch": 0.61, + "grad_norm": 1.4590208972268701, + "learning_rate": 3.479147095313399e-06, + "loss": 0.3013, + "step": 21046 + }, + { + "epoch": 0.61, + "grad_norm": 1.3460922015113885, + "learning_rate": 3.4786996502579563e-06, + "loss": 0.3021, + "step": 21047 + }, + { + "epoch": 0.61, + "grad_norm": 1.248840973067432, + "learning_rate": 3.4782522186280066e-06, + "loss": 0.2934, + "step": 21048 + }, + { + "epoch": 0.61, + "grad_norm": 1.428814271407462, + "learning_rate": 3.477804800427498e-06, + "loss": 0.3218, + "step": 21049 + }, + { + "epoch": 0.61, + "grad_norm": 1.360366067747312, + "learning_rate": 3.477357395660378e-06, + "loss": 0.2854, + "step": 21050 + }, + { + "epoch": 0.61, + "grad_norm": 1.4552617329510822, + "learning_rate": 3.476910004330596e-06, + "loss": 0.3079, + "step": 21051 + }, + { + "epoch": 0.61, + "grad_norm": 2.6650968704367473, + "learning_rate": 3.476462626442101e-06, + "loss": 0.2991, + "step": 21052 + }, + { + "epoch": 0.61, + "grad_norm": 1.3515553551694282, + "learning_rate": 3.4760152619988395e-06, + "loss": 0.2688, + "step": 21053 + }, + { + "epoch": 0.61, + "grad_norm": 1.2693964640558817, + "learning_rate": 3.4755679110047603e-06, + "loss": 0.2976, + "step": 21054 + }, + { + "epoch": 0.61, + "grad_norm": 1.4302854625271755, + "learning_rate": 3.475120573463811e-06, + "loss": 0.2785, + "step": 21055 + }, + { + "epoch": 0.61, + "grad_norm": 1.2915751286119728, + "learning_rate": 3.4746732493799407e-06, + "loss": 0.2914, + "step": 21056 + }, + { + "epoch": 0.61, + "grad_norm": 1.2652586046877847, + "learning_rate": 3.474225938757095e-06, + "loss": 0.2767, + "step": 21057 + }, + { + "epoch": 0.61, + "grad_norm": 1.549279836435879, + "learning_rate": 3.473778641599222e-06, + "loss": 0.275, + "step": 21058 + }, + { + "epoch": 0.61, + "grad_norm": 1.4117813550988874, + "learning_rate": 3.473331357910271e-06, + "loss": 0.3117, + "step": 21059 + }, + { + "epoch": 0.61, + "grad_norm": 1.3963597837963841, + "learning_rate": 3.4728840876941873e-06, + "loss": 0.2883, + "step": 21060 + }, + { + "epoch": 0.61, + "grad_norm": 1.3031471687462988, + "learning_rate": 3.4724368309549174e-06, + "loss": 0.2987, + "step": 21061 + }, + { + "epoch": 0.61, + "grad_norm": 1.4223879671406736, + "learning_rate": 3.471989587696409e-06, + "loss": 0.2967, + "step": 21062 + }, + { + "epoch": 0.61, + "grad_norm": 1.483231142076155, + "learning_rate": 3.471542357922611e-06, + "loss": 0.2991, + "step": 21063 + }, + { + "epoch": 0.61, + "grad_norm": 1.4791941588819713, + "learning_rate": 3.471095141637467e-06, + "loss": 0.2886, + "step": 21064 + }, + { + "epoch": 0.61, + "grad_norm": 1.8441755744018262, + "learning_rate": 3.4706479388449267e-06, + "loss": 0.2896, + "step": 21065 + }, + { + "epoch": 0.61, + "grad_norm": 1.2793424673255902, + "learning_rate": 3.4702007495489344e-06, + "loss": 0.2918, + "step": 21066 + }, + { + "epoch": 0.61, + "grad_norm": 1.485775263550296, + "learning_rate": 3.469753573753438e-06, + "loss": 0.2772, + "step": 21067 + }, + { + "epoch": 0.61, + "grad_norm": 1.3013284372755018, + "learning_rate": 3.469306411462383e-06, + "loss": 0.3227, + "step": 21068 + }, + { + "epoch": 0.61, + "grad_norm": 1.4157538490538915, + "learning_rate": 3.4688592626797157e-06, + "loss": 0.2886, + "step": 21069 + }, + { + "epoch": 0.61, + "grad_norm": 1.4964134352631158, + "learning_rate": 3.4684121274093818e-06, + "loss": 0.3206, + "step": 21070 + }, + { + "epoch": 0.61, + "grad_norm": 1.2947496322789738, + "learning_rate": 3.46796500565533e-06, + "loss": 0.3097, + "step": 21071 + }, + { + "epoch": 0.61, + "grad_norm": 1.4935137770472804, + "learning_rate": 3.4675178974215025e-06, + "loss": 0.3259, + "step": 21072 + }, + { + "epoch": 0.61, + "grad_norm": 1.6673141412667747, + "learning_rate": 3.467070802711846e-06, + "loss": 0.2854, + "step": 21073 + }, + { + "epoch": 0.61, + "grad_norm": 1.3504314056121667, + "learning_rate": 3.466623721530308e-06, + "loss": 0.2907, + "step": 21074 + }, + { + "epoch": 0.61, + "grad_norm": 1.3877677425168773, + "learning_rate": 3.466176653880833e-06, + "loss": 0.2876, + "step": 21075 + }, + { + "epoch": 0.61, + "grad_norm": 1.4064878954296676, + "learning_rate": 3.4657295997673644e-06, + "loss": 0.2926, + "step": 21076 + }, + { + "epoch": 0.61, + "grad_norm": 1.3940581889786994, + "learning_rate": 3.46528255919385e-06, + "loss": 0.3009, + "step": 21077 + }, + { + "epoch": 0.61, + "grad_norm": 1.3722318921067123, + "learning_rate": 3.4648355321642333e-06, + "loss": 0.2962, + "step": 21078 + }, + { + "epoch": 0.61, + "grad_norm": 1.278308431230146, + "learning_rate": 3.4643885186824607e-06, + "loss": 0.2681, + "step": 21079 + }, + { + "epoch": 0.61, + "grad_norm": 1.3543752536554061, + "learning_rate": 3.463941518752477e-06, + "loss": 0.3062, + "step": 21080 + }, + { + "epoch": 0.61, + "grad_norm": 1.4554958678806507, + "learning_rate": 3.4634945323782252e-06, + "loss": 0.3091, + "step": 21081 + }, + { + "epoch": 0.61, + "grad_norm": 1.5572617454753377, + "learning_rate": 3.463047559563652e-06, + "loss": 0.2887, + "step": 21082 + }, + { + "epoch": 0.61, + "grad_norm": 1.3184106497325825, + "learning_rate": 3.4626006003127006e-06, + "loss": 0.2793, + "step": 21083 + }, + { + "epoch": 0.61, + "grad_norm": 1.469549411538423, + "learning_rate": 3.462153654629317e-06, + "loss": 0.3145, + "step": 21084 + }, + { + "epoch": 0.61, + "grad_norm": 1.3875522221077268, + "learning_rate": 3.461706722517444e-06, + "loss": 0.3026, + "step": 21085 + }, + { + "epoch": 0.61, + "grad_norm": 2.809246823325071, + "learning_rate": 3.4612598039810253e-06, + "loss": 0.2884, + "step": 21086 + }, + { + "epoch": 0.61, + "grad_norm": 1.2636945120708785, + "learning_rate": 3.4608128990240086e-06, + "loss": 0.2903, + "step": 21087 + }, + { + "epoch": 0.61, + "grad_norm": 1.339811344687933, + "learning_rate": 3.4603660076503332e-06, + "loss": 0.2932, + "step": 21088 + }, + { + "epoch": 0.61, + "grad_norm": 1.480840462792757, + "learning_rate": 3.4599191298639443e-06, + "loss": 0.3288, + "step": 21089 + }, + { + "epoch": 0.61, + "grad_norm": 1.3384455880153063, + "learning_rate": 3.4594722656687873e-06, + "loss": 0.3097, + "step": 21090 + }, + { + "epoch": 0.61, + "grad_norm": 1.471381116502754, + "learning_rate": 3.459025415068804e-06, + "loss": 0.2953, + "step": 21091 + }, + { + "epoch": 0.61, + "grad_norm": 1.256684171523454, + "learning_rate": 3.458578578067938e-06, + "loss": 0.2879, + "step": 21092 + }, + { + "epoch": 0.61, + "grad_norm": 1.8249453575805024, + "learning_rate": 3.458131754670134e-06, + "loss": 0.296, + "step": 21093 + }, + { + "epoch": 0.61, + "grad_norm": 1.4326858442514734, + "learning_rate": 3.457684944879335e-06, + "loss": 0.2909, + "step": 21094 + }, + { + "epoch": 0.61, + "grad_norm": 1.414460840365971, + "learning_rate": 3.4572381486994824e-06, + "loss": 0.2894, + "step": 21095 + }, + { + "epoch": 0.61, + "grad_norm": 0.9634457346273115, + "learning_rate": 3.4567913661345197e-06, + "loss": 0.6113, + "step": 21096 + }, + { + "epoch": 0.61, + "grad_norm": 1.2160621302192047, + "learning_rate": 3.4563445971883913e-06, + "loss": 0.3054, + "step": 21097 + }, + { + "epoch": 0.61, + "grad_norm": 1.4513313555446872, + "learning_rate": 3.455897841865038e-06, + "loss": 0.3024, + "step": 21098 + }, + { + "epoch": 0.61, + "grad_norm": 1.861501712217864, + "learning_rate": 3.455451100168405e-06, + "loss": 0.3481, + "step": 21099 + }, + { + "epoch": 0.61, + "grad_norm": 1.2529214812965903, + "learning_rate": 3.455004372102432e-06, + "loss": 0.2739, + "step": 21100 + }, + { + "epoch": 0.61, + "grad_norm": 1.6262008413992972, + "learning_rate": 3.4545576576710627e-06, + "loss": 0.308, + "step": 21101 + }, + { + "epoch": 0.61, + "grad_norm": 1.495560579767692, + "learning_rate": 3.454110956878239e-06, + "loss": 0.2949, + "step": 21102 + }, + { + "epoch": 0.61, + "grad_norm": 2.3682083171223804, + "learning_rate": 3.4536642697279045e-06, + "loss": 0.3018, + "step": 21103 + }, + { + "epoch": 0.61, + "grad_norm": 1.261572283466952, + "learning_rate": 3.4532175962239984e-06, + "loss": 0.2747, + "step": 21104 + }, + { + "epoch": 0.61, + "grad_norm": 1.3047111082267981, + "learning_rate": 3.452770936370464e-06, + "loss": 0.2967, + "step": 21105 + }, + { + "epoch": 0.61, + "grad_norm": 1.257821667064471, + "learning_rate": 3.452324290171244e-06, + "loss": 0.2902, + "step": 21106 + }, + { + "epoch": 0.61, + "grad_norm": 1.3540508713976709, + "learning_rate": 3.4518776576302803e-06, + "loss": 0.2944, + "step": 21107 + }, + { + "epoch": 0.61, + "grad_norm": 1.2940605707654431, + "learning_rate": 3.4514310387515124e-06, + "loss": 0.2844, + "step": 21108 + }, + { + "epoch": 0.61, + "grad_norm": 1.6837399858618762, + "learning_rate": 3.4509844335388826e-06, + "loss": 0.2879, + "step": 21109 + }, + { + "epoch": 0.61, + "grad_norm": 1.4737135721203878, + "learning_rate": 3.450537841996332e-06, + "loss": 0.304, + "step": 21110 + }, + { + "epoch": 0.61, + "grad_norm": 1.3045206498226645, + "learning_rate": 3.450091264127804e-06, + "loss": 0.2825, + "step": 21111 + }, + { + "epoch": 0.61, + "grad_norm": 1.4910632689757695, + "learning_rate": 3.4496446999372365e-06, + "loss": 0.288, + "step": 21112 + }, + { + "epoch": 0.61, + "grad_norm": 1.3585108389816214, + "learning_rate": 3.449198149428571e-06, + "loss": 0.3149, + "step": 21113 + }, + { + "epoch": 0.61, + "grad_norm": 1.4567451154906976, + "learning_rate": 3.44875161260575e-06, + "loss": 0.2867, + "step": 21114 + }, + { + "epoch": 0.61, + "grad_norm": 1.3330739360672135, + "learning_rate": 3.4483050894727145e-06, + "loss": 0.2949, + "step": 21115 + }, + { + "epoch": 0.61, + "grad_norm": 1.3825623955166082, + "learning_rate": 3.447858580033402e-06, + "loss": 0.3066, + "step": 21116 + }, + { + "epoch": 0.61, + "grad_norm": 1.325550380312488, + "learning_rate": 3.447412084291756e-06, + "loss": 0.3024, + "step": 21117 + }, + { + "epoch": 0.61, + "grad_norm": 1.5062096192228362, + "learning_rate": 3.4469656022517146e-06, + "loss": 0.2906, + "step": 21118 + }, + { + "epoch": 0.61, + "grad_norm": 1.416832995083468, + "learning_rate": 3.4465191339172187e-06, + "loss": 0.3167, + "step": 21119 + }, + { + "epoch": 0.61, + "grad_norm": 1.336985132123525, + "learning_rate": 3.446072679292208e-06, + "loss": 0.2722, + "step": 21120 + }, + { + "epoch": 0.61, + "grad_norm": 1.4854835670109623, + "learning_rate": 3.445626238380625e-06, + "loss": 0.2843, + "step": 21121 + }, + { + "epoch": 0.61, + "grad_norm": 1.6199912932793683, + "learning_rate": 3.445179811186407e-06, + "loss": 0.3023, + "step": 21122 + }, + { + "epoch": 0.61, + "grad_norm": 1.5035651881295877, + "learning_rate": 3.4447333977134945e-06, + "loss": 0.2847, + "step": 21123 + }, + { + "epoch": 0.61, + "grad_norm": 1.380837938149878, + "learning_rate": 3.4442869979658267e-06, + "loss": 0.2882, + "step": 21124 + }, + { + "epoch": 0.61, + "grad_norm": 1.375124151001224, + "learning_rate": 3.443840611947343e-06, + "loss": 0.2895, + "step": 21125 + }, + { + "epoch": 0.61, + "grad_norm": 1.2094500195788096, + "learning_rate": 3.4433942396619845e-06, + "loss": 0.2857, + "step": 21126 + }, + { + "epoch": 0.61, + "grad_norm": 0.9712920669851339, + "learning_rate": 3.4429478811136875e-06, + "loss": 0.583, + "step": 21127 + }, + { + "epoch": 0.61, + "grad_norm": 1.3824456910039773, + "learning_rate": 3.4425015363063936e-06, + "loss": 0.2959, + "step": 21128 + }, + { + "epoch": 0.61, + "grad_norm": 1.2765466905541476, + "learning_rate": 3.4420552052440404e-06, + "loss": 0.2935, + "step": 21129 + }, + { + "epoch": 0.61, + "grad_norm": 1.6144864989109444, + "learning_rate": 3.4416088879305686e-06, + "loss": 0.2897, + "step": 21130 + }, + { + "epoch": 0.61, + "grad_norm": 1.455627759395065, + "learning_rate": 3.441162584369914e-06, + "loss": 0.3157, + "step": 21131 + }, + { + "epoch": 0.61, + "grad_norm": 1.3141299448277166, + "learning_rate": 3.440716294566018e-06, + "loss": 0.3012, + "step": 21132 + }, + { + "epoch": 0.61, + "grad_norm": 1.677678723371193, + "learning_rate": 3.4402700185228173e-06, + "loss": 0.2936, + "step": 21133 + }, + { + "epoch": 0.61, + "grad_norm": 1.5484900397149335, + "learning_rate": 3.4398237562442503e-06, + "loss": 0.3051, + "step": 21134 + }, + { + "epoch": 0.61, + "grad_norm": 1.1364688344373204, + "learning_rate": 3.4393775077342584e-06, + "loss": 0.274, + "step": 21135 + }, + { + "epoch": 0.61, + "grad_norm": 1.4534120575655136, + "learning_rate": 3.4389312729967754e-06, + "loss": 0.3013, + "step": 21136 + }, + { + "epoch": 0.61, + "grad_norm": 1.3814397592685277, + "learning_rate": 3.4384850520357416e-06, + "loss": 0.3237, + "step": 21137 + }, + { + "epoch": 0.61, + "grad_norm": 1.4262783241746542, + "learning_rate": 3.4380388448550944e-06, + "loss": 0.2751, + "step": 21138 + }, + { + "epoch": 0.61, + "grad_norm": 1.4847013940151452, + "learning_rate": 3.437592651458773e-06, + "loss": 0.3219, + "step": 21139 + }, + { + "epoch": 0.61, + "grad_norm": 1.2681900795409167, + "learning_rate": 3.437146471850712e-06, + "loss": 0.3006, + "step": 21140 + }, + { + "epoch": 0.61, + "grad_norm": 1.3458826695088628, + "learning_rate": 3.4367003060348513e-06, + "loss": 0.2835, + "step": 21141 + }, + { + "epoch": 0.61, + "grad_norm": 1.2178564545964348, + "learning_rate": 3.436254154015128e-06, + "loss": 0.2887, + "step": 21142 + }, + { + "epoch": 0.61, + "grad_norm": 1.3966828657574355, + "learning_rate": 3.435808015795481e-06, + "loss": 0.3433, + "step": 21143 + }, + { + "epoch": 0.61, + "grad_norm": 1.2492882265598055, + "learning_rate": 3.435361891379843e-06, + "loss": 0.2963, + "step": 21144 + }, + { + "epoch": 0.61, + "grad_norm": 1.2518084033065133, + "learning_rate": 3.4349157807721557e-06, + "loss": 0.2717, + "step": 21145 + }, + { + "epoch": 0.61, + "grad_norm": 1.2855489454071172, + "learning_rate": 3.434469683976352e-06, + "loss": 0.2888, + "step": 21146 + }, + { + "epoch": 0.61, + "grad_norm": 1.5081320093561708, + "learning_rate": 3.4340236009963713e-06, + "loss": 0.3084, + "step": 21147 + }, + { + "epoch": 0.61, + "grad_norm": 2.236404675331473, + "learning_rate": 3.43357753183615e-06, + "loss": 0.3016, + "step": 21148 + }, + { + "epoch": 0.61, + "grad_norm": 1.4716576015559693, + "learning_rate": 3.433131476499624e-06, + "loss": 0.3059, + "step": 21149 + }, + { + "epoch": 0.61, + "grad_norm": 1.3943276543989207, + "learning_rate": 3.43268543499073e-06, + "loss": 0.2929, + "step": 21150 + }, + { + "epoch": 0.61, + "grad_norm": 1.7036254848090011, + "learning_rate": 3.4322394073134046e-06, + "loss": 0.2965, + "step": 21151 + }, + { + "epoch": 0.61, + "grad_norm": 1.4535215573861457, + "learning_rate": 3.4317933934715834e-06, + "loss": 0.2735, + "step": 21152 + }, + { + "epoch": 0.61, + "grad_norm": 1.4871741560832439, + "learning_rate": 3.431347393469204e-06, + "loss": 0.3011, + "step": 21153 + }, + { + "epoch": 0.61, + "grad_norm": 1.464767104571651, + "learning_rate": 3.430901407310201e-06, + "loss": 0.3074, + "step": 21154 + }, + { + "epoch": 0.61, + "grad_norm": 1.3790655886230396, + "learning_rate": 3.43045543499851e-06, + "loss": 0.2892, + "step": 21155 + }, + { + "epoch": 0.61, + "grad_norm": 1.3113837258330465, + "learning_rate": 3.4300094765380665e-06, + "loss": 0.3069, + "step": 21156 + }, + { + "epoch": 0.61, + "grad_norm": 1.3216709171441072, + "learning_rate": 3.429563531932807e-06, + "loss": 0.2751, + "step": 21157 + }, + { + "epoch": 0.61, + "grad_norm": 1.6095553722631746, + "learning_rate": 3.4291176011866684e-06, + "loss": 0.3234, + "step": 21158 + }, + { + "epoch": 0.61, + "grad_norm": 1.4044800123043883, + "learning_rate": 3.4286716843035826e-06, + "loss": 0.2933, + "step": 21159 + }, + { + "epoch": 0.61, + "grad_norm": 1.356861064139496, + "learning_rate": 3.4282257812874875e-06, + "loss": 0.2766, + "step": 21160 + }, + { + "epoch": 0.61, + "grad_norm": 1.587868186635497, + "learning_rate": 3.4277798921423167e-06, + "loss": 0.3009, + "step": 21161 + }, + { + "epoch": 0.61, + "grad_norm": 2.50953285656046, + "learning_rate": 3.4273340168720072e-06, + "loss": 0.2899, + "step": 21162 + }, + { + "epoch": 0.61, + "grad_norm": 1.3013670458892983, + "learning_rate": 3.426888155480491e-06, + "loss": 0.27, + "step": 21163 + }, + { + "epoch": 0.61, + "grad_norm": 1.6353397320743845, + "learning_rate": 3.4264423079717045e-06, + "loss": 0.2902, + "step": 21164 + }, + { + "epoch": 0.61, + "grad_norm": 1.4383641588191725, + "learning_rate": 3.425996474349582e-06, + "loss": 0.2993, + "step": 21165 + }, + { + "epoch": 0.61, + "grad_norm": 1.2563097019732095, + "learning_rate": 3.4255506546180594e-06, + "loss": 0.2757, + "step": 21166 + }, + { + "epoch": 0.61, + "grad_norm": 1.3351820300329476, + "learning_rate": 3.425104848781069e-06, + "loss": 0.2957, + "step": 21167 + }, + { + "epoch": 0.61, + "grad_norm": 1.399575919965252, + "learning_rate": 3.4246590568425455e-06, + "loss": 0.3099, + "step": 21168 + }, + { + "epoch": 0.61, + "grad_norm": 1.2252279358807145, + "learning_rate": 3.4242132788064227e-06, + "loss": 0.2833, + "step": 21169 + }, + { + "epoch": 0.61, + "grad_norm": 1.6034381431601772, + "learning_rate": 3.423767514676636e-06, + "loss": 0.3061, + "step": 21170 + }, + { + "epoch": 0.61, + "grad_norm": 0.9203173213551993, + "learning_rate": 3.423321764457119e-06, + "loss": 0.5765, + "step": 21171 + }, + { + "epoch": 0.61, + "grad_norm": 1.2546127063042976, + "learning_rate": 3.422876028151806e-06, + "loss": 0.3046, + "step": 21172 + }, + { + "epoch": 0.61, + "grad_norm": 3.474811238345733, + "learning_rate": 3.422430305764627e-06, + "loss": 0.2886, + "step": 21173 + }, + { + "epoch": 0.61, + "grad_norm": 1.3972617769306124, + "learning_rate": 3.4219845972995193e-06, + "loss": 0.3167, + "step": 21174 + }, + { + "epoch": 0.61, + "grad_norm": 1.4807561761036159, + "learning_rate": 3.421538902760414e-06, + "loss": 0.3057, + "step": 21175 + }, + { + "epoch": 0.61, + "grad_norm": 0.8987050104617015, + "learning_rate": 3.4210932221512457e-06, + "loss": 0.5243, + "step": 21176 + }, + { + "epoch": 0.61, + "grad_norm": 1.2600912303042517, + "learning_rate": 3.4206475554759475e-06, + "loss": 0.3024, + "step": 21177 + }, + { + "epoch": 0.61, + "grad_norm": 1.531912858299377, + "learning_rate": 3.4202019027384524e-06, + "loss": 0.3113, + "step": 21178 + }, + { + "epoch": 0.61, + "grad_norm": 1.3699238645946141, + "learning_rate": 3.4197562639426927e-06, + "loss": 0.2812, + "step": 21179 + }, + { + "epoch": 0.61, + "grad_norm": 1.3363392395836975, + "learning_rate": 3.4193106390926012e-06, + "loss": 0.3094, + "step": 21180 + }, + { + "epoch": 0.61, + "grad_norm": 1.3539858331361612, + "learning_rate": 3.418865028192112e-06, + "loss": 0.288, + "step": 21181 + }, + { + "epoch": 0.61, + "grad_norm": 1.3471499871435797, + "learning_rate": 3.418419431245156e-06, + "loss": 0.2909, + "step": 21182 + }, + { + "epoch": 0.61, + "grad_norm": 1.2795707840685027, + "learning_rate": 3.4179738482556648e-06, + "loss": 0.2803, + "step": 21183 + }, + { + "epoch": 0.61, + "grad_norm": 1.2012213797648736, + "learning_rate": 3.417528279227573e-06, + "loss": 0.2732, + "step": 21184 + }, + { + "epoch": 0.61, + "grad_norm": 1.2942713672107893, + "learning_rate": 3.4170827241648117e-06, + "loss": 0.2803, + "step": 21185 + }, + { + "epoch": 0.61, + "grad_norm": 1.4708575829198454, + "learning_rate": 3.416637183071313e-06, + "loss": 0.3123, + "step": 21186 + }, + { + "epoch": 0.61, + "grad_norm": 1.5030929669279647, + "learning_rate": 3.4161916559510083e-06, + "loss": 0.2754, + "step": 21187 + }, + { + "epoch": 0.61, + "grad_norm": 1.2096355233823164, + "learning_rate": 3.4157461428078298e-06, + "loss": 0.2877, + "step": 21188 + }, + { + "epoch": 0.61, + "grad_norm": 1.2916167069462197, + "learning_rate": 3.41530064364571e-06, + "loss": 0.2753, + "step": 21189 + }, + { + "epoch": 0.61, + "grad_norm": 1.2696388833415901, + "learning_rate": 3.4148551584685785e-06, + "loss": 0.2934, + "step": 21190 + }, + { + "epoch": 0.61, + "grad_norm": 1.505981382621373, + "learning_rate": 3.4144096872803683e-06, + "loss": 0.2924, + "step": 21191 + }, + { + "epoch": 0.61, + "grad_norm": 1.2589864520761427, + "learning_rate": 3.4139642300850102e-06, + "loss": 0.311, + "step": 21192 + }, + { + "epoch": 0.61, + "grad_norm": 1.7214724203452236, + "learning_rate": 3.413518786886435e-06, + "loss": 0.3261, + "step": 21193 + }, + { + "epoch": 0.61, + "grad_norm": 1.2207612434359043, + "learning_rate": 3.4130733576885753e-06, + "loss": 0.2848, + "step": 21194 + }, + { + "epoch": 0.61, + "grad_norm": 1.6253324120439487, + "learning_rate": 3.4126279424953594e-06, + "loss": 0.3001, + "step": 21195 + }, + { + "epoch": 0.61, + "grad_norm": 1.346775144525118, + "learning_rate": 3.4121825413107203e-06, + "loss": 0.2794, + "step": 21196 + }, + { + "epoch": 0.61, + "grad_norm": 1.2499390523835892, + "learning_rate": 3.411737154138587e-06, + "loss": 0.2846, + "step": 21197 + }, + { + "epoch": 0.61, + "grad_norm": 1.3388088913384266, + "learning_rate": 3.411291780982893e-06, + "loss": 0.294, + "step": 21198 + }, + { + "epoch": 0.61, + "grad_norm": 1.6569523054220292, + "learning_rate": 3.4108464218475655e-06, + "loss": 0.2818, + "step": 21199 + }, + { + "epoch": 0.61, + "grad_norm": 1.4344239644818613, + "learning_rate": 3.410401076736537e-06, + "loss": 0.3195, + "step": 21200 + }, + { + "epoch": 0.61, + "grad_norm": 1.402029924495842, + "learning_rate": 3.4099557456537348e-06, + "loss": 0.2892, + "step": 21201 + }, + { + "epoch": 0.61, + "grad_norm": 1.4387077608570984, + "learning_rate": 3.409510428603091e-06, + "loss": 0.2962, + "step": 21202 + }, + { + "epoch": 0.61, + "grad_norm": 1.3272175074777477, + "learning_rate": 3.4090651255885356e-06, + "loss": 0.2713, + "step": 21203 + }, + { + "epoch": 0.62, + "grad_norm": 1.9770972636982431, + "learning_rate": 3.4086198366139988e-06, + "loss": 0.3157, + "step": 21204 + }, + { + "epoch": 0.62, + "grad_norm": 1.2290072710229025, + "learning_rate": 3.408174561683409e-06, + "loss": 0.282, + "step": 21205 + }, + { + "epoch": 0.62, + "grad_norm": 1.3592761787239471, + "learning_rate": 3.407729300800696e-06, + "loss": 0.2889, + "step": 21206 + }, + { + "epoch": 0.62, + "grad_norm": 1.3015265312869098, + "learning_rate": 3.4072840539697892e-06, + "loss": 0.2825, + "step": 21207 + }, + { + "epoch": 0.62, + "grad_norm": 1.227286600774215, + "learning_rate": 3.40683882119462e-06, + "loss": 0.297, + "step": 21208 + }, + { + "epoch": 0.62, + "grad_norm": 1.5256339815904143, + "learning_rate": 3.406393602479115e-06, + "loss": 0.3282, + "step": 21209 + }, + { + "epoch": 0.62, + "grad_norm": 1.750726124415034, + "learning_rate": 3.4059483978272035e-06, + "loss": 0.2955, + "step": 21210 + }, + { + "epoch": 0.62, + "grad_norm": 1.2860388357706556, + "learning_rate": 3.4055032072428157e-06, + "loss": 0.2845, + "step": 21211 + }, + { + "epoch": 0.62, + "grad_norm": 1.414113277004915, + "learning_rate": 3.405058030729879e-06, + "loss": 0.299, + "step": 21212 + }, + { + "epoch": 0.62, + "grad_norm": 9.62751942269344, + "learning_rate": 3.4046128682923247e-06, + "loss": 0.2994, + "step": 21213 + }, + { + "epoch": 0.62, + "grad_norm": 1.387971225739043, + "learning_rate": 3.4041677199340786e-06, + "loss": 0.3199, + "step": 21214 + }, + { + "epoch": 0.62, + "grad_norm": 1.1694168244826035, + "learning_rate": 3.4037225856590693e-06, + "loss": 0.2726, + "step": 21215 + }, + { + "epoch": 0.62, + "grad_norm": 1.5620629740163474, + "learning_rate": 3.4032774654712264e-06, + "loss": 0.2816, + "step": 21216 + }, + { + "epoch": 0.62, + "grad_norm": 1.3206201489161564, + "learning_rate": 3.4028323593744784e-06, + "loss": 0.2842, + "step": 21217 + }, + { + "epoch": 0.62, + "grad_norm": 1.2535336224728897, + "learning_rate": 3.402387267372751e-06, + "loss": 0.2962, + "step": 21218 + }, + { + "epoch": 0.62, + "grad_norm": 1.2031640823677667, + "learning_rate": 3.4019421894699746e-06, + "loss": 0.278, + "step": 21219 + }, + { + "epoch": 0.62, + "grad_norm": 1.2274488245676534, + "learning_rate": 3.401497125670076e-06, + "loss": 0.2861, + "step": 21220 + }, + { + "epoch": 0.62, + "grad_norm": 1.4575465186958583, + "learning_rate": 3.4010520759769833e-06, + "loss": 0.3029, + "step": 21221 + }, + { + "epoch": 0.62, + "grad_norm": 1.453508593188047, + "learning_rate": 3.4006070403946236e-06, + "loss": 0.2832, + "step": 21222 + }, + { + "epoch": 0.62, + "grad_norm": 1.1933931998035832, + "learning_rate": 3.4001620189269235e-06, + "loss": 0.2896, + "step": 21223 + }, + { + "epoch": 0.62, + "grad_norm": 1.3505911058543874, + "learning_rate": 3.3997170115778122e-06, + "loss": 0.2971, + "step": 21224 + }, + { + "epoch": 0.62, + "grad_norm": 1.299402882831846, + "learning_rate": 3.399272018351217e-06, + "loss": 0.3267, + "step": 21225 + }, + { + "epoch": 0.62, + "grad_norm": 1.3038647394225287, + "learning_rate": 3.3988270392510626e-06, + "loss": 0.2913, + "step": 21226 + }, + { + "epoch": 0.62, + "grad_norm": 1.4505791985120549, + "learning_rate": 3.3983820742812797e-06, + "loss": 0.2982, + "step": 21227 + }, + { + "epoch": 0.62, + "grad_norm": 1.439490225048875, + "learning_rate": 3.3979371234457903e-06, + "loss": 0.3046, + "step": 21228 + }, + { + "epoch": 0.62, + "grad_norm": 1.9870775603591149, + "learning_rate": 3.3974921867485238e-06, + "loss": 0.2864, + "step": 21229 + }, + { + "epoch": 0.62, + "grad_norm": 1.3943142481676118, + "learning_rate": 3.3970472641934066e-06, + "loss": 0.2771, + "step": 21230 + }, + { + "epoch": 0.62, + "grad_norm": 2.2770745277457105, + "learning_rate": 3.396602355784365e-06, + "loss": 0.3105, + "step": 21231 + }, + { + "epoch": 0.62, + "grad_norm": 1.5096258866524954, + "learning_rate": 3.396157461525327e-06, + "loss": 0.3035, + "step": 21232 + }, + { + "epoch": 0.62, + "grad_norm": 1.4209585110207312, + "learning_rate": 3.3957125814202153e-06, + "loss": 0.3108, + "step": 21233 + }, + { + "epoch": 0.62, + "grad_norm": 0.9516358203727455, + "learning_rate": 3.395267715472958e-06, + "loss": 0.5283, + "step": 21234 + }, + { + "epoch": 0.62, + "grad_norm": 1.6141844042775237, + "learning_rate": 3.3948228636874813e-06, + "loss": 0.2953, + "step": 21235 + }, + { + "epoch": 0.62, + "grad_norm": 1.4056876206944473, + "learning_rate": 3.394378026067711e-06, + "loss": 0.2855, + "step": 21236 + }, + { + "epoch": 0.62, + "grad_norm": 2.477354970454511, + "learning_rate": 3.393933202617572e-06, + "loss": 0.35, + "step": 21237 + }, + { + "epoch": 0.62, + "grad_norm": 2.772311685076957, + "learning_rate": 3.39348839334099e-06, + "loss": 0.305, + "step": 21238 + }, + { + "epoch": 0.62, + "grad_norm": 1.33381859982422, + "learning_rate": 3.393043598241891e-06, + "loss": 0.3092, + "step": 21239 + }, + { + "epoch": 0.62, + "grad_norm": 1.5185188891507282, + "learning_rate": 3.3925988173242008e-06, + "loss": 0.3193, + "step": 21240 + }, + { + "epoch": 0.62, + "grad_norm": 1.2470221556327286, + "learning_rate": 3.3921540505918434e-06, + "loss": 0.2988, + "step": 21241 + }, + { + "epoch": 0.62, + "grad_norm": 1.530497228542949, + "learning_rate": 3.3917092980487443e-06, + "loss": 0.3196, + "step": 21242 + }, + { + "epoch": 0.62, + "grad_norm": 1.7934851883197562, + "learning_rate": 3.391264559698828e-06, + "loss": 0.2934, + "step": 21243 + }, + { + "epoch": 0.62, + "grad_norm": 1.30844685461334, + "learning_rate": 3.390819835546021e-06, + "loss": 0.2943, + "step": 21244 + }, + { + "epoch": 0.62, + "grad_norm": 1.2108263823091998, + "learning_rate": 3.3903751255942458e-06, + "loss": 0.299, + "step": 21245 + }, + { + "epoch": 0.62, + "grad_norm": 1.3173888874573705, + "learning_rate": 3.3899304298474278e-06, + "loss": 0.3038, + "step": 21246 + }, + { + "epoch": 0.62, + "grad_norm": 1.3990329306033766, + "learning_rate": 3.3894857483094917e-06, + "loss": 0.2959, + "step": 21247 + }, + { + "epoch": 0.62, + "grad_norm": 1.2822965586657677, + "learning_rate": 3.389041080984362e-06, + "loss": 0.2915, + "step": 21248 + }, + { + "epoch": 0.62, + "grad_norm": 1.3532602966725098, + "learning_rate": 3.3885964278759633e-06, + "loss": 0.2979, + "step": 21249 + }, + { + "epoch": 0.62, + "grad_norm": 1.4197296798166061, + "learning_rate": 3.388151788988218e-06, + "loss": 0.2962, + "step": 21250 + }, + { + "epoch": 0.62, + "grad_norm": 1.532175985802362, + "learning_rate": 3.3877071643250514e-06, + "loss": 0.3046, + "step": 21251 + }, + { + "epoch": 0.62, + "grad_norm": 1.3397112825122537, + "learning_rate": 3.387262553890387e-06, + "loss": 0.2836, + "step": 21252 + }, + { + "epoch": 0.62, + "grad_norm": 1.3123918050104224, + "learning_rate": 3.3868179576881488e-06, + "loss": 0.3091, + "step": 21253 + }, + { + "epoch": 0.62, + "grad_norm": 1.3012309429850126, + "learning_rate": 3.3863733757222595e-06, + "loss": 0.2866, + "step": 21254 + }, + { + "epoch": 0.62, + "grad_norm": 1.2274403539852428, + "learning_rate": 3.3859288079966453e-06, + "loss": 0.3088, + "step": 21255 + }, + { + "epoch": 0.62, + "grad_norm": 1.432750349330277, + "learning_rate": 3.3854842545152246e-06, + "loss": 0.2918, + "step": 21256 + }, + { + "epoch": 0.62, + "grad_norm": 6.292856213445648, + "learning_rate": 3.385039715281924e-06, + "loss": 0.2959, + "step": 21257 + }, + { + "epoch": 0.62, + "grad_norm": 1.3951945389287062, + "learning_rate": 3.3845951903006652e-06, + "loss": 0.3035, + "step": 21258 + }, + { + "epoch": 0.62, + "grad_norm": 1.2235637207572172, + "learning_rate": 3.384150679575373e-06, + "loss": 0.2944, + "step": 21259 + }, + { + "epoch": 0.62, + "grad_norm": 1.4383332160041427, + "learning_rate": 3.383706183109968e-06, + "loss": 0.277, + "step": 21260 + }, + { + "epoch": 0.62, + "grad_norm": 3.123345796967046, + "learning_rate": 3.383261700908374e-06, + "loss": 0.2953, + "step": 21261 + }, + { + "epoch": 0.62, + "grad_norm": 1.2352751179070531, + "learning_rate": 3.3828172329745134e-06, + "loss": 0.2939, + "step": 21262 + }, + { + "epoch": 0.62, + "grad_norm": 1.6811429134782756, + "learning_rate": 3.3823727793123096e-06, + "loss": 0.2925, + "step": 21263 + }, + { + "epoch": 0.62, + "grad_norm": 1.394123893902247, + "learning_rate": 3.381928339925683e-06, + "loss": 0.296, + "step": 21264 + }, + { + "epoch": 0.62, + "grad_norm": 1.434441739806657, + "learning_rate": 3.381483914818556e-06, + "loss": 0.3172, + "step": 21265 + }, + { + "epoch": 0.62, + "grad_norm": 1.2686065618036666, + "learning_rate": 3.381039503994853e-06, + "loss": 0.2735, + "step": 21266 + }, + { + "epoch": 0.62, + "grad_norm": 1.2395781378837296, + "learning_rate": 3.380595107458493e-06, + "loss": 0.2939, + "step": 21267 + }, + { + "epoch": 0.62, + "grad_norm": 1.2843067309115823, + "learning_rate": 3.3801507252134003e-06, + "loss": 0.2976, + "step": 21268 + }, + { + "epoch": 0.62, + "grad_norm": 1.2966056776405024, + "learning_rate": 3.3797063572634947e-06, + "loss": 0.3018, + "step": 21269 + }, + { + "epoch": 0.62, + "grad_norm": 1.3904102252398274, + "learning_rate": 3.3792620036126986e-06, + "loss": 0.2974, + "step": 21270 + }, + { + "epoch": 0.62, + "grad_norm": 1.6562931147413293, + "learning_rate": 3.3788176642649327e-06, + "loss": 0.352, + "step": 21271 + }, + { + "epoch": 0.62, + "grad_norm": 1.6015993656967022, + "learning_rate": 3.3783733392241208e-06, + "loss": 0.2973, + "step": 21272 + }, + { + "epoch": 0.62, + "grad_norm": 1.3037112555099717, + "learning_rate": 3.3779290284941808e-06, + "loss": 0.2994, + "step": 21273 + }, + { + "epoch": 0.62, + "grad_norm": 1.262153904429346, + "learning_rate": 3.377484732079035e-06, + "loss": 0.2883, + "step": 21274 + }, + { + "epoch": 0.62, + "grad_norm": 1.35158758236538, + "learning_rate": 3.377040449982604e-06, + "loss": 0.3097, + "step": 21275 + }, + { + "epoch": 0.62, + "grad_norm": 1.3716904392370282, + "learning_rate": 3.3765961822088113e-06, + "loss": 0.3063, + "step": 21276 + }, + { + "epoch": 0.62, + "grad_norm": 1.4092988988293507, + "learning_rate": 3.376151928761573e-06, + "loss": 0.2847, + "step": 21277 + }, + { + "epoch": 0.62, + "grad_norm": 1.3728927706554028, + "learning_rate": 3.3757076896448127e-06, + "loss": 0.2879, + "step": 21278 + }, + { + "epoch": 0.62, + "grad_norm": 1.4368250432678487, + "learning_rate": 3.3752634648624495e-06, + "loss": 0.3096, + "step": 21279 + }, + { + "epoch": 0.62, + "grad_norm": 1.5365427059553252, + "learning_rate": 3.3748192544184055e-06, + "loss": 0.2761, + "step": 21280 + }, + { + "epoch": 0.62, + "grad_norm": 1.4796871655777226, + "learning_rate": 3.374375058316599e-06, + "loss": 0.283, + "step": 21281 + }, + { + "epoch": 0.62, + "grad_norm": 1.2646711051015072, + "learning_rate": 3.3739308765609503e-06, + "loss": 0.2844, + "step": 21282 + }, + { + "epoch": 0.62, + "grad_norm": 1.4036197875138907, + "learning_rate": 3.3734867091553814e-06, + "loss": 0.2989, + "step": 21283 + }, + { + "epoch": 0.62, + "grad_norm": 1.3660300568966663, + "learning_rate": 3.373042556103809e-06, + "loss": 0.2805, + "step": 21284 + }, + { + "epoch": 0.62, + "grad_norm": 1.5571643748695627, + "learning_rate": 3.3725984174101535e-06, + "loss": 0.297, + "step": 21285 + }, + { + "epoch": 0.62, + "grad_norm": 1.756250977788716, + "learning_rate": 3.3721542930783363e-06, + "loss": 0.297, + "step": 21286 + }, + { + "epoch": 0.62, + "grad_norm": 1.1874063998942443, + "learning_rate": 3.3717101831122747e-06, + "loss": 0.2789, + "step": 21287 + }, + { + "epoch": 0.62, + "grad_norm": 1.5685149308633417, + "learning_rate": 3.371266087515888e-06, + "loss": 0.2816, + "step": 21288 + }, + { + "epoch": 0.62, + "grad_norm": 1.2608416004575527, + "learning_rate": 3.3708220062930967e-06, + "loss": 0.2898, + "step": 21289 + }, + { + "epoch": 0.62, + "grad_norm": 1.3078619947745866, + "learning_rate": 3.370377939447819e-06, + "loss": 0.3093, + "step": 21290 + }, + { + "epoch": 0.62, + "grad_norm": 1.356637289354311, + "learning_rate": 3.3699338869839757e-06, + "loss": 0.2892, + "step": 21291 + }, + { + "epoch": 0.62, + "grad_norm": 1.2566223093524551, + "learning_rate": 3.369489848905483e-06, + "loss": 0.2836, + "step": 21292 + }, + { + "epoch": 0.62, + "grad_norm": 1.575004921211805, + "learning_rate": 3.3690458252162593e-06, + "loss": 0.2879, + "step": 21293 + }, + { + "epoch": 0.62, + "grad_norm": 1.360902466574982, + "learning_rate": 3.368601815920225e-06, + "loss": 0.2757, + "step": 21294 + }, + { + "epoch": 0.62, + "grad_norm": 1.4735734122365587, + "learning_rate": 3.368157821021299e-06, + "loss": 0.2768, + "step": 21295 + }, + { + "epoch": 0.62, + "grad_norm": 2.468558982187593, + "learning_rate": 3.3677138405233967e-06, + "loss": 0.2999, + "step": 21296 + }, + { + "epoch": 0.62, + "grad_norm": 1.2596698528165267, + "learning_rate": 3.367269874430439e-06, + "loss": 0.305, + "step": 21297 + }, + { + "epoch": 0.62, + "grad_norm": 1.222984490569509, + "learning_rate": 3.366825922746342e-06, + "loss": 0.26, + "step": 21298 + }, + { + "epoch": 0.62, + "grad_norm": 1.3741800091549632, + "learning_rate": 3.3663819854750244e-06, + "loss": 0.2987, + "step": 21299 + }, + { + "epoch": 0.62, + "grad_norm": 1.219736150766426, + "learning_rate": 3.3659380626204042e-06, + "loss": 0.2877, + "step": 21300 + }, + { + "epoch": 0.62, + "grad_norm": 1.449901840873437, + "learning_rate": 3.3654941541863985e-06, + "loss": 0.2799, + "step": 21301 + }, + { + "epoch": 0.62, + "grad_norm": 1.4914621685482463, + "learning_rate": 3.3650502601769255e-06, + "loss": 0.3059, + "step": 21302 + }, + { + "epoch": 0.62, + "grad_norm": 0.9504891908829209, + "learning_rate": 3.3646063805959006e-06, + "loss": 0.6105, + "step": 21303 + }, + { + "epoch": 0.62, + "grad_norm": 1.3871359212703749, + "learning_rate": 3.3641625154472444e-06, + "loss": 0.2883, + "step": 21304 + }, + { + "epoch": 0.62, + "grad_norm": 1.950758661177621, + "learning_rate": 3.363718664734872e-06, + "loss": 0.3159, + "step": 21305 + }, + { + "epoch": 0.62, + "grad_norm": 1.471421034952798, + "learning_rate": 3.363274828462699e-06, + "loss": 0.2734, + "step": 21306 + }, + { + "epoch": 0.62, + "grad_norm": 1.4708758414435756, + "learning_rate": 3.3628310066346448e-06, + "loss": 0.2727, + "step": 21307 + }, + { + "epoch": 0.62, + "grad_norm": 1.372809209138818, + "learning_rate": 3.3623871992546258e-06, + "loss": 0.2824, + "step": 21308 + }, + { + "epoch": 0.62, + "grad_norm": 1.6493841362147315, + "learning_rate": 3.361943406326557e-06, + "loss": 0.2986, + "step": 21309 + }, + { + "epoch": 0.62, + "grad_norm": 1.299132855224844, + "learning_rate": 3.361499627854356e-06, + "loss": 0.2861, + "step": 21310 + }, + { + "epoch": 0.62, + "grad_norm": 1.2819012959772804, + "learning_rate": 3.361055863841941e-06, + "loss": 0.2889, + "step": 21311 + }, + { + "epoch": 0.62, + "grad_norm": 1.4210985127860096, + "learning_rate": 3.3606121142932235e-06, + "loss": 0.2815, + "step": 21312 + }, + { + "epoch": 0.62, + "grad_norm": 1.229889378371792, + "learning_rate": 3.360168379212123e-06, + "loss": 0.3012, + "step": 21313 + }, + { + "epoch": 0.62, + "grad_norm": 1.239359719156742, + "learning_rate": 3.3597246586025556e-06, + "loss": 0.308, + "step": 21314 + }, + { + "epoch": 0.62, + "grad_norm": 2.524962373706696, + "learning_rate": 3.359280952468435e-06, + "loss": 0.2833, + "step": 21315 + }, + { + "epoch": 0.62, + "grad_norm": 1.211182188220168, + "learning_rate": 3.358837260813678e-06, + "loss": 0.2831, + "step": 21316 + }, + { + "epoch": 0.62, + "grad_norm": 1.416323629051261, + "learning_rate": 3.358393583642201e-06, + "loss": 0.2824, + "step": 21317 + }, + { + "epoch": 0.62, + "grad_norm": 1.5442156675645675, + "learning_rate": 3.357949920957919e-06, + "loss": 0.3164, + "step": 21318 + }, + { + "epoch": 0.62, + "grad_norm": 1.350447427998774, + "learning_rate": 3.3575062727647474e-06, + "loss": 0.2997, + "step": 21319 + }, + { + "epoch": 0.62, + "grad_norm": 1.4687229576319238, + "learning_rate": 3.3570626390666006e-06, + "loss": 0.3132, + "step": 21320 + }, + { + "epoch": 0.62, + "grad_norm": 1.3226700242919247, + "learning_rate": 3.356619019867394e-06, + "loss": 0.29, + "step": 21321 + }, + { + "epoch": 0.62, + "grad_norm": 1.6260907262426025, + "learning_rate": 3.3561754151710435e-06, + "loss": 0.3347, + "step": 21322 + }, + { + "epoch": 0.62, + "grad_norm": 1.6283851009910741, + "learning_rate": 3.355731824981463e-06, + "loss": 0.3025, + "step": 21323 + }, + { + "epoch": 0.62, + "grad_norm": 0.9476092879806196, + "learning_rate": 3.355288249302567e-06, + "loss": 0.5865, + "step": 21324 + }, + { + "epoch": 0.62, + "grad_norm": 1.2546606646199292, + "learning_rate": 3.35484468813827e-06, + "loss": 0.2892, + "step": 21325 + }, + { + "epoch": 0.62, + "grad_norm": 1.4987734821548329, + "learning_rate": 3.3544011414924876e-06, + "loss": 0.2875, + "step": 21326 + }, + { + "epoch": 0.62, + "grad_norm": 1.2944937551983884, + "learning_rate": 3.353957609369134e-06, + "loss": 0.2771, + "step": 21327 + }, + { + "epoch": 0.62, + "grad_norm": 1.343544642113515, + "learning_rate": 3.353514091772122e-06, + "loss": 0.2996, + "step": 21328 + }, + { + "epoch": 0.62, + "grad_norm": 1.601483426889892, + "learning_rate": 3.353070588705366e-06, + "loss": 0.2823, + "step": 21329 + }, + { + "epoch": 0.62, + "grad_norm": 1.384298887330589, + "learning_rate": 3.3526271001727807e-06, + "loss": 0.3011, + "step": 21330 + }, + { + "epoch": 0.62, + "grad_norm": 1.3301932436383317, + "learning_rate": 3.3521836261782805e-06, + "loss": 0.2952, + "step": 21331 + }, + { + "epoch": 0.62, + "grad_norm": 1.7005632603524294, + "learning_rate": 3.3517401667257766e-06, + "loss": 0.2913, + "step": 21332 + }, + { + "epoch": 0.62, + "grad_norm": 1.3853463432864095, + "learning_rate": 3.351296721819185e-06, + "loss": 0.2846, + "step": 21333 + }, + { + "epoch": 0.62, + "grad_norm": 1.291772347050163, + "learning_rate": 3.350853291462417e-06, + "loss": 0.3034, + "step": 21334 + }, + { + "epoch": 0.62, + "grad_norm": 1.24677467088005, + "learning_rate": 3.3504098756593885e-06, + "loss": 0.2955, + "step": 21335 + }, + { + "epoch": 0.62, + "grad_norm": 1.3422754100056686, + "learning_rate": 3.3499664744140105e-06, + "loss": 0.3061, + "step": 21336 + }, + { + "epoch": 0.62, + "grad_norm": 1.3063604460425888, + "learning_rate": 3.3495230877301965e-06, + "loss": 0.3081, + "step": 21337 + }, + { + "epoch": 0.62, + "grad_norm": 1.406079057727035, + "learning_rate": 3.3490797156118594e-06, + "loss": 0.3035, + "step": 21338 + }, + { + "epoch": 0.62, + "grad_norm": 1.4685538799385647, + "learning_rate": 3.3486363580629116e-06, + "loss": 0.2911, + "step": 21339 + }, + { + "epoch": 0.62, + "grad_norm": 1.3162242024176871, + "learning_rate": 3.3481930150872687e-06, + "loss": 0.3061, + "step": 21340 + }, + { + "epoch": 0.62, + "grad_norm": 1.455934711260127, + "learning_rate": 3.3477496866888404e-06, + "loss": 0.2719, + "step": 21341 + }, + { + "epoch": 0.62, + "grad_norm": 0.9990781630215736, + "learning_rate": 3.3473063728715376e-06, + "loss": 0.6624, + "step": 21342 + }, + { + "epoch": 0.62, + "grad_norm": 1.4679842163941328, + "learning_rate": 3.3468630736392747e-06, + "loss": 0.3201, + "step": 21343 + }, + { + "epoch": 0.62, + "grad_norm": 1.5334468672997592, + "learning_rate": 3.346419788995964e-06, + "loss": 0.2848, + "step": 21344 + }, + { + "epoch": 0.62, + "grad_norm": 1.3915509882280581, + "learning_rate": 3.345976518945516e-06, + "loss": 0.3065, + "step": 21345 + }, + { + "epoch": 0.62, + "grad_norm": 1.3795778270828918, + "learning_rate": 3.3455332634918446e-06, + "loss": 0.2819, + "step": 21346 + }, + { + "epoch": 0.62, + "grad_norm": 1.3121262326709144, + "learning_rate": 3.3450900226388606e-06, + "loss": 0.2883, + "step": 21347 + }, + { + "epoch": 0.62, + "grad_norm": 1.414968115793995, + "learning_rate": 3.344646796390475e-06, + "loss": 0.3267, + "step": 21348 + }, + { + "epoch": 0.62, + "grad_norm": 1.6366909802975345, + "learning_rate": 3.3442035847505995e-06, + "loss": 0.3037, + "step": 21349 + }, + { + "epoch": 0.62, + "grad_norm": 0.9349372374875748, + "learning_rate": 3.343760387723147e-06, + "loss": 0.6332, + "step": 21350 + }, + { + "epoch": 0.62, + "grad_norm": 1.2688462383178798, + "learning_rate": 3.3433172053120256e-06, + "loss": 0.3014, + "step": 21351 + }, + { + "epoch": 0.62, + "grad_norm": 1.3018306380187188, + "learning_rate": 3.3428740375211493e-06, + "loss": 0.2792, + "step": 21352 + }, + { + "epoch": 0.62, + "grad_norm": 1.2708758102435964, + "learning_rate": 3.342430884354427e-06, + "loss": 0.2883, + "step": 21353 + }, + { + "epoch": 0.62, + "grad_norm": 1.2749093926754351, + "learning_rate": 3.341987745815773e-06, + "loss": 0.3094, + "step": 21354 + }, + { + "epoch": 0.62, + "grad_norm": 1.3164563123704054, + "learning_rate": 3.3415446219090934e-06, + "loss": 0.2931, + "step": 21355 + }, + { + "epoch": 0.62, + "grad_norm": 1.282532052644482, + "learning_rate": 3.3411015126383016e-06, + "loss": 0.3361, + "step": 21356 + }, + { + "epoch": 0.62, + "grad_norm": 1.3620506554065512, + "learning_rate": 3.3406584180073067e-06, + "loss": 0.2964, + "step": 21357 + }, + { + "epoch": 0.62, + "grad_norm": 1.3218938088000307, + "learning_rate": 3.3402153380200206e-06, + "loss": 0.2771, + "step": 21358 + }, + { + "epoch": 0.62, + "grad_norm": 0.9477049185706148, + "learning_rate": 3.339772272680353e-06, + "loss": 0.6362, + "step": 21359 + }, + { + "epoch": 0.62, + "grad_norm": 1.4043521861620643, + "learning_rate": 3.339329221992212e-06, + "loss": 0.3219, + "step": 21360 + }, + { + "epoch": 0.62, + "grad_norm": 1.2071972875615575, + "learning_rate": 3.3388861859595097e-06, + "loss": 0.2756, + "step": 21361 + }, + { + "epoch": 0.62, + "grad_norm": 1.2878537086268746, + "learning_rate": 3.338443164586155e-06, + "loss": 0.2855, + "step": 21362 + }, + { + "epoch": 0.62, + "grad_norm": 1.427080204411674, + "learning_rate": 3.338000157876059e-06, + "loss": 0.32, + "step": 21363 + }, + { + "epoch": 0.62, + "grad_norm": 1.2131531890575589, + "learning_rate": 3.3375571658331284e-06, + "loss": 0.2863, + "step": 21364 + }, + { + "epoch": 0.62, + "grad_norm": 1.4171226980028575, + "learning_rate": 3.3371141884612747e-06, + "loss": 0.3148, + "step": 21365 + }, + { + "epoch": 0.62, + "grad_norm": 1.2340935604150818, + "learning_rate": 3.336671225764407e-06, + "loss": 0.2732, + "step": 21366 + }, + { + "epoch": 0.62, + "grad_norm": 1.2959593230795727, + "learning_rate": 3.336228277746435e-06, + "loss": 0.3037, + "step": 21367 + }, + { + "epoch": 0.62, + "grad_norm": 1.3674290139699639, + "learning_rate": 3.3357853444112664e-06, + "loss": 0.3179, + "step": 21368 + }, + { + "epoch": 0.62, + "grad_norm": 1.3526479284400186, + "learning_rate": 3.3353424257628113e-06, + "loss": 0.3118, + "step": 21369 + }, + { + "epoch": 0.62, + "grad_norm": 1.7083072965616353, + "learning_rate": 3.3348995218049764e-06, + "loss": 0.2986, + "step": 21370 + }, + { + "epoch": 0.62, + "grad_norm": 1.3771667495674322, + "learning_rate": 3.3344566325416715e-06, + "loss": 0.307, + "step": 21371 + }, + { + "epoch": 0.62, + "grad_norm": 1.2816449138018866, + "learning_rate": 3.334013757976806e-06, + "loss": 0.2948, + "step": 21372 + }, + { + "epoch": 0.62, + "grad_norm": 1.5610691810289512, + "learning_rate": 3.3335708981142883e-06, + "loss": 0.2713, + "step": 21373 + }, + { + "epoch": 0.62, + "grad_norm": 1.577790439018207, + "learning_rate": 3.3331280529580243e-06, + "loss": 0.2892, + "step": 21374 + }, + { + "epoch": 0.62, + "grad_norm": 1.311880479632781, + "learning_rate": 3.3326852225119245e-06, + "loss": 0.2959, + "step": 21375 + }, + { + "epoch": 0.62, + "grad_norm": 1.2121467365763912, + "learning_rate": 3.3322424067798958e-06, + "loss": 0.2928, + "step": 21376 + }, + { + "epoch": 0.62, + "grad_norm": 1.4695035580350149, + "learning_rate": 3.3317996057658476e-06, + "loss": 0.299, + "step": 21377 + }, + { + "epoch": 0.62, + "grad_norm": 1.349472241202647, + "learning_rate": 3.331356819473685e-06, + "loss": 0.2971, + "step": 21378 + }, + { + "epoch": 0.62, + "grad_norm": 1.3251687757017916, + "learning_rate": 3.330914047907317e-06, + "loss": 0.3074, + "step": 21379 + }, + { + "epoch": 0.62, + "grad_norm": 1.3662537818735545, + "learning_rate": 3.3304712910706517e-06, + "loss": 0.29, + "step": 21380 + }, + { + "epoch": 0.62, + "grad_norm": 2.3726627594421643, + "learning_rate": 3.330028548967595e-06, + "loss": 0.2626, + "step": 21381 + }, + { + "epoch": 0.62, + "grad_norm": 1.3590888197176998, + "learning_rate": 3.329585821602056e-06, + "loss": 0.2939, + "step": 21382 + }, + { + "epoch": 0.62, + "grad_norm": 1.3828649493191416, + "learning_rate": 3.32914310897794e-06, + "loss": 0.306, + "step": 21383 + }, + { + "epoch": 0.62, + "grad_norm": 1.3433221197533602, + "learning_rate": 3.328700411099154e-06, + "loss": 0.2942, + "step": 21384 + }, + { + "epoch": 0.62, + "grad_norm": 1.2533657496175312, + "learning_rate": 3.328257727969606e-06, + "loss": 0.2795, + "step": 21385 + }, + { + "epoch": 0.62, + "grad_norm": 1.2660586746517735, + "learning_rate": 3.327815059593203e-06, + "loss": 0.294, + "step": 21386 + }, + { + "epoch": 0.62, + "grad_norm": 1.4614262154941613, + "learning_rate": 3.3273724059738493e-06, + "loss": 0.2711, + "step": 21387 + }, + { + "epoch": 0.62, + "grad_norm": 1.8090775897433513, + "learning_rate": 3.326929767115453e-06, + "loss": 0.3132, + "step": 21388 + }, + { + "epoch": 0.62, + "grad_norm": 1.248808563442271, + "learning_rate": 3.32648714302192e-06, + "loss": 0.3128, + "step": 21389 + }, + { + "epoch": 0.62, + "grad_norm": 1.3273191831022813, + "learning_rate": 3.3260445336971573e-06, + "loss": 0.3155, + "step": 21390 + }, + { + "epoch": 0.62, + "grad_norm": 1.399273755999428, + "learning_rate": 3.3256019391450696e-06, + "loss": 0.2873, + "step": 21391 + }, + { + "epoch": 0.62, + "grad_norm": 0.978307797706669, + "learning_rate": 3.325159359369563e-06, + "loss": 0.6044, + "step": 21392 + }, + { + "epoch": 0.62, + "grad_norm": 1.567908290354552, + "learning_rate": 3.324716794374544e-06, + "loss": 0.2751, + "step": 21393 + }, + { + "epoch": 0.62, + "grad_norm": 1.1520608991216106, + "learning_rate": 3.3242742441639185e-06, + "loss": 0.2602, + "step": 21394 + }, + { + "epoch": 0.62, + "grad_norm": 1.642424247994024, + "learning_rate": 3.3238317087415906e-06, + "loss": 0.3314, + "step": 21395 + }, + { + "epoch": 0.62, + "grad_norm": 1.9815254792294354, + "learning_rate": 3.3233891881114682e-06, + "loss": 0.2638, + "step": 21396 + }, + { + "epoch": 0.62, + "grad_norm": 1.303994385655574, + "learning_rate": 3.322946682277453e-06, + "loss": 0.283, + "step": 21397 + }, + { + "epoch": 0.62, + "grad_norm": 1.2165936001353934, + "learning_rate": 3.322504191243452e-06, + "loss": 0.2913, + "step": 21398 + }, + { + "epoch": 0.62, + "grad_norm": 1.293477207861328, + "learning_rate": 3.32206171501337e-06, + "loss": 0.2827, + "step": 21399 + }, + { + "epoch": 0.62, + "grad_norm": 1.2189037323828391, + "learning_rate": 3.3216192535911116e-06, + "loss": 0.2895, + "step": 21400 + }, + { + "epoch": 0.62, + "grad_norm": 1.4624516059021613, + "learning_rate": 3.3211768069805838e-06, + "loss": 0.3052, + "step": 21401 + }, + { + "epoch": 0.62, + "grad_norm": 1.406186391224972, + "learning_rate": 3.320734375185688e-06, + "loss": 0.2774, + "step": 21402 + }, + { + "epoch": 0.62, + "grad_norm": 1.543945486633179, + "learning_rate": 3.3202919582103298e-06, + "loss": 0.2906, + "step": 21403 + }, + { + "epoch": 0.62, + "grad_norm": 1.8787829780851901, + "learning_rate": 3.3198495560584132e-06, + "loss": 0.2844, + "step": 21404 + }, + { + "epoch": 0.62, + "grad_norm": 1.445466578872821, + "learning_rate": 3.319407168733845e-06, + "loss": 0.2886, + "step": 21405 + }, + { + "epoch": 0.62, + "grad_norm": 1.2982341177955834, + "learning_rate": 3.3189647962405257e-06, + "loss": 0.3123, + "step": 21406 + }, + { + "epoch": 0.62, + "grad_norm": 1.3734292458160853, + "learning_rate": 3.31852243858236e-06, + "loss": 0.3186, + "step": 21407 + }, + { + "epoch": 0.62, + "grad_norm": 1.2991316055219986, + "learning_rate": 3.318080095763253e-06, + "loss": 0.3001, + "step": 21408 + }, + { + "epoch": 0.62, + "grad_norm": 1.4113451247788458, + "learning_rate": 3.3176377677871097e-06, + "loss": 0.2868, + "step": 21409 + }, + { + "epoch": 0.62, + "grad_norm": 1.4019577813447008, + "learning_rate": 3.3171954546578295e-06, + "loss": 0.2852, + "step": 21410 + }, + { + "epoch": 0.62, + "grad_norm": 1.3321931044355493, + "learning_rate": 3.3167531563793188e-06, + "loss": 0.3021, + "step": 21411 + }, + { + "epoch": 0.62, + "grad_norm": 1.4455833389976733, + "learning_rate": 3.3163108729554794e-06, + "loss": 0.2807, + "step": 21412 + }, + { + "epoch": 0.62, + "grad_norm": 1.2803546031322772, + "learning_rate": 3.3158686043902166e-06, + "loss": 0.2827, + "step": 21413 + }, + { + "epoch": 0.62, + "grad_norm": 1.2103922028043879, + "learning_rate": 3.3154263506874317e-06, + "loss": 0.3002, + "step": 21414 + }, + { + "epoch": 0.62, + "grad_norm": 1.464547533518391, + "learning_rate": 3.3149841118510272e-06, + "loss": 0.2823, + "step": 21415 + }, + { + "epoch": 0.62, + "grad_norm": 1.5366931408144102, + "learning_rate": 3.3145418878849065e-06, + "loss": 0.3425, + "step": 21416 + }, + { + "epoch": 0.62, + "grad_norm": 1.485326284575558, + "learning_rate": 3.3140996787929724e-06, + "loss": 0.315, + "step": 21417 + }, + { + "epoch": 0.62, + "grad_norm": 1.2369364508738285, + "learning_rate": 3.313657484579129e-06, + "loss": 0.3087, + "step": 21418 + }, + { + "epoch": 0.62, + "grad_norm": 1.4471396978698865, + "learning_rate": 3.313215305247276e-06, + "loss": 0.2887, + "step": 21419 + }, + { + "epoch": 0.62, + "grad_norm": 1.3555363944237544, + "learning_rate": 3.312773140801316e-06, + "loss": 0.2868, + "step": 21420 + }, + { + "epoch": 0.62, + "grad_norm": 1.6797793924616504, + "learning_rate": 3.312330991245152e-06, + "loss": 0.3023, + "step": 21421 + }, + { + "epoch": 0.62, + "grad_norm": 1.3229430936876532, + "learning_rate": 3.311888856582687e-06, + "loss": 0.2932, + "step": 21422 + }, + { + "epoch": 0.62, + "grad_norm": 1.8326271342333613, + "learning_rate": 3.3114467368178195e-06, + "loss": 0.2984, + "step": 21423 + }, + { + "epoch": 0.62, + "grad_norm": 1.3358846500865433, + "learning_rate": 3.3110046319544563e-06, + "loss": 0.295, + "step": 21424 + }, + { + "epoch": 0.62, + "grad_norm": 1.2605831167896013, + "learning_rate": 3.3105625419964937e-06, + "loss": 0.3015, + "step": 21425 + }, + { + "epoch": 0.62, + "grad_norm": 1.2630721027222127, + "learning_rate": 3.3101204669478356e-06, + "loss": 0.29, + "step": 21426 + }, + { + "epoch": 0.62, + "grad_norm": 1.2304356929471942, + "learning_rate": 3.3096784068123826e-06, + "loss": 0.2685, + "step": 21427 + }, + { + "epoch": 0.62, + "grad_norm": 0.9951637953361412, + "learning_rate": 3.3092363615940383e-06, + "loss": 0.6099, + "step": 21428 + }, + { + "epoch": 0.62, + "grad_norm": 1.3071932428780828, + "learning_rate": 3.3087943312967e-06, + "loss": 0.3126, + "step": 21429 + }, + { + "epoch": 0.62, + "grad_norm": 1.2333397315502006, + "learning_rate": 3.3083523159242704e-06, + "loss": 0.2851, + "step": 21430 + }, + { + "epoch": 0.62, + "grad_norm": 1.3257423418837264, + "learning_rate": 3.3079103154806513e-06, + "loss": 0.2908, + "step": 21431 + }, + { + "epoch": 0.62, + "grad_norm": 1.4684805780924615, + "learning_rate": 3.307468329969743e-06, + "loss": 0.2833, + "step": 21432 + }, + { + "epoch": 0.62, + "grad_norm": 1.3975820577099656, + "learning_rate": 3.3070263593954444e-06, + "loss": 0.3008, + "step": 21433 + }, + { + "epoch": 0.62, + "grad_norm": 1.3875587279295494, + "learning_rate": 3.3065844037616568e-06, + "loss": 0.2971, + "step": 21434 + }, + { + "epoch": 0.62, + "grad_norm": 1.3513786313463938, + "learning_rate": 3.3061424630722807e-06, + "loss": 0.2935, + "step": 21435 + }, + { + "epoch": 0.62, + "grad_norm": 1.3647616772494349, + "learning_rate": 3.3057005373312162e-06, + "loss": 0.2896, + "step": 21436 + }, + { + "epoch": 0.62, + "grad_norm": 1.2502635132323987, + "learning_rate": 3.305258626542364e-06, + "loss": 0.2884, + "step": 21437 + }, + { + "epoch": 0.62, + "grad_norm": 1.4243623651012502, + "learning_rate": 3.304816730709622e-06, + "loss": 0.3023, + "step": 21438 + }, + { + "epoch": 0.62, + "grad_norm": 1.3848089942715867, + "learning_rate": 3.304374849836892e-06, + "loss": 0.2974, + "step": 21439 + }, + { + "epoch": 0.62, + "grad_norm": 1.209638822329031, + "learning_rate": 3.3039329839280725e-06, + "loss": 0.2913, + "step": 21440 + }, + { + "epoch": 0.62, + "grad_norm": 1.2942393218345913, + "learning_rate": 3.3034911329870647e-06, + "loss": 0.3009, + "step": 21441 + }, + { + "epoch": 0.62, + "grad_norm": 1.282237634127423, + "learning_rate": 3.3030492970177648e-06, + "loss": 0.3091, + "step": 21442 + }, + { + "epoch": 0.62, + "grad_norm": 1.7678759400875528, + "learning_rate": 3.3026074760240735e-06, + "loss": 0.3071, + "step": 21443 + }, + { + "epoch": 0.62, + "grad_norm": 1.3326444369480495, + "learning_rate": 3.3021656700098905e-06, + "loss": 0.2946, + "step": 21444 + }, + { + "epoch": 0.62, + "grad_norm": 0.9655545642405743, + "learning_rate": 3.301723878979115e-06, + "loss": 0.6012, + "step": 21445 + }, + { + "epoch": 0.62, + "grad_norm": 1.281790717358245, + "learning_rate": 3.3012821029356446e-06, + "loss": 0.2834, + "step": 21446 + }, + { + "epoch": 0.62, + "grad_norm": 2.284335093224845, + "learning_rate": 3.3008403418833778e-06, + "loss": 0.2876, + "step": 21447 + }, + { + "epoch": 0.62, + "grad_norm": 1.5690602448102302, + "learning_rate": 3.300398595826214e-06, + "loss": 0.3073, + "step": 21448 + }, + { + "epoch": 0.62, + "grad_norm": 1.2017368874231404, + "learning_rate": 3.2999568647680526e-06, + "loss": 0.2738, + "step": 21449 + }, + { + "epoch": 0.62, + "grad_norm": 1.6733457291476785, + "learning_rate": 3.2995151487127895e-06, + "loss": 0.297, + "step": 21450 + }, + { + "epoch": 0.62, + "grad_norm": 1.4253332905788074, + "learning_rate": 3.2990734476643248e-06, + "loss": 0.2798, + "step": 21451 + }, + { + "epoch": 0.62, + "grad_norm": 1.2635876445443475, + "learning_rate": 3.2986317616265574e-06, + "loss": 0.2823, + "step": 21452 + }, + { + "epoch": 0.62, + "grad_norm": 1.5596707313951625, + "learning_rate": 3.2981900906033817e-06, + "loss": 0.2996, + "step": 21453 + }, + { + "epoch": 0.62, + "grad_norm": 1.2290495429590909, + "learning_rate": 3.2977484345986964e-06, + "loss": 0.3083, + "step": 21454 + }, + { + "epoch": 0.62, + "grad_norm": 1.3552364641772447, + "learning_rate": 3.297306793616402e-06, + "loss": 0.3133, + "step": 21455 + }, + { + "epoch": 0.62, + "grad_norm": 1.28256992537367, + "learning_rate": 3.296865167660393e-06, + "loss": 0.2983, + "step": 21456 + }, + { + "epoch": 0.62, + "grad_norm": 1.4240718948372595, + "learning_rate": 3.2964235567345675e-06, + "loss": 0.3079, + "step": 21457 + }, + { + "epoch": 0.62, + "grad_norm": 0.9192652622655971, + "learning_rate": 3.2959819608428233e-06, + "loss": 0.5939, + "step": 21458 + }, + { + "epoch": 0.62, + "grad_norm": 1.4038740601308772, + "learning_rate": 3.2955403799890567e-06, + "loss": 0.2962, + "step": 21459 + }, + { + "epoch": 0.62, + "grad_norm": 1.354356057203916, + "learning_rate": 3.295098814177167e-06, + "loss": 0.3208, + "step": 21460 + }, + { + "epoch": 0.62, + "grad_norm": 1.4130869854725192, + "learning_rate": 3.2946572634110474e-06, + "loss": 0.3012, + "step": 21461 + }, + { + "epoch": 0.62, + "grad_norm": 1.5714716537037425, + "learning_rate": 3.2942157276945964e-06, + "loss": 0.3059, + "step": 21462 + }, + { + "epoch": 0.62, + "grad_norm": 1.2423213664119832, + "learning_rate": 3.2937742070317106e-06, + "loss": 0.2916, + "step": 21463 + }, + { + "epoch": 0.62, + "grad_norm": 1.3705498858607659, + "learning_rate": 3.2933327014262878e-06, + "loss": 0.2931, + "step": 21464 + }, + { + "epoch": 0.62, + "grad_norm": 1.2921306104241586, + "learning_rate": 3.2928912108822216e-06, + "loss": 0.3101, + "step": 21465 + }, + { + "epoch": 0.62, + "grad_norm": 1.328035828718957, + "learning_rate": 3.292449735403409e-06, + "loss": 0.3147, + "step": 21466 + }, + { + "epoch": 0.62, + "grad_norm": 2.9505728776133804, + "learning_rate": 3.2920082749937464e-06, + "loss": 0.3047, + "step": 21467 + }, + { + "epoch": 0.62, + "grad_norm": 2.1952553055152557, + "learning_rate": 3.2915668296571314e-06, + "loss": 0.2779, + "step": 21468 + }, + { + "epoch": 0.62, + "grad_norm": 1.286950072690588, + "learning_rate": 3.2911253993974567e-06, + "loss": 0.3286, + "step": 21469 + }, + { + "epoch": 0.62, + "grad_norm": 1.2083951037230887, + "learning_rate": 3.2906839842186197e-06, + "loss": 0.2939, + "step": 21470 + }, + { + "epoch": 0.62, + "grad_norm": 1.4807681815539933, + "learning_rate": 3.290242584124515e-06, + "loss": 0.2656, + "step": 21471 + }, + { + "epoch": 0.62, + "grad_norm": 1.3756069382213916, + "learning_rate": 3.2898011991190387e-06, + "loss": 0.3092, + "step": 21472 + }, + { + "epoch": 0.62, + "grad_norm": 1.3819133661153995, + "learning_rate": 3.289359829206087e-06, + "loss": 0.2926, + "step": 21473 + }, + { + "epoch": 0.62, + "grad_norm": 1.4235217273935905, + "learning_rate": 3.288918474389552e-06, + "loss": 0.3073, + "step": 21474 + }, + { + "epoch": 0.62, + "grad_norm": 1.2860796750872496, + "learning_rate": 3.2884771346733313e-06, + "loss": 0.3029, + "step": 21475 + }, + { + "epoch": 0.62, + "grad_norm": 1.33361330088213, + "learning_rate": 3.2880358100613186e-06, + "loss": 0.3197, + "step": 21476 + }, + { + "epoch": 0.62, + "grad_norm": 1.3233332140442018, + "learning_rate": 3.2875945005574096e-06, + "loss": 0.292, + "step": 21477 + }, + { + "epoch": 0.62, + "grad_norm": 1.461427683223094, + "learning_rate": 3.2871532061654975e-06, + "loss": 0.2708, + "step": 21478 + }, + { + "epoch": 0.62, + "grad_norm": 1.2189098748320732, + "learning_rate": 3.2867119268894766e-06, + "loss": 0.2847, + "step": 21479 + }, + { + "epoch": 0.62, + "grad_norm": 1.5737646081904195, + "learning_rate": 3.2862706627332454e-06, + "loss": 0.2952, + "step": 21480 + }, + { + "epoch": 0.62, + "grad_norm": 1.3813158427263375, + "learning_rate": 3.2858294137006913e-06, + "loss": 0.2938, + "step": 21481 + }, + { + "epoch": 0.62, + "grad_norm": 1.2712525844036162, + "learning_rate": 3.285388179795712e-06, + "loss": 0.2959, + "step": 21482 + }, + { + "epoch": 0.62, + "grad_norm": 1.4677616507905524, + "learning_rate": 3.2849469610222017e-06, + "loss": 0.3019, + "step": 21483 + }, + { + "epoch": 0.62, + "grad_norm": 1.2949066660801867, + "learning_rate": 3.2845057573840533e-06, + "loss": 0.2875, + "step": 21484 + }, + { + "epoch": 0.62, + "grad_norm": 1.46319135597328, + "learning_rate": 3.2840645688851603e-06, + "loss": 0.2994, + "step": 21485 + }, + { + "epoch": 0.62, + "grad_norm": 1.356175538011024, + "learning_rate": 3.283623395529416e-06, + "loss": 0.2906, + "step": 21486 + }, + { + "epoch": 0.62, + "grad_norm": 1.6153967257469646, + "learning_rate": 3.2831822373207163e-06, + "loss": 0.2997, + "step": 21487 + }, + { + "epoch": 0.62, + "grad_norm": 1.4837843540492655, + "learning_rate": 3.2827410942629505e-06, + "loss": 0.2904, + "step": 21488 + }, + { + "epoch": 0.62, + "grad_norm": 1.4445121969570123, + "learning_rate": 3.282299966360014e-06, + "loss": 0.3009, + "step": 21489 + }, + { + "epoch": 0.62, + "grad_norm": 1.3162597157821352, + "learning_rate": 3.2818588536157993e-06, + "loss": 0.2771, + "step": 21490 + }, + { + "epoch": 0.62, + "grad_norm": 1.4049043130345091, + "learning_rate": 3.2814177560342e-06, + "loss": 0.2754, + "step": 21491 + }, + { + "epoch": 0.62, + "grad_norm": 1.5522186600914052, + "learning_rate": 3.2809766736191074e-06, + "loss": 0.3137, + "step": 21492 + }, + { + "epoch": 0.62, + "grad_norm": 1.3992552292629588, + "learning_rate": 3.2805356063744148e-06, + "loss": 0.2834, + "step": 21493 + }, + { + "epoch": 0.62, + "grad_norm": 1.313278307589701, + "learning_rate": 3.280094554304014e-06, + "loss": 0.3017, + "step": 21494 + }, + { + "epoch": 0.62, + "grad_norm": 1.3763563173501376, + "learning_rate": 3.279653517411798e-06, + "loss": 0.2687, + "step": 21495 + }, + { + "epoch": 0.62, + "grad_norm": 1.3717661669621009, + "learning_rate": 3.27921249570166e-06, + "loss": 0.2919, + "step": 21496 + }, + { + "epoch": 0.62, + "grad_norm": 1.3471414987588075, + "learning_rate": 3.2787714891774903e-06, + "loss": 0.3072, + "step": 21497 + }, + { + "epoch": 0.62, + "grad_norm": 1.5436311218961147, + "learning_rate": 3.2783304978431806e-06, + "loss": 0.2873, + "step": 21498 + }, + { + "epoch": 0.62, + "grad_norm": 1.4723833995377307, + "learning_rate": 3.277889521702624e-06, + "loss": 0.304, + "step": 21499 + }, + { + "epoch": 0.62, + "grad_norm": 1.2343875952253593, + "learning_rate": 3.2774485607597116e-06, + "loss": 0.2936, + "step": 21500 + }, + { + "epoch": 0.62, + "grad_norm": 1.3842129718023244, + "learning_rate": 3.2770076150183344e-06, + "loss": 0.2844, + "step": 21501 + }, + { + "epoch": 0.62, + "grad_norm": 1.3519733406064065, + "learning_rate": 3.276566684482384e-06, + "loss": 0.3004, + "step": 21502 + }, + { + "epoch": 0.62, + "grad_norm": 1.4013298830052248, + "learning_rate": 3.276125769155751e-06, + "loss": 0.3058, + "step": 21503 + }, + { + "epoch": 0.62, + "grad_norm": 0.9641397727299382, + "learning_rate": 3.275684869042329e-06, + "loss": 0.6055, + "step": 21504 + }, + { + "epoch": 0.62, + "grad_norm": 2.2995539787477344, + "learning_rate": 3.2752439841460063e-06, + "loss": 0.2806, + "step": 21505 + }, + { + "epoch": 0.62, + "grad_norm": 1.2439073975015853, + "learning_rate": 3.274803114470675e-06, + "loss": 0.2868, + "step": 21506 + }, + { + "epoch": 0.62, + "grad_norm": 1.2349997897775682, + "learning_rate": 3.274362260020224e-06, + "loss": 0.2848, + "step": 21507 + }, + { + "epoch": 0.62, + "grad_norm": 1.2018693941898122, + "learning_rate": 3.2739214207985482e-06, + "loss": 0.2875, + "step": 21508 + }, + { + "epoch": 0.62, + "grad_norm": 1.3599147776669795, + "learning_rate": 3.273480596809533e-06, + "loss": 0.2875, + "step": 21509 + }, + { + "epoch": 0.62, + "grad_norm": 1.35918478657374, + "learning_rate": 3.2730397880570707e-06, + "loss": 0.2708, + "step": 21510 + }, + { + "epoch": 0.62, + "grad_norm": 1.3705532772919586, + "learning_rate": 3.2725989945450515e-06, + "loss": 0.2847, + "step": 21511 + }, + { + "epoch": 0.62, + "grad_norm": 1.3735179534172175, + "learning_rate": 3.2721582162773646e-06, + "loss": 0.3206, + "step": 21512 + }, + { + "epoch": 0.62, + "grad_norm": 1.4320932191833373, + "learning_rate": 3.271717453257901e-06, + "loss": 0.3095, + "step": 21513 + }, + { + "epoch": 0.62, + "grad_norm": 1.322917018295014, + "learning_rate": 3.2712767054905503e-06, + "loss": 0.3042, + "step": 21514 + }, + { + "epoch": 0.62, + "grad_norm": 4.05225754009945, + "learning_rate": 3.270835972979202e-06, + "loss": 0.3126, + "step": 21515 + }, + { + "epoch": 0.62, + "grad_norm": 1.2296547184068047, + "learning_rate": 3.270395255727745e-06, + "loss": 0.2723, + "step": 21516 + }, + { + "epoch": 0.62, + "grad_norm": 1.5125197780747834, + "learning_rate": 3.269954553740069e-06, + "loss": 0.2817, + "step": 21517 + }, + { + "epoch": 0.62, + "grad_norm": 1.4244013003434373, + "learning_rate": 3.269513867020063e-06, + "loss": 0.3048, + "step": 21518 + }, + { + "epoch": 0.62, + "grad_norm": 1.553606345232099, + "learning_rate": 3.269073195571618e-06, + "loss": 0.279, + "step": 21519 + }, + { + "epoch": 0.62, + "grad_norm": 1.3966367862725253, + "learning_rate": 3.2686325393986195e-06, + "loss": 0.2765, + "step": 21520 + }, + { + "epoch": 0.62, + "grad_norm": 1.3386190078871076, + "learning_rate": 3.2681918985049586e-06, + "loss": 0.3034, + "step": 21521 + }, + { + "epoch": 0.62, + "grad_norm": 1.3532378258225857, + "learning_rate": 3.2677512728945233e-06, + "loss": 0.2935, + "step": 21522 + }, + { + "epoch": 0.62, + "grad_norm": 1.3855147262093284, + "learning_rate": 3.267310662571203e-06, + "loss": 0.3411, + "step": 21523 + }, + { + "epoch": 0.62, + "grad_norm": 1.2712854398367757, + "learning_rate": 3.2668700675388844e-06, + "loss": 0.303, + "step": 21524 + }, + { + "epoch": 0.62, + "grad_norm": 1.3459236277564885, + "learning_rate": 3.2664294878014568e-06, + "loss": 0.3231, + "step": 21525 + }, + { + "epoch": 0.62, + "grad_norm": 1.2092473322018242, + "learning_rate": 3.2659889233628082e-06, + "loss": 0.2686, + "step": 21526 + }, + { + "epoch": 0.62, + "grad_norm": 1.2618728651978302, + "learning_rate": 3.265548374226828e-06, + "loss": 0.2857, + "step": 21527 + }, + { + "epoch": 0.62, + "grad_norm": 1.6714462519933384, + "learning_rate": 3.2651078403974007e-06, + "loss": 0.285, + "step": 21528 + }, + { + "epoch": 0.62, + "grad_norm": 1.3021503347593084, + "learning_rate": 3.2646673218784166e-06, + "loss": 0.2928, + "step": 21529 + }, + { + "epoch": 0.62, + "grad_norm": 1.6190015715448722, + "learning_rate": 3.2642268186737623e-06, + "loss": 0.289, + "step": 21530 + }, + { + "epoch": 0.62, + "grad_norm": 1.2851439166467749, + "learning_rate": 3.2637863307873253e-06, + "loss": 0.2877, + "step": 21531 + }, + { + "epoch": 0.62, + "grad_norm": 1.2374021590705402, + "learning_rate": 3.263345858222995e-06, + "loss": 0.2632, + "step": 21532 + }, + { + "epoch": 0.62, + "grad_norm": 1.476423429927284, + "learning_rate": 3.262905400984655e-06, + "loss": 0.3006, + "step": 21533 + }, + { + "epoch": 0.62, + "grad_norm": 1.2444120634341365, + "learning_rate": 3.2624649590761946e-06, + "loss": 0.2874, + "step": 21534 + }, + { + "epoch": 0.62, + "grad_norm": 1.3512654031759461, + "learning_rate": 3.2620245325014997e-06, + "loss": 0.2887, + "step": 21535 + }, + { + "epoch": 0.62, + "grad_norm": 1.266673184394061, + "learning_rate": 3.2615841212644608e-06, + "loss": 0.31, + "step": 21536 + }, + { + "epoch": 0.62, + "grad_norm": 1.563341487197524, + "learning_rate": 3.2611437253689583e-06, + "loss": 0.3067, + "step": 21537 + }, + { + "epoch": 0.62, + "grad_norm": 0.9136334280650237, + "learning_rate": 3.260703344818883e-06, + "loss": 0.57, + "step": 21538 + }, + { + "epoch": 0.62, + "grad_norm": 1.200425784884263, + "learning_rate": 3.2602629796181185e-06, + "loss": 0.2795, + "step": 21539 + }, + { + "epoch": 0.62, + "grad_norm": 1.292433825707433, + "learning_rate": 3.259822629770553e-06, + "loss": 0.3288, + "step": 21540 + }, + { + "epoch": 0.62, + "grad_norm": 1.2384419528339148, + "learning_rate": 3.2593822952800726e-06, + "loss": 0.2887, + "step": 21541 + }, + { + "epoch": 0.62, + "grad_norm": 1.4784473366037771, + "learning_rate": 3.2589419761505627e-06, + "loss": 0.2939, + "step": 21542 + }, + { + "epoch": 0.62, + "grad_norm": 1.3368384238438342, + "learning_rate": 3.2585016723859086e-06, + "loss": 0.2788, + "step": 21543 + }, + { + "epoch": 0.62, + "grad_norm": 1.3107539056793442, + "learning_rate": 3.2580613839899964e-06, + "loss": 0.288, + "step": 21544 + }, + { + "epoch": 0.62, + "grad_norm": 1.2616017063019447, + "learning_rate": 3.257621110966712e-06, + "loss": 0.2942, + "step": 21545 + }, + { + "epoch": 0.62, + "grad_norm": 1.4967870581725067, + "learning_rate": 3.2571808533199415e-06, + "loss": 0.2818, + "step": 21546 + }, + { + "epoch": 0.62, + "grad_norm": 1.2120795208693564, + "learning_rate": 3.256740611053568e-06, + "loss": 0.2878, + "step": 21547 + }, + { + "epoch": 0.62, + "grad_norm": 1.3513426427130266, + "learning_rate": 3.256300384171479e-06, + "loss": 0.2975, + "step": 21548 + }, + { + "epoch": 0.63, + "grad_norm": 1.3798041704836483, + "learning_rate": 3.2558601726775573e-06, + "loss": 0.3081, + "step": 21549 + }, + { + "epoch": 0.63, + "grad_norm": 1.365849824846852, + "learning_rate": 3.25541997657569e-06, + "loss": 0.2908, + "step": 21550 + }, + { + "epoch": 0.63, + "grad_norm": 1.2974179762656532, + "learning_rate": 3.254979795869761e-06, + "loss": 0.2971, + "step": 21551 + }, + { + "epoch": 0.63, + "grad_norm": 2.573509780513447, + "learning_rate": 3.254539630563654e-06, + "loss": 0.2896, + "step": 21552 + }, + { + "epoch": 0.63, + "grad_norm": 1.715076159901581, + "learning_rate": 3.2540994806612538e-06, + "loss": 0.2802, + "step": 21553 + }, + { + "epoch": 0.63, + "grad_norm": 2.138001706455173, + "learning_rate": 3.2536593461664458e-06, + "loss": 0.287, + "step": 21554 + }, + { + "epoch": 0.63, + "grad_norm": 1.3191297367456785, + "learning_rate": 3.253219227083113e-06, + "loss": 0.2752, + "step": 21555 + }, + { + "epoch": 0.63, + "grad_norm": 1.2529636058054292, + "learning_rate": 3.25277912341514e-06, + "loss": 0.285, + "step": 21556 + }, + { + "epoch": 0.63, + "grad_norm": 1.3098398089187147, + "learning_rate": 3.2523390351664106e-06, + "loss": 0.2588, + "step": 21557 + }, + { + "epoch": 0.63, + "grad_norm": 1.4280333922824635, + "learning_rate": 3.2518989623408084e-06, + "loss": 0.301, + "step": 21558 + }, + { + "epoch": 0.63, + "grad_norm": 1.3580549821462884, + "learning_rate": 3.2514589049422184e-06, + "loss": 0.2932, + "step": 21559 + }, + { + "epoch": 0.63, + "grad_norm": 1.7598843593712217, + "learning_rate": 3.2510188629745215e-06, + "loss": 0.2718, + "step": 21560 + }, + { + "epoch": 0.63, + "grad_norm": 2.613437708812301, + "learning_rate": 3.250578836441603e-06, + "loss": 0.2878, + "step": 21561 + }, + { + "epoch": 0.63, + "grad_norm": 1.4338322566520205, + "learning_rate": 3.2501388253473455e-06, + "loss": 0.3048, + "step": 21562 + }, + { + "epoch": 0.63, + "grad_norm": 1.2739427200393754, + "learning_rate": 3.2496988296956333e-06, + "loss": 0.2879, + "step": 21563 + }, + { + "epoch": 0.63, + "grad_norm": 1.3649770497828253, + "learning_rate": 3.2492588494903483e-06, + "loss": 0.3021, + "step": 21564 + }, + { + "epoch": 0.63, + "grad_norm": 1.1914030026095932, + "learning_rate": 3.248818884735373e-06, + "loss": 0.2718, + "step": 21565 + }, + { + "epoch": 0.63, + "grad_norm": 1.307660585174073, + "learning_rate": 3.24837893543459e-06, + "loss": 0.3031, + "step": 21566 + }, + { + "epoch": 0.63, + "grad_norm": 1.314775203011033, + "learning_rate": 3.2479390015918823e-06, + "loss": 0.2701, + "step": 21567 + }, + { + "epoch": 0.63, + "grad_norm": 1.6820399051370614, + "learning_rate": 3.2474990832111327e-06, + "loss": 0.2806, + "step": 21568 + }, + { + "epoch": 0.63, + "grad_norm": 1.31438075006674, + "learning_rate": 3.2470591802962226e-06, + "loss": 0.2829, + "step": 21569 + }, + { + "epoch": 0.63, + "grad_norm": 2.8062439991416213, + "learning_rate": 3.2466192928510352e-06, + "loss": 0.3195, + "step": 21570 + }, + { + "epoch": 0.63, + "grad_norm": 1.2362739421233937, + "learning_rate": 3.2461794208794517e-06, + "loss": 0.2911, + "step": 21571 + }, + { + "epoch": 0.63, + "grad_norm": 1.2433270241484158, + "learning_rate": 3.245739564385354e-06, + "loss": 0.3008, + "step": 21572 + }, + { + "epoch": 0.63, + "grad_norm": 1.438055901328728, + "learning_rate": 3.2452997233726246e-06, + "loss": 0.307, + "step": 21573 + }, + { + "epoch": 0.63, + "grad_norm": 1.9061164900248222, + "learning_rate": 3.244859897845145e-06, + "loss": 0.3153, + "step": 21574 + }, + { + "epoch": 0.63, + "grad_norm": 1.2553989372574172, + "learning_rate": 3.2444200878067954e-06, + "loss": 0.2842, + "step": 21575 + }, + { + "epoch": 0.63, + "grad_norm": 1.4727996911189967, + "learning_rate": 3.2439802932614585e-06, + "loss": 0.291, + "step": 21576 + }, + { + "epoch": 0.63, + "grad_norm": 1.6602951509418316, + "learning_rate": 3.2435405142130145e-06, + "loss": 0.304, + "step": 21577 + }, + { + "epoch": 0.63, + "grad_norm": 2.631635215642783, + "learning_rate": 3.243100750665346e-06, + "loss": 0.3167, + "step": 21578 + }, + { + "epoch": 0.63, + "grad_norm": 1.3003327110329208, + "learning_rate": 3.2426610026223317e-06, + "loss": 0.2786, + "step": 21579 + }, + { + "epoch": 0.63, + "grad_norm": 1.3018427007264408, + "learning_rate": 3.242221270087854e-06, + "loss": 0.2851, + "step": 21580 + }, + { + "epoch": 0.63, + "grad_norm": 1.2841785426459038, + "learning_rate": 3.2417815530657924e-06, + "loss": 0.2963, + "step": 21581 + }, + { + "epoch": 0.63, + "grad_norm": 1.405102152177565, + "learning_rate": 3.2413418515600303e-06, + "loss": 0.3172, + "step": 21582 + }, + { + "epoch": 0.63, + "grad_norm": 1.3923137837175203, + "learning_rate": 3.2409021655744436e-06, + "loss": 0.275, + "step": 21583 + }, + { + "epoch": 0.63, + "grad_norm": 1.6443040071532156, + "learning_rate": 3.2404624951129156e-06, + "loss": 0.2977, + "step": 21584 + }, + { + "epoch": 0.63, + "grad_norm": 1.4065770261213497, + "learning_rate": 3.2400228401793255e-06, + "loss": 0.3123, + "step": 21585 + }, + { + "epoch": 0.63, + "grad_norm": 1.4425788475426133, + "learning_rate": 3.239583200777553e-06, + "loss": 0.2817, + "step": 21586 + }, + { + "epoch": 0.63, + "grad_norm": 1.4558315251826275, + "learning_rate": 3.239143576911479e-06, + "loss": 0.2864, + "step": 21587 + }, + { + "epoch": 0.63, + "grad_norm": 1.3765598819094451, + "learning_rate": 3.238703968584982e-06, + "loss": 0.3055, + "step": 21588 + }, + { + "epoch": 0.63, + "grad_norm": 1.2901786116519671, + "learning_rate": 3.238264375801942e-06, + "loss": 0.289, + "step": 21589 + }, + { + "epoch": 0.63, + "grad_norm": 1.197394873483976, + "learning_rate": 3.2378247985662387e-06, + "loss": 0.2748, + "step": 21590 + }, + { + "epoch": 0.63, + "grad_norm": 1.340137335419953, + "learning_rate": 3.2373852368817515e-06, + "loss": 0.2975, + "step": 21591 + }, + { + "epoch": 0.63, + "grad_norm": 1.3993419520542378, + "learning_rate": 3.23694569075236e-06, + "loss": 0.2752, + "step": 21592 + }, + { + "epoch": 0.63, + "grad_norm": 1.431058771721201, + "learning_rate": 3.2365061601819416e-06, + "loss": 0.301, + "step": 21593 + }, + { + "epoch": 0.63, + "grad_norm": 1.271512590976876, + "learning_rate": 3.236066645174375e-06, + "loss": 0.2994, + "step": 21594 + }, + { + "epoch": 0.63, + "grad_norm": 1.345879213348608, + "learning_rate": 3.23562714573354e-06, + "loss": 0.2978, + "step": 21595 + }, + { + "epoch": 0.63, + "grad_norm": 1.2690834661041628, + "learning_rate": 3.235187661863315e-06, + "loss": 0.2909, + "step": 21596 + }, + { + "epoch": 0.63, + "grad_norm": 1.5667386824039755, + "learning_rate": 3.234748193567579e-06, + "loss": 0.2917, + "step": 21597 + }, + { + "epoch": 0.63, + "grad_norm": 1.6062662911943935, + "learning_rate": 3.2343087408502094e-06, + "loss": 0.2946, + "step": 21598 + }, + { + "epoch": 0.63, + "grad_norm": 1.3566230312445982, + "learning_rate": 3.233869303715084e-06, + "loss": 0.3153, + "step": 21599 + }, + { + "epoch": 0.63, + "grad_norm": 1.84797306235576, + "learning_rate": 3.2334298821660824e-06, + "loss": 0.3065, + "step": 21600 + }, + { + "epoch": 0.63, + "grad_norm": 1.3507153466017958, + "learning_rate": 3.2329904762070817e-06, + "loss": 0.3136, + "step": 21601 + }, + { + "epoch": 0.63, + "grad_norm": 1.4675516308138503, + "learning_rate": 3.2325510858419597e-06, + "loss": 0.2883, + "step": 21602 + }, + { + "epoch": 0.63, + "grad_norm": 1.5002983153512872, + "learning_rate": 3.2321117110745937e-06, + "loss": 0.2872, + "step": 21603 + }, + { + "epoch": 0.63, + "grad_norm": 1.2612430415145153, + "learning_rate": 3.231672351908861e-06, + "loss": 0.2914, + "step": 21604 + }, + { + "epoch": 0.63, + "grad_norm": 1.4253013313434952, + "learning_rate": 3.23123300834864e-06, + "loss": 0.2976, + "step": 21605 + }, + { + "epoch": 0.63, + "grad_norm": 1.7136863228758186, + "learning_rate": 3.2307936803978077e-06, + "loss": 0.2664, + "step": 21606 + }, + { + "epoch": 0.63, + "grad_norm": 1.3125196030737543, + "learning_rate": 3.2303543680602406e-06, + "loss": 0.2742, + "step": 21607 + }, + { + "epoch": 0.63, + "grad_norm": 1.2598868724819339, + "learning_rate": 3.229915071339815e-06, + "loss": 0.2756, + "step": 21608 + }, + { + "epoch": 0.63, + "grad_norm": 1.56458167406799, + "learning_rate": 3.2294757902404093e-06, + "loss": 0.2982, + "step": 21609 + }, + { + "epoch": 0.63, + "grad_norm": 1.3632968389681543, + "learning_rate": 3.2290365247659e-06, + "loss": 0.291, + "step": 21610 + }, + { + "epoch": 0.63, + "grad_norm": 1.5355909559287353, + "learning_rate": 3.2285972749201623e-06, + "loss": 0.2899, + "step": 21611 + }, + { + "epoch": 0.63, + "grad_norm": 1.3753598318664468, + "learning_rate": 3.2281580407070723e-06, + "loss": 0.3137, + "step": 21612 + }, + { + "epoch": 0.63, + "grad_norm": 1.4121546786994172, + "learning_rate": 3.227718822130508e-06, + "loss": 0.2988, + "step": 21613 + }, + { + "epoch": 0.63, + "grad_norm": 1.283399053819783, + "learning_rate": 3.227279619194346e-06, + "loss": 0.2698, + "step": 21614 + }, + { + "epoch": 0.63, + "grad_norm": 1.8098734258852693, + "learning_rate": 3.22684043190246e-06, + "loss": 0.286, + "step": 21615 + }, + { + "epoch": 0.63, + "grad_norm": 1.2509584356364636, + "learning_rate": 3.2264012602587263e-06, + "loss": 0.2712, + "step": 21616 + }, + { + "epoch": 0.63, + "grad_norm": 0.9514428055717187, + "learning_rate": 3.225962104267022e-06, + "loss": 0.5825, + "step": 21617 + }, + { + "epoch": 0.63, + "grad_norm": 1.2710888172824284, + "learning_rate": 3.225522963931222e-06, + "loss": 0.2767, + "step": 21618 + }, + { + "epoch": 0.63, + "grad_norm": 1.4815671466538731, + "learning_rate": 3.225083839255201e-06, + "loss": 0.3096, + "step": 21619 + }, + { + "epoch": 0.63, + "grad_norm": 1.316513397295726, + "learning_rate": 3.224644730242834e-06, + "loss": 0.2856, + "step": 21620 + }, + { + "epoch": 0.63, + "grad_norm": 1.2575303638021196, + "learning_rate": 3.2242056368980003e-06, + "loss": 0.2782, + "step": 21621 + }, + { + "epoch": 0.63, + "grad_norm": 1.7663807283470885, + "learning_rate": 3.2237665592245682e-06, + "loss": 0.2726, + "step": 21622 + }, + { + "epoch": 0.63, + "grad_norm": 2.03210712489824, + "learning_rate": 3.2233274972264172e-06, + "loss": 0.289, + "step": 21623 + }, + { + "epoch": 0.63, + "grad_norm": 1.4252828828046578, + "learning_rate": 3.22288845090742e-06, + "loss": 0.3275, + "step": 21624 + }, + { + "epoch": 0.63, + "grad_norm": 2.0221521451896103, + "learning_rate": 3.222449420271453e-06, + "loss": 0.2927, + "step": 21625 + }, + { + "epoch": 0.63, + "grad_norm": 1.2482108421885234, + "learning_rate": 3.222010405322388e-06, + "loss": 0.2822, + "step": 21626 + }, + { + "epoch": 0.63, + "grad_norm": 1.6420500086441685, + "learning_rate": 3.221571406064102e-06, + "loss": 0.2812, + "step": 21627 + }, + { + "epoch": 0.63, + "grad_norm": 1.2271919613919022, + "learning_rate": 3.2211324225004673e-06, + "loss": 0.2817, + "step": 21628 + }, + { + "epoch": 0.63, + "grad_norm": 1.3111750441907268, + "learning_rate": 3.2206934546353604e-06, + "loss": 0.3242, + "step": 21629 + }, + { + "epoch": 0.63, + "grad_norm": 1.5413586652191862, + "learning_rate": 3.220254502472652e-06, + "loss": 0.3131, + "step": 21630 + }, + { + "epoch": 0.63, + "grad_norm": 1.2325743532178635, + "learning_rate": 3.2198155660162173e-06, + "loss": 0.2926, + "step": 21631 + }, + { + "epoch": 0.63, + "grad_norm": 1.3825380960361684, + "learning_rate": 3.2193766452699303e-06, + "loss": 0.3131, + "step": 21632 + }, + { + "epoch": 0.63, + "grad_norm": 1.330893411866999, + "learning_rate": 3.2189377402376646e-06, + "loss": 0.2965, + "step": 21633 + }, + { + "epoch": 0.63, + "grad_norm": 1.3085762135276495, + "learning_rate": 3.2184988509232926e-06, + "loss": 0.3036, + "step": 21634 + }, + { + "epoch": 0.63, + "grad_norm": 2.971080441908035, + "learning_rate": 3.218059977330688e-06, + "loss": 0.2657, + "step": 21635 + }, + { + "epoch": 0.63, + "grad_norm": 1.2017244406287784, + "learning_rate": 3.2176211194637234e-06, + "loss": 0.2754, + "step": 21636 + }, + { + "epoch": 0.63, + "grad_norm": 1.354145741508568, + "learning_rate": 3.217182277326274e-06, + "loss": 0.291, + "step": 21637 + }, + { + "epoch": 0.63, + "grad_norm": 1.48755510053687, + "learning_rate": 3.2167434509222096e-06, + "loss": 0.2885, + "step": 21638 + }, + { + "epoch": 0.63, + "grad_norm": 1.2564446571537193, + "learning_rate": 3.2163046402554032e-06, + "loss": 0.2778, + "step": 21639 + }, + { + "epoch": 0.63, + "grad_norm": 1.302115447079981, + "learning_rate": 3.215865845329729e-06, + "loss": 0.2816, + "step": 21640 + }, + { + "epoch": 0.63, + "grad_norm": 1.6056315605821185, + "learning_rate": 3.215427066149058e-06, + "loss": 0.3135, + "step": 21641 + }, + { + "epoch": 0.63, + "grad_norm": 1.5750828726275794, + "learning_rate": 3.2149883027172644e-06, + "loss": 0.3099, + "step": 21642 + }, + { + "epoch": 0.63, + "grad_norm": 1.5433726389426072, + "learning_rate": 3.214549555038218e-06, + "loss": 0.2764, + "step": 21643 + }, + { + "epoch": 0.63, + "grad_norm": 1.3214181146773634, + "learning_rate": 3.21411082311579e-06, + "loss": 0.2911, + "step": 21644 + }, + { + "epoch": 0.63, + "grad_norm": 1.5560085707245124, + "learning_rate": 3.2136721069538557e-06, + "loss": 0.2746, + "step": 21645 + }, + { + "epoch": 0.63, + "grad_norm": 1.7282954372004464, + "learning_rate": 3.213233406556285e-06, + "loss": 0.2966, + "step": 21646 + }, + { + "epoch": 0.63, + "grad_norm": 1.2021065792416488, + "learning_rate": 3.2127947219269486e-06, + "loss": 0.2802, + "step": 21647 + }, + { + "epoch": 0.63, + "grad_norm": 1.359026515848916, + "learning_rate": 3.2123560530697185e-06, + "loss": 0.3021, + "step": 21648 + }, + { + "epoch": 0.63, + "grad_norm": 1.461744911608169, + "learning_rate": 3.2119173999884678e-06, + "loss": 0.2753, + "step": 21649 + }, + { + "epoch": 0.63, + "grad_norm": 1.489209590645383, + "learning_rate": 3.2114787626870635e-06, + "loss": 0.2937, + "step": 21650 + }, + { + "epoch": 0.63, + "grad_norm": 1.4545015828261796, + "learning_rate": 3.2110401411693804e-06, + "loss": 0.2818, + "step": 21651 + }, + { + "epoch": 0.63, + "grad_norm": 1.3234495449109593, + "learning_rate": 3.2106015354392878e-06, + "loss": 0.2834, + "step": 21652 + }, + { + "epoch": 0.63, + "grad_norm": 1.8426097800357306, + "learning_rate": 3.2101629455006557e-06, + "loss": 0.2907, + "step": 21653 + }, + { + "epoch": 0.63, + "grad_norm": 1.5426196291946297, + "learning_rate": 3.2097243713573555e-06, + "loss": 0.3267, + "step": 21654 + }, + { + "epoch": 0.63, + "grad_norm": 1.4991741747117018, + "learning_rate": 3.2092858130132578e-06, + "loss": 0.2987, + "step": 21655 + }, + { + "epoch": 0.63, + "grad_norm": 1.4223961832214576, + "learning_rate": 3.208847270472233e-06, + "loss": 0.2936, + "step": 21656 + }, + { + "epoch": 0.63, + "grad_norm": 1.364383263832316, + "learning_rate": 3.208408743738151e-06, + "loss": 0.2898, + "step": 21657 + }, + { + "epoch": 0.63, + "grad_norm": 1.4106554409522603, + "learning_rate": 3.2079702328148805e-06, + "loss": 0.3262, + "step": 21658 + }, + { + "epoch": 0.63, + "grad_norm": 1.274412297900659, + "learning_rate": 3.207531737706293e-06, + "loss": 0.3013, + "step": 21659 + }, + { + "epoch": 0.63, + "grad_norm": 1.6421955140283893, + "learning_rate": 3.2070932584162583e-06, + "loss": 0.308, + "step": 21660 + }, + { + "epoch": 0.63, + "grad_norm": 1.7621464183733284, + "learning_rate": 3.206654794948646e-06, + "loss": 0.285, + "step": 21661 + }, + { + "epoch": 0.63, + "grad_norm": 1.4068814450082454, + "learning_rate": 3.206216347307324e-06, + "loss": 0.2855, + "step": 21662 + }, + { + "epoch": 0.63, + "grad_norm": 1.7241353137456448, + "learning_rate": 3.2057779154961634e-06, + "loss": 0.3251, + "step": 21663 + }, + { + "epoch": 0.63, + "grad_norm": 1.2055765097917228, + "learning_rate": 3.205339499519031e-06, + "loss": 0.2938, + "step": 21664 + }, + { + "epoch": 0.63, + "grad_norm": 1.4142413896150252, + "learning_rate": 3.2049010993798e-06, + "loss": 0.3164, + "step": 21665 + }, + { + "epoch": 0.63, + "grad_norm": 1.198864543660347, + "learning_rate": 3.204462715082335e-06, + "loss": 0.2754, + "step": 21666 + }, + { + "epoch": 0.63, + "grad_norm": 1.3158550487577545, + "learning_rate": 3.2040243466305065e-06, + "loss": 0.2615, + "step": 21667 + }, + { + "epoch": 0.63, + "grad_norm": 1.4151503015674978, + "learning_rate": 3.2035859940281833e-06, + "loss": 0.2996, + "step": 21668 + }, + { + "epoch": 0.63, + "grad_norm": 0.9583151946260154, + "learning_rate": 3.203147657279234e-06, + "loss": 0.6197, + "step": 21669 + }, + { + "epoch": 0.63, + "grad_norm": 1.263153125786969, + "learning_rate": 3.2027093363875263e-06, + "loss": 0.2809, + "step": 21670 + }, + { + "epoch": 0.63, + "grad_norm": 1.2820263103156098, + "learning_rate": 3.202271031356928e-06, + "loss": 0.2925, + "step": 21671 + }, + { + "epoch": 0.63, + "grad_norm": 1.2876381041912421, + "learning_rate": 3.2018327421913087e-06, + "loss": 0.2967, + "step": 21672 + }, + { + "epoch": 0.63, + "grad_norm": 1.3302416798265557, + "learning_rate": 3.201394468894536e-06, + "loss": 0.2989, + "step": 21673 + }, + { + "epoch": 0.63, + "grad_norm": 1.3498168058706725, + "learning_rate": 3.200956211470476e-06, + "loss": 0.306, + "step": 21674 + }, + { + "epoch": 0.63, + "grad_norm": 1.256210461881346, + "learning_rate": 3.200517969922997e-06, + "loss": 0.2904, + "step": 21675 + }, + { + "epoch": 0.63, + "grad_norm": 1.4785568246980998, + "learning_rate": 3.2000797442559673e-06, + "loss": 0.2945, + "step": 21676 + }, + { + "epoch": 0.63, + "grad_norm": 1.422668259085592, + "learning_rate": 3.1996415344732554e-06, + "loss": 0.2962, + "step": 21677 + }, + { + "epoch": 0.63, + "grad_norm": 1.2364070068144541, + "learning_rate": 3.199203340578726e-06, + "loss": 0.2842, + "step": 21678 + }, + { + "epoch": 0.63, + "grad_norm": 1.3001936924500856, + "learning_rate": 3.1987651625762473e-06, + "loss": 0.3031, + "step": 21679 + }, + { + "epoch": 0.63, + "grad_norm": 2.1227858277544955, + "learning_rate": 3.198327000469685e-06, + "loss": 0.2927, + "step": 21680 + }, + { + "epoch": 0.63, + "grad_norm": 1.3544180342146666, + "learning_rate": 3.197888854262907e-06, + "loss": 0.291, + "step": 21681 + }, + { + "epoch": 0.63, + "grad_norm": 1.5816370647507498, + "learning_rate": 3.1974507239597798e-06, + "loss": 0.2706, + "step": 21682 + }, + { + "epoch": 0.63, + "grad_norm": 1.2899482415524506, + "learning_rate": 3.1970126095641696e-06, + "loss": 0.2938, + "step": 21683 + }, + { + "epoch": 0.63, + "grad_norm": 1.323288649943598, + "learning_rate": 3.196574511079945e-06, + "loss": 0.294, + "step": 21684 + }, + { + "epoch": 0.63, + "grad_norm": 1.3438366629915446, + "learning_rate": 3.1961364285109686e-06, + "loss": 0.3026, + "step": 21685 + }, + { + "epoch": 0.63, + "grad_norm": 1.2775284818918597, + "learning_rate": 3.195698361861108e-06, + "loss": 0.2811, + "step": 21686 + }, + { + "epoch": 0.63, + "grad_norm": 1.3327944956405955, + "learning_rate": 3.1952603111342304e-06, + "loss": 0.3052, + "step": 21687 + }, + { + "epoch": 0.63, + "grad_norm": 1.2198195009924413, + "learning_rate": 3.194822276334201e-06, + "loss": 0.2849, + "step": 21688 + }, + { + "epoch": 0.63, + "grad_norm": 1.3121354206450229, + "learning_rate": 3.194384257464884e-06, + "loss": 0.2788, + "step": 21689 + }, + { + "epoch": 0.63, + "grad_norm": 1.2611773464402225, + "learning_rate": 3.193946254530146e-06, + "loss": 0.2985, + "step": 21690 + }, + { + "epoch": 0.63, + "grad_norm": 1.2610191789761362, + "learning_rate": 3.1935082675338523e-06, + "loss": 0.2917, + "step": 21691 + }, + { + "epoch": 0.63, + "grad_norm": 1.2542319264323682, + "learning_rate": 3.19307029647987e-06, + "loss": 0.3224, + "step": 21692 + }, + { + "epoch": 0.63, + "grad_norm": 1.3333305202703765, + "learning_rate": 3.1926323413720605e-06, + "loss": 0.2755, + "step": 21693 + }, + { + "epoch": 0.63, + "grad_norm": 1.2991767581991434, + "learning_rate": 3.1921944022142904e-06, + "loss": 0.2915, + "step": 21694 + }, + { + "epoch": 0.63, + "grad_norm": 1.3658304195170672, + "learning_rate": 3.191756479010425e-06, + "loss": 0.2913, + "step": 21695 + }, + { + "epoch": 0.63, + "grad_norm": 1.3089154385356045, + "learning_rate": 3.191318571764329e-06, + "loss": 0.2954, + "step": 21696 + }, + { + "epoch": 0.63, + "grad_norm": 1.282784211485123, + "learning_rate": 3.1908806804798675e-06, + "loss": 0.284, + "step": 21697 + }, + { + "epoch": 0.63, + "grad_norm": 1.5668494630420007, + "learning_rate": 3.1904428051609034e-06, + "loss": 0.2868, + "step": 21698 + }, + { + "epoch": 0.63, + "grad_norm": 1.257922122291378, + "learning_rate": 3.1900049458113013e-06, + "loss": 0.2911, + "step": 21699 + }, + { + "epoch": 0.63, + "grad_norm": 1.2289581879706812, + "learning_rate": 3.189567102434925e-06, + "loss": 0.3071, + "step": 21700 + }, + { + "epoch": 0.63, + "grad_norm": 1.434017970330092, + "learning_rate": 3.189129275035641e-06, + "loss": 0.3191, + "step": 21701 + }, + { + "epoch": 0.63, + "grad_norm": 1.3492867145608827, + "learning_rate": 3.1886914636173105e-06, + "loss": 0.3266, + "step": 21702 + }, + { + "epoch": 0.63, + "grad_norm": 1.3954631309425312, + "learning_rate": 3.1882536681837974e-06, + "loss": 0.3066, + "step": 21703 + }, + { + "epoch": 0.63, + "grad_norm": 1.2563788889757903, + "learning_rate": 3.1878158887389653e-06, + "loss": 0.2882, + "step": 21704 + }, + { + "epoch": 0.63, + "grad_norm": 1.3544232259593068, + "learning_rate": 3.1873781252866808e-06, + "loss": 0.2721, + "step": 21705 + }, + { + "epoch": 0.63, + "grad_norm": 1.429090930802187, + "learning_rate": 3.1869403778308023e-06, + "loss": 0.2877, + "step": 21706 + }, + { + "epoch": 0.63, + "grad_norm": 1.4408700252775641, + "learning_rate": 3.1865026463751958e-06, + "loss": 0.2933, + "step": 21707 + }, + { + "epoch": 0.63, + "grad_norm": 1.198137732936792, + "learning_rate": 3.186064930923723e-06, + "loss": 0.2769, + "step": 21708 + }, + { + "epoch": 0.63, + "grad_norm": 1.289741588996083, + "learning_rate": 3.185627231480247e-06, + "loss": 0.2707, + "step": 21709 + }, + { + "epoch": 0.63, + "grad_norm": 1.504690053261639, + "learning_rate": 3.1851895480486305e-06, + "loss": 0.2791, + "step": 21710 + }, + { + "epoch": 0.63, + "grad_norm": 1.2988424877441826, + "learning_rate": 3.184751880632738e-06, + "loss": 0.2885, + "step": 21711 + }, + { + "epoch": 0.63, + "grad_norm": 1.43322923290838, + "learning_rate": 3.1843142292364293e-06, + "loss": 0.3002, + "step": 21712 + }, + { + "epoch": 0.63, + "grad_norm": 0.9571592182214291, + "learning_rate": 3.1838765938635673e-06, + "loss": 0.5845, + "step": 21713 + }, + { + "epoch": 0.63, + "grad_norm": 1.5174952703888012, + "learning_rate": 3.183438974518015e-06, + "loss": 0.2709, + "step": 21714 + }, + { + "epoch": 0.63, + "grad_norm": 1.9476015494293943, + "learning_rate": 3.1830013712036346e-06, + "loss": 0.2816, + "step": 21715 + }, + { + "epoch": 0.63, + "grad_norm": 2.243247375866319, + "learning_rate": 3.1825637839242856e-06, + "loss": 0.2948, + "step": 21716 + }, + { + "epoch": 0.63, + "grad_norm": 1.022714799604269, + "learning_rate": 3.182126212683832e-06, + "loss": 0.5966, + "step": 21717 + }, + { + "epoch": 0.63, + "grad_norm": 1.4428051384618996, + "learning_rate": 3.181688657486135e-06, + "loss": 0.2988, + "step": 21718 + }, + { + "epoch": 0.63, + "grad_norm": 1.3470944901130408, + "learning_rate": 3.181251118335055e-06, + "loss": 0.285, + "step": 21719 + }, + { + "epoch": 0.63, + "grad_norm": 1.7536689265043255, + "learning_rate": 3.1808135952344553e-06, + "loss": 0.2997, + "step": 21720 + }, + { + "epoch": 0.63, + "grad_norm": 1.3479369063728817, + "learning_rate": 3.180376088188195e-06, + "loss": 0.3021, + "step": 21721 + }, + { + "epoch": 0.63, + "grad_norm": 1.5539991876268615, + "learning_rate": 3.179938597200136e-06, + "loss": 0.284, + "step": 21722 + }, + { + "epoch": 0.63, + "grad_norm": 1.419809766102284, + "learning_rate": 3.179501122274139e-06, + "loss": 0.2817, + "step": 21723 + }, + { + "epoch": 0.63, + "grad_norm": 1.6699505093257423, + "learning_rate": 3.179063663414066e-06, + "loss": 0.3077, + "step": 21724 + }, + { + "epoch": 0.63, + "grad_norm": 1.6656489827730334, + "learning_rate": 3.178626220623775e-06, + "loss": 0.2957, + "step": 21725 + }, + { + "epoch": 0.63, + "grad_norm": 1.4462808393620887, + "learning_rate": 3.178188793907128e-06, + "loss": 0.2927, + "step": 21726 + }, + { + "epoch": 0.63, + "grad_norm": 1.4792713152418084, + "learning_rate": 3.1777513832679854e-06, + "loss": 0.3216, + "step": 21727 + }, + { + "epoch": 0.63, + "grad_norm": 1.2362863774666555, + "learning_rate": 3.177313988710208e-06, + "loss": 0.2943, + "step": 21728 + }, + { + "epoch": 0.63, + "grad_norm": 1.6721105285650695, + "learning_rate": 3.1768766102376534e-06, + "loss": 0.308, + "step": 21729 + }, + { + "epoch": 0.63, + "grad_norm": 1.3728293768033537, + "learning_rate": 3.1764392478541827e-06, + "loss": 0.2805, + "step": 21730 + }, + { + "epoch": 0.63, + "grad_norm": 1.622889590585239, + "learning_rate": 3.1760019015636566e-06, + "loss": 0.2831, + "step": 21731 + }, + { + "epoch": 0.63, + "grad_norm": 2.100659916033101, + "learning_rate": 3.1755645713699336e-06, + "loss": 0.2937, + "step": 21732 + }, + { + "epoch": 0.63, + "grad_norm": 1.0430674967796032, + "learning_rate": 3.1751272572768756e-06, + "loss": 0.5865, + "step": 21733 + }, + { + "epoch": 0.63, + "grad_norm": 1.3235803876984014, + "learning_rate": 3.1746899592883385e-06, + "loss": 0.2832, + "step": 21734 + }, + { + "epoch": 0.63, + "grad_norm": 1.504338477605617, + "learning_rate": 3.1742526774081822e-06, + "loss": 0.3108, + "step": 21735 + }, + { + "epoch": 0.63, + "grad_norm": 1.437802591228707, + "learning_rate": 3.1738154116402664e-06, + "loss": 0.2834, + "step": 21736 + }, + { + "epoch": 0.63, + "grad_norm": 1.276466747553967, + "learning_rate": 3.1733781619884496e-06, + "loss": 0.3018, + "step": 21737 + }, + { + "epoch": 0.63, + "grad_norm": 0.9578650370438693, + "learning_rate": 3.1729409284565905e-06, + "loss": 0.5997, + "step": 21738 + }, + { + "epoch": 0.63, + "grad_norm": 1.3093385996491347, + "learning_rate": 3.172503711048549e-06, + "loss": 0.2874, + "step": 21739 + }, + { + "epoch": 0.63, + "grad_norm": 1.2584118867015102, + "learning_rate": 3.172066509768182e-06, + "loss": 0.2919, + "step": 21740 + }, + { + "epoch": 0.63, + "grad_norm": 1.5128620489521551, + "learning_rate": 3.171629324619348e-06, + "loss": 0.2818, + "step": 21741 + }, + { + "epoch": 0.63, + "grad_norm": 1.38758140612749, + "learning_rate": 3.1711921556059054e-06, + "loss": 0.2898, + "step": 21742 + }, + { + "epoch": 0.63, + "grad_norm": 1.4153391394305157, + "learning_rate": 3.1707550027317135e-06, + "loss": 0.2881, + "step": 21743 + }, + { + "epoch": 0.63, + "grad_norm": 1.4179787243126005, + "learning_rate": 3.170317866000628e-06, + "loss": 0.2927, + "step": 21744 + }, + { + "epoch": 0.63, + "grad_norm": 1.250400194816044, + "learning_rate": 3.1698807454165075e-06, + "loss": 0.2817, + "step": 21745 + }, + { + "epoch": 0.63, + "grad_norm": 1.4076037383208415, + "learning_rate": 3.1694436409832096e-06, + "loss": 0.3107, + "step": 21746 + }, + { + "epoch": 0.63, + "grad_norm": 1.4588090513191383, + "learning_rate": 3.169006552704593e-06, + "loss": 0.3015, + "step": 21747 + }, + { + "epoch": 0.63, + "grad_norm": 1.6711513967418175, + "learning_rate": 3.1685694805845136e-06, + "loss": 0.3168, + "step": 21748 + }, + { + "epoch": 0.63, + "grad_norm": 1.2505224905970198, + "learning_rate": 3.168132424626828e-06, + "loss": 0.2744, + "step": 21749 + }, + { + "epoch": 0.63, + "grad_norm": 1.4224374670155482, + "learning_rate": 3.1676953848353942e-06, + "loss": 0.2916, + "step": 21750 + }, + { + "epoch": 0.63, + "grad_norm": 1.4557789376532169, + "learning_rate": 3.16725836121407e-06, + "loss": 0.306, + "step": 21751 + }, + { + "epoch": 0.63, + "grad_norm": 1.2839493807404003, + "learning_rate": 3.1668213537667103e-06, + "loss": 0.2878, + "step": 21752 + }, + { + "epoch": 0.63, + "grad_norm": 1.3147514130832025, + "learning_rate": 3.1663843624971724e-06, + "loss": 0.2853, + "step": 21753 + }, + { + "epoch": 0.63, + "grad_norm": 1.3502150400772817, + "learning_rate": 3.1659473874093127e-06, + "loss": 0.2926, + "step": 21754 + }, + { + "epoch": 0.63, + "grad_norm": 7.957294078711789, + "learning_rate": 3.1655104285069875e-06, + "loss": 0.316, + "step": 21755 + }, + { + "epoch": 0.63, + "grad_norm": 1.3643136409719296, + "learning_rate": 3.1650734857940547e-06, + "loss": 0.3043, + "step": 21756 + }, + { + "epoch": 0.63, + "grad_norm": 1.2781006001542674, + "learning_rate": 3.164636559274367e-06, + "loss": 0.2807, + "step": 21757 + }, + { + "epoch": 0.63, + "grad_norm": 1.1986103554191614, + "learning_rate": 3.1641996489517822e-06, + "loss": 0.2595, + "step": 21758 + }, + { + "epoch": 0.63, + "grad_norm": 1.2210918934955126, + "learning_rate": 3.1637627548301565e-06, + "loss": 0.2943, + "step": 21759 + }, + { + "epoch": 0.63, + "grad_norm": 1.3852317466325088, + "learning_rate": 3.1633258769133456e-06, + "loss": 0.2837, + "step": 21760 + }, + { + "epoch": 0.63, + "grad_norm": 1.3947097649471174, + "learning_rate": 3.1628890152052038e-06, + "loss": 0.293, + "step": 21761 + }, + { + "epoch": 0.63, + "grad_norm": 1.4732451015966275, + "learning_rate": 3.162452169709587e-06, + "loss": 0.3127, + "step": 21762 + }, + { + "epoch": 0.63, + "grad_norm": 1.3822039814762064, + "learning_rate": 3.1620153404303496e-06, + "loss": 0.3265, + "step": 21763 + }, + { + "epoch": 0.63, + "grad_norm": 1.9332040966848993, + "learning_rate": 3.161578527371346e-06, + "loss": 0.2989, + "step": 21764 + }, + { + "epoch": 0.63, + "grad_norm": 1.7296867746894826, + "learning_rate": 3.1611417305364332e-06, + "loss": 0.2956, + "step": 21765 + }, + { + "epoch": 0.63, + "grad_norm": 1.2205617091360377, + "learning_rate": 3.1607049499294663e-06, + "loss": 0.2843, + "step": 21766 + }, + { + "epoch": 0.63, + "grad_norm": 1.2907510963476416, + "learning_rate": 3.1602681855542972e-06, + "loss": 0.2923, + "step": 21767 + }, + { + "epoch": 0.63, + "grad_norm": 1.2339041233456347, + "learning_rate": 3.159831437414782e-06, + "loss": 0.2882, + "step": 21768 + }, + { + "epoch": 0.63, + "grad_norm": 1.5313959445797667, + "learning_rate": 3.1593947055147745e-06, + "loss": 0.3009, + "step": 21769 + }, + { + "epoch": 0.63, + "grad_norm": 1.3430305186879883, + "learning_rate": 3.158957989858131e-06, + "loss": 0.2859, + "step": 21770 + }, + { + "epoch": 0.63, + "grad_norm": 1.283823557946213, + "learning_rate": 3.1585212904487016e-06, + "loss": 0.2942, + "step": 21771 + }, + { + "epoch": 0.63, + "grad_norm": 1.4124344184730406, + "learning_rate": 3.1580846072903426e-06, + "loss": 0.3003, + "step": 21772 + }, + { + "epoch": 0.63, + "grad_norm": 1.2512949275918297, + "learning_rate": 3.1576479403869076e-06, + "loss": 0.2715, + "step": 21773 + }, + { + "epoch": 0.63, + "grad_norm": 1.3100089179630767, + "learning_rate": 3.1572112897422493e-06, + "loss": 0.2863, + "step": 21774 + }, + { + "epoch": 0.63, + "grad_norm": 1.4611306773859172, + "learning_rate": 3.1567746553602237e-06, + "loss": 0.2981, + "step": 21775 + }, + { + "epoch": 0.63, + "grad_norm": 1.7685418950691303, + "learning_rate": 3.1563380372446807e-06, + "loss": 0.2806, + "step": 21776 + }, + { + "epoch": 0.63, + "grad_norm": 1.4579269021000738, + "learning_rate": 3.155901435399475e-06, + "loss": 0.2769, + "step": 21777 + }, + { + "epoch": 0.63, + "grad_norm": 1.308549288584718, + "learning_rate": 3.155464849828459e-06, + "loss": 0.2819, + "step": 21778 + }, + { + "epoch": 0.63, + "grad_norm": 1.4923038789000898, + "learning_rate": 3.1550282805354883e-06, + "loss": 0.291, + "step": 21779 + }, + { + "epoch": 0.63, + "grad_norm": 1.2601739398760006, + "learning_rate": 3.154591727524412e-06, + "loss": 0.2899, + "step": 21780 + }, + { + "epoch": 0.63, + "grad_norm": 1.3423075503619764, + "learning_rate": 3.154155190799084e-06, + "loss": 0.2811, + "step": 21781 + }, + { + "epoch": 0.63, + "grad_norm": 1.3978701522172376, + "learning_rate": 3.153718670363357e-06, + "loss": 0.3324, + "step": 21782 + }, + { + "epoch": 0.63, + "grad_norm": 1.45742239710731, + "learning_rate": 3.1532821662210843e-06, + "loss": 0.29, + "step": 21783 + }, + { + "epoch": 0.63, + "grad_norm": 1.35182484058472, + "learning_rate": 3.1528456783761165e-06, + "loss": 0.2701, + "step": 21784 + }, + { + "epoch": 0.63, + "grad_norm": 1.5409111785942278, + "learning_rate": 3.1524092068323054e-06, + "loss": 0.2897, + "step": 21785 + }, + { + "epoch": 0.63, + "grad_norm": 1.1894598038709228, + "learning_rate": 3.1519727515935033e-06, + "loss": 0.2856, + "step": 21786 + }, + { + "epoch": 0.63, + "grad_norm": 0.9882846000262644, + "learning_rate": 3.1515363126635633e-06, + "loss": 0.5929, + "step": 21787 + }, + { + "epoch": 0.63, + "grad_norm": 1.2297346662473985, + "learning_rate": 3.1510998900463353e-06, + "loss": 0.3006, + "step": 21788 + }, + { + "epoch": 0.63, + "grad_norm": 1.225466478568304, + "learning_rate": 3.1506634837456727e-06, + "loss": 0.2743, + "step": 21789 + }, + { + "epoch": 0.63, + "grad_norm": 1.159078764762869, + "learning_rate": 3.1502270937654246e-06, + "loss": 0.2908, + "step": 21790 + }, + { + "epoch": 0.63, + "grad_norm": 1.4834478094118964, + "learning_rate": 3.149790720109442e-06, + "loss": 0.2941, + "step": 21791 + }, + { + "epoch": 0.63, + "grad_norm": 1.7426189548559692, + "learning_rate": 3.149354362781577e-06, + "loss": 0.2944, + "step": 21792 + }, + { + "epoch": 0.63, + "grad_norm": 1.6770510722549075, + "learning_rate": 3.1489180217856795e-06, + "loss": 0.2889, + "step": 21793 + }, + { + "epoch": 0.63, + "grad_norm": 1.2971150763241635, + "learning_rate": 3.1484816971256027e-06, + "loss": 0.2875, + "step": 21794 + }, + { + "epoch": 0.63, + "grad_norm": 1.2893980006124086, + "learning_rate": 3.1480453888051943e-06, + "loss": 0.2945, + "step": 21795 + }, + { + "epoch": 0.63, + "grad_norm": 1.361770584021552, + "learning_rate": 3.1476090968283064e-06, + "loss": 0.3079, + "step": 21796 + }, + { + "epoch": 0.63, + "grad_norm": 1.6112240043781132, + "learning_rate": 3.1471728211987886e-06, + "loss": 0.2851, + "step": 21797 + }, + { + "epoch": 0.63, + "grad_norm": 1.366436226306952, + "learning_rate": 3.1467365619204913e-06, + "loss": 0.286, + "step": 21798 + }, + { + "epoch": 0.63, + "grad_norm": 1.420354378349759, + "learning_rate": 3.1463003189972648e-06, + "loss": 0.2879, + "step": 21799 + }, + { + "epoch": 0.63, + "grad_norm": 1.4578454420261773, + "learning_rate": 3.1458640924329575e-06, + "loss": 0.2783, + "step": 21800 + }, + { + "epoch": 0.63, + "grad_norm": 1.7724710889699018, + "learning_rate": 3.1454278822314205e-06, + "loss": 0.2904, + "step": 21801 + }, + { + "epoch": 0.63, + "grad_norm": 1.2320994046221756, + "learning_rate": 3.1449916883965036e-06, + "loss": 0.2852, + "step": 21802 + }, + { + "epoch": 0.63, + "grad_norm": 1.7459657681791854, + "learning_rate": 3.1445555109320557e-06, + "loss": 0.2877, + "step": 21803 + }, + { + "epoch": 0.63, + "grad_norm": 1.4981380263252702, + "learning_rate": 3.1441193498419252e-06, + "loss": 0.2961, + "step": 21804 + }, + { + "epoch": 0.63, + "grad_norm": 2.809725855816018, + "learning_rate": 3.143683205129962e-06, + "loss": 0.283, + "step": 21805 + }, + { + "epoch": 0.63, + "grad_norm": 1.179234727283166, + "learning_rate": 3.1432470768000166e-06, + "loss": 0.275, + "step": 21806 + }, + { + "epoch": 0.63, + "grad_norm": 1.75293967110599, + "learning_rate": 3.1428109648559346e-06, + "loss": 0.3248, + "step": 21807 + }, + { + "epoch": 0.63, + "grad_norm": 1.1684444721270462, + "learning_rate": 3.1423748693015666e-06, + "loss": 0.2682, + "step": 21808 + }, + { + "epoch": 0.63, + "grad_norm": 1.3801112102385458, + "learning_rate": 3.1419387901407615e-06, + "loss": 0.2783, + "step": 21809 + }, + { + "epoch": 0.63, + "grad_norm": 1.7937358439481539, + "learning_rate": 3.141502727377367e-06, + "loss": 0.3044, + "step": 21810 + }, + { + "epoch": 0.63, + "grad_norm": 2.4031585178050987, + "learning_rate": 3.1410666810152322e-06, + "loss": 0.2898, + "step": 21811 + }, + { + "epoch": 0.63, + "grad_norm": 1.35080369965875, + "learning_rate": 3.1406306510582043e-06, + "loss": 0.2798, + "step": 21812 + }, + { + "epoch": 0.63, + "grad_norm": 1.4510203552183993, + "learning_rate": 3.140194637510131e-06, + "loss": 0.2931, + "step": 21813 + }, + { + "epoch": 0.63, + "grad_norm": 1.235419045629991, + "learning_rate": 3.1397586403748602e-06, + "loss": 0.2841, + "step": 21814 + }, + { + "epoch": 0.63, + "grad_norm": 1.3882810111156998, + "learning_rate": 3.139322659656242e-06, + "loss": 0.2821, + "step": 21815 + }, + { + "epoch": 0.63, + "grad_norm": 1.5203079783710176, + "learning_rate": 3.1388866953581203e-06, + "loss": 0.285, + "step": 21816 + }, + { + "epoch": 0.63, + "grad_norm": 1.3572084234300776, + "learning_rate": 3.138450747484346e-06, + "loss": 0.2838, + "step": 21817 + }, + { + "epoch": 0.63, + "grad_norm": 1.344340121672863, + "learning_rate": 3.1380148160387634e-06, + "loss": 0.2871, + "step": 21818 + }, + { + "epoch": 0.63, + "grad_norm": 1.3338314005629834, + "learning_rate": 3.13757890102522e-06, + "loss": 0.2753, + "step": 21819 + }, + { + "epoch": 0.63, + "grad_norm": 1.6403215137595366, + "learning_rate": 3.1371430024475637e-06, + "loss": 0.2831, + "step": 21820 + }, + { + "epoch": 0.63, + "grad_norm": 1.2787223043319653, + "learning_rate": 3.136707120309642e-06, + "loss": 0.2881, + "step": 21821 + }, + { + "epoch": 0.63, + "grad_norm": 1.2334886020381952, + "learning_rate": 3.1362712546152996e-06, + "loss": 0.2817, + "step": 21822 + }, + { + "epoch": 0.63, + "grad_norm": 1.3491610014974451, + "learning_rate": 3.135835405368384e-06, + "loss": 0.2792, + "step": 21823 + }, + { + "epoch": 0.63, + "grad_norm": 1.6183320410814472, + "learning_rate": 3.135399572572742e-06, + "loss": 0.2831, + "step": 21824 + }, + { + "epoch": 0.63, + "grad_norm": 1.4847515833723655, + "learning_rate": 3.13496375623222e-06, + "loss": 0.2904, + "step": 21825 + }, + { + "epoch": 0.63, + "grad_norm": 1.4253249417831215, + "learning_rate": 3.134527956350663e-06, + "loss": 0.2807, + "step": 21826 + }, + { + "epoch": 0.63, + "grad_norm": 2.584665577085149, + "learning_rate": 3.1340921729319173e-06, + "loss": 0.2971, + "step": 21827 + }, + { + "epoch": 0.63, + "grad_norm": 1.1763382929302006, + "learning_rate": 3.1336564059798285e-06, + "loss": 0.2704, + "step": 21828 + }, + { + "epoch": 0.63, + "grad_norm": 1.5327347791144907, + "learning_rate": 3.133220655498243e-06, + "loss": 0.2766, + "step": 21829 + }, + { + "epoch": 0.63, + "grad_norm": 1.7694660701193223, + "learning_rate": 3.1327849214910065e-06, + "loss": 0.2956, + "step": 21830 + }, + { + "epoch": 0.63, + "grad_norm": 1.3216668879391105, + "learning_rate": 3.1323492039619634e-06, + "loss": 0.2877, + "step": 21831 + }, + { + "epoch": 0.63, + "grad_norm": 1.3010114984518484, + "learning_rate": 3.1319135029149596e-06, + "loss": 0.2824, + "step": 21832 + }, + { + "epoch": 0.63, + "grad_norm": 1.9900009013632325, + "learning_rate": 3.1314778183538385e-06, + "loss": 0.2868, + "step": 21833 + }, + { + "epoch": 0.63, + "grad_norm": 1.2389004795540022, + "learning_rate": 3.131042150282448e-06, + "loss": 0.3026, + "step": 21834 + }, + { + "epoch": 0.63, + "grad_norm": 1.5053083138179502, + "learning_rate": 3.130606498704631e-06, + "loss": 0.3025, + "step": 21835 + }, + { + "epoch": 0.63, + "grad_norm": 1.2585071824738303, + "learning_rate": 3.130170863624232e-06, + "loss": 0.29, + "step": 21836 + }, + { + "epoch": 0.63, + "grad_norm": 1.3732755411911817, + "learning_rate": 3.1297352450450953e-06, + "loss": 0.3163, + "step": 21837 + }, + { + "epoch": 0.63, + "grad_norm": 1.3351335946641705, + "learning_rate": 3.129299642971067e-06, + "loss": 0.2958, + "step": 21838 + }, + { + "epoch": 0.63, + "grad_norm": 1.4217683373986678, + "learning_rate": 3.128864057405989e-06, + "loss": 0.2996, + "step": 21839 + }, + { + "epoch": 0.63, + "grad_norm": 1.7365157454390754, + "learning_rate": 3.1284284883537074e-06, + "loss": 0.3255, + "step": 21840 + }, + { + "epoch": 0.63, + "grad_norm": 1.228698990033029, + "learning_rate": 3.127992935818064e-06, + "loss": 0.3129, + "step": 21841 + }, + { + "epoch": 0.63, + "grad_norm": 1.4032454604109197, + "learning_rate": 3.127557399802905e-06, + "loss": 0.2907, + "step": 21842 + }, + { + "epoch": 0.63, + "grad_norm": 1.2423575138204965, + "learning_rate": 3.127121880312072e-06, + "loss": 0.2868, + "step": 21843 + }, + { + "epoch": 0.63, + "grad_norm": 1.4750529857406334, + "learning_rate": 3.1266863773494092e-06, + "loss": 0.2854, + "step": 21844 + }, + { + "epoch": 0.63, + "grad_norm": 1.877858401764208, + "learning_rate": 3.126250890918762e-06, + "loss": 0.2862, + "step": 21845 + }, + { + "epoch": 0.63, + "grad_norm": 1.2765243177052827, + "learning_rate": 3.1258154210239695e-06, + "loss": 0.2742, + "step": 21846 + }, + { + "epoch": 0.63, + "grad_norm": 1.5442516883794024, + "learning_rate": 3.125379967668876e-06, + "loss": 0.3095, + "step": 21847 + }, + { + "epoch": 0.63, + "grad_norm": 1.2589693393229306, + "learning_rate": 3.124944530857327e-06, + "loss": 0.2759, + "step": 21848 + }, + { + "epoch": 0.63, + "grad_norm": 1.3775242845467948, + "learning_rate": 3.124509110593161e-06, + "loss": 0.2986, + "step": 21849 + }, + { + "epoch": 0.63, + "grad_norm": 1.1991858369715485, + "learning_rate": 3.124073706880224e-06, + "loss": 0.2805, + "step": 21850 + }, + { + "epoch": 0.63, + "grad_norm": 0.9249227837058707, + "learning_rate": 3.1236383197223576e-06, + "loss": 0.5224, + "step": 21851 + }, + { + "epoch": 0.63, + "grad_norm": 1.4001588245153114, + "learning_rate": 3.123202949123403e-06, + "loss": 0.3194, + "step": 21852 + }, + { + "epoch": 0.63, + "grad_norm": 1.3438492585974449, + "learning_rate": 3.122767595087205e-06, + "loss": 0.2949, + "step": 21853 + }, + { + "epoch": 0.63, + "grad_norm": 1.327357705678718, + "learning_rate": 3.122332257617602e-06, + "loss": 0.3081, + "step": 21854 + }, + { + "epoch": 0.63, + "grad_norm": 1.6857666836779417, + "learning_rate": 3.121896936718438e-06, + "loss": 0.2954, + "step": 21855 + }, + { + "epoch": 0.63, + "grad_norm": 1.816144677795486, + "learning_rate": 3.121461632393554e-06, + "loss": 0.2803, + "step": 21856 + }, + { + "epoch": 0.63, + "grad_norm": 1.4422096921357694, + "learning_rate": 3.1210263446467936e-06, + "loss": 0.2967, + "step": 21857 + }, + { + "epoch": 0.63, + "grad_norm": 1.803220893640576, + "learning_rate": 3.1205910734819956e-06, + "loss": 0.286, + "step": 21858 + }, + { + "epoch": 0.63, + "grad_norm": 1.3650458781901664, + "learning_rate": 3.1201558189030017e-06, + "loss": 0.2957, + "step": 21859 + }, + { + "epoch": 0.63, + "grad_norm": 1.5497723012196978, + "learning_rate": 3.119720580913654e-06, + "loss": 0.2911, + "step": 21860 + }, + { + "epoch": 0.63, + "grad_norm": 1.3870244569518906, + "learning_rate": 3.1192853595177937e-06, + "loss": 0.3022, + "step": 21861 + }, + { + "epoch": 0.63, + "grad_norm": 1.4274912271509743, + "learning_rate": 3.1188501547192594e-06, + "loss": 0.306, + "step": 21862 + }, + { + "epoch": 0.63, + "grad_norm": 1.6130060897848462, + "learning_rate": 3.1184149665218942e-06, + "loss": 0.2971, + "step": 21863 + }, + { + "epoch": 0.63, + "grad_norm": 1.2809865009786794, + "learning_rate": 3.1179797949295376e-06, + "loss": 0.2752, + "step": 21864 + }, + { + "epoch": 0.63, + "grad_norm": 1.3054268554204134, + "learning_rate": 3.117544639946031e-06, + "loss": 0.2897, + "step": 21865 + }, + { + "epoch": 0.63, + "grad_norm": 1.2908540546543006, + "learning_rate": 3.1171095015752135e-06, + "loss": 0.2717, + "step": 21866 + }, + { + "epoch": 0.63, + "grad_norm": 1.2652216037713666, + "learning_rate": 3.1166743798209254e-06, + "loss": 0.2834, + "step": 21867 + }, + { + "epoch": 0.63, + "grad_norm": 1.3941430823558445, + "learning_rate": 3.1162392746870063e-06, + "loss": 0.2714, + "step": 21868 + }, + { + "epoch": 0.63, + "grad_norm": 1.2916119300254802, + "learning_rate": 3.1158041861772963e-06, + "loss": 0.3114, + "step": 21869 + }, + { + "epoch": 0.63, + "grad_norm": 1.3933690981099918, + "learning_rate": 3.1153691142956368e-06, + "loss": 0.271, + "step": 21870 + }, + { + "epoch": 0.63, + "grad_norm": 1.3682619571501056, + "learning_rate": 3.1149340590458643e-06, + "loss": 0.2922, + "step": 21871 + }, + { + "epoch": 0.63, + "grad_norm": 1.3323529842091963, + "learning_rate": 3.1144990204318198e-06, + "loss": 0.2831, + "step": 21872 + }, + { + "epoch": 0.63, + "grad_norm": 1.4858029157483383, + "learning_rate": 3.1140639984573428e-06, + "loss": 0.2866, + "step": 21873 + }, + { + "epoch": 0.63, + "grad_norm": 1.447561611787399, + "learning_rate": 3.113628993126273e-06, + "loss": 0.283, + "step": 21874 + }, + { + "epoch": 0.63, + "grad_norm": 1.330347642098791, + "learning_rate": 3.1131940044424476e-06, + "loss": 0.2934, + "step": 21875 + }, + { + "epoch": 0.63, + "grad_norm": 1.239846132004988, + "learning_rate": 3.1127590324097057e-06, + "loss": 0.2867, + "step": 21876 + }, + { + "epoch": 0.63, + "grad_norm": 1.373988599776947, + "learning_rate": 3.1123240770318864e-06, + "loss": 0.2983, + "step": 21877 + }, + { + "epoch": 0.63, + "grad_norm": 1.305504421968537, + "learning_rate": 3.1118891383128275e-06, + "loss": 0.2871, + "step": 21878 + }, + { + "epoch": 0.63, + "grad_norm": 1.2825858707812525, + "learning_rate": 3.1114542162563678e-06, + "loss": 0.3092, + "step": 21879 + }, + { + "epoch": 0.63, + "grad_norm": 1.4286815150102035, + "learning_rate": 3.1110193108663465e-06, + "loss": 0.2948, + "step": 21880 + }, + { + "epoch": 0.63, + "grad_norm": 1.4112283911514845, + "learning_rate": 3.1105844221466e-06, + "loss": 0.2853, + "step": 21881 + }, + { + "epoch": 0.63, + "grad_norm": 1.4219728385242645, + "learning_rate": 3.1101495501009672e-06, + "loss": 0.2891, + "step": 21882 + }, + { + "epoch": 0.63, + "grad_norm": 1.2144091642513593, + "learning_rate": 3.109714694733285e-06, + "loss": 0.2898, + "step": 21883 + }, + { + "epoch": 0.63, + "grad_norm": 1.2590374295246014, + "learning_rate": 3.109279856047393e-06, + "loss": 0.2718, + "step": 21884 + }, + { + "epoch": 0.63, + "grad_norm": 1.2286671514625256, + "learning_rate": 3.1088450340471267e-06, + "loss": 0.2951, + "step": 21885 + }, + { + "epoch": 0.63, + "grad_norm": 1.4519132880059817, + "learning_rate": 3.1084102287363228e-06, + "loss": 0.2717, + "step": 21886 + }, + { + "epoch": 0.63, + "grad_norm": 1.2134278684820141, + "learning_rate": 3.10797544011882e-06, + "loss": 0.2849, + "step": 21887 + }, + { + "epoch": 0.63, + "grad_norm": 1.318801344368793, + "learning_rate": 3.1075406681984556e-06, + "loss": 0.3042, + "step": 21888 + }, + { + "epoch": 0.63, + "grad_norm": 1.2919124271822942, + "learning_rate": 3.107105912979066e-06, + "loss": 0.2929, + "step": 21889 + }, + { + "epoch": 0.63, + "grad_norm": 1.1926480459083486, + "learning_rate": 3.106671174464487e-06, + "loss": 0.3011, + "step": 21890 + }, + { + "epoch": 0.63, + "grad_norm": 1.2529566123741018, + "learning_rate": 3.106236452658556e-06, + "loss": 0.3211, + "step": 21891 + }, + { + "epoch": 0.63, + "grad_norm": 1.2906481892292274, + "learning_rate": 3.105801747565109e-06, + "loss": 0.2796, + "step": 21892 + }, + { + "epoch": 0.64, + "grad_norm": 1.3506753058218492, + "learning_rate": 3.105367059187984e-06, + "loss": 0.2976, + "step": 21893 + }, + { + "epoch": 0.64, + "grad_norm": 1.3295770789766168, + "learning_rate": 3.1049323875310142e-06, + "loss": 0.2925, + "step": 21894 + }, + { + "epoch": 0.64, + "grad_norm": 0.9797175095160735, + "learning_rate": 3.1044977325980378e-06, + "loss": 0.5724, + "step": 21895 + }, + { + "epoch": 0.64, + "grad_norm": 1.2231260991524997, + "learning_rate": 3.1040630943928897e-06, + "loss": 0.2893, + "step": 21896 + }, + { + "epoch": 0.64, + "grad_norm": 1.219710357400537, + "learning_rate": 3.1036284729194066e-06, + "loss": 0.2755, + "step": 21897 + }, + { + "epoch": 0.64, + "grad_norm": 1.2724479862417768, + "learning_rate": 3.103193868181422e-06, + "loss": 0.2696, + "step": 21898 + }, + { + "epoch": 0.64, + "grad_norm": 1.3122262949120869, + "learning_rate": 3.1027592801827735e-06, + "loss": 0.2829, + "step": 21899 + }, + { + "epoch": 0.64, + "grad_norm": 1.4350610039162237, + "learning_rate": 3.1023247089272946e-06, + "loss": 0.2772, + "step": 21900 + }, + { + "epoch": 0.64, + "grad_norm": 1.3744590372480412, + "learning_rate": 3.1018901544188218e-06, + "loss": 0.2931, + "step": 21901 + }, + { + "epoch": 0.64, + "grad_norm": 1.207377692496005, + "learning_rate": 3.1014556166611908e-06, + "loss": 0.2797, + "step": 21902 + }, + { + "epoch": 0.64, + "grad_norm": 1.1999459762881557, + "learning_rate": 3.1010210956582344e-06, + "loss": 0.2661, + "step": 21903 + }, + { + "epoch": 0.64, + "grad_norm": 1.478594828101412, + "learning_rate": 3.1005865914137868e-06, + "loss": 0.2759, + "step": 21904 + }, + { + "epoch": 0.64, + "grad_norm": 1.3463168171967612, + "learning_rate": 3.100152103931684e-06, + "loss": 0.2913, + "step": 21905 + }, + { + "epoch": 0.64, + "grad_norm": 1.1938848852087196, + "learning_rate": 3.0997176332157593e-06, + "loss": 0.2802, + "step": 21906 + }, + { + "epoch": 0.64, + "grad_norm": 1.395624376938516, + "learning_rate": 3.0992831792698475e-06, + "loss": 0.2833, + "step": 21907 + }, + { + "epoch": 0.64, + "grad_norm": 1.2472803686472353, + "learning_rate": 3.0988487420977843e-06, + "loss": 0.2914, + "step": 21908 + }, + { + "epoch": 0.64, + "grad_norm": 1.7459051842208215, + "learning_rate": 3.0984143217034007e-06, + "loss": 0.283, + "step": 21909 + }, + { + "epoch": 0.64, + "grad_norm": 1.30348176320459, + "learning_rate": 3.097979918090532e-06, + "loss": 0.2844, + "step": 21910 + }, + { + "epoch": 0.64, + "grad_norm": 1.627690504002658, + "learning_rate": 3.0975455312630114e-06, + "loss": 0.2819, + "step": 21911 + }, + { + "epoch": 0.64, + "grad_norm": 1.2196937556928404, + "learning_rate": 3.097111161224674e-06, + "loss": 0.2865, + "step": 21912 + }, + { + "epoch": 0.64, + "grad_norm": 1.2299344572579942, + "learning_rate": 3.0966768079793503e-06, + "loss": 0.2743, + "step": 21913 + }, + { + "epoch": 0.64, + "grad_norm": 1.2355742353312142, + "learning_rate": 3.0962424715308746e-06, + "loss": 0.3001, + "step": 21914 + }, + { + "epoch": 0.64, + "grad_norm": 1.3507808567022488, + "learning_rate": 3.0958081518830807e-06, + "loss": 0.2759, + "step": 21915 + }, + { + "epoch": 0.64, + "grad_norm": 1.3000923901872883, + "learning_rate": 3.095373849039802e-06, + "loss": 0.3133, + "step": 21916 + }, + { + "epoch": 0.64, + "grad_norm": 1.5522045360408865, + "learning_rate": 3.0949395630048694e-06, + "loss": 0.2958, + "step": 21917 + }, + { + "epoch": 0.64, + "grad_norm": 1.63641119370051, + "learning_rate": 3.094505293782116e-06, + "loss": 0.3072, + "step": 21918 + }, + { + "epoch": 0.64, + "grad_norm": 1.2663470322458146, + "learning_rate": 3.094071041375375e-06, + "loss": 0.2979, + "step": 21919 + }, + { + "epoch": 0.64, + "grad_norm": 1.2569784567947242, + "learning_rate": 3.093636805788478e-06, + "loss": 0.269, + "step": 21920 + }, + { + "epoch": 0.64, + "grad_norm": 1.4564741176121272, + "learning_rate": 3.0932025870252573e-06, + "loss": 0.2836, + "step": 21921 + }, + { + "epoch": 0.64, + "grad_norm": 1.2270570613197547, + "learning_rate": 3.092768385089545e-06, + "loss": 0.3277, + "step": 21922 + }, + { + "epoch": 0.64, + "grad_norm": 1.2413938614566957, + "learning_rate": 3.092334199985172e-06, + "loss": 0.2828, + "step": 21923 + }, + { + "epoch": 0.64, + "grad_norm": 1.3822472583606915, + "learning_rate": 3.091900031715972e-06, + "loss": 0.2939, + "step": 21924 + }, + { + "epoch": 0.64, + "grad_norm": 1.2759335215697303, + "learning_rate": 3.0914658802857756e-06, + "loss": 0.3099, + "step": 21925 + }, + { + "epoch": 0.64, + "grad_norm": 1.4676045850928074, + "learning_rate": 3.091031745698413e-06, + "loss": 0.2997, + "step": 21926 + }, + { + "epoch": 0.64, + "grad_norm": 1.3448953622734945, + "learning_rate": 3.0905976279577165e-06, + "loss": 0.2822, + "step": 21927 + }, + { + "epoch": 0.64, + "grad_norm": 1.3505513081549705, + "learning_rate": 3.0901635270675174e-06, + "loss": 0.2916, + "step": 21928 + }, + { + "epoch": 0.64, + "grad_norm": 1.248007623871016, + "learning_rate": 3.0897294430316473e-06, + "loss": 0.2983, + "step": 21929 + }, + { + "epoch": 0.64, + "grad_norm": 1.488303392753944, + "learning_rate": 3.0892953758539355e-06, + "loss": 0.2704, + "step": 21930 + }, + { + "epoch": 0.64, + "grad_norm": 1.346533355656585, + "learning_rate": 3.088861325538214e-06, + "loss": 0.2906, + "step": 21931 + }, + { + "epoch": 0.64, + "grad_norm": 1.3419523228054937, + "learning_rate": 3.0884272920883106e-06, + "loss": 0.2924, + "step": 21932 + }, + { + "epoch": 0.64, + "grad_norm": 1.6726012147518703, + "learning_rate": 3.087993275508059e-06, + "loss": 0.289, + "step": 21933 + }, + { + "epoch": 0.64, + "grad_norm": 1.9572662287709077, + "learning_rate": 3.087559275801287e-06, + "loss": 0.3195, + "step": 21934 + }, + { + "epoch": 0.64, + "grad_norm": 1.3220755667943123, + "learning_rate": 3.0871252929718266e-06, + "loss": 0.2829, + "step": 21935 + }, + { + "epoch": 0.64, + "grad_norm": 1.2578293341030178, + "learning_rate": 3.0866913270235056e-06, + "loss": 0.2758, + "step": 21936 + }, + { + "epoch": 0.64, + "grad_norm": 1.1535827428834777, + "learning_rate": 3.0862573779601545e-06, + "loss": 0.2806, + "step": 21937 + }, + { + "epoch": 0.64, + "grad_norm": 1.2460349762461675, + "learning_rate": 3.085823445785604e-06, + "loss": 0.2998, + "step": 21938 + }, + { + "epoch": 0.64, + "grad_norm": 1.1436252067725883, + "learning_rate": 3.085389530503684e-06, + "loss": 0.2767, + "step": 21939 + }, + { + "epoch": 0.64, + "grad_norm": 1.5234606279718328, + "learning_rate": 3.0849556321182213e-06, + "loss": 0.3024, + "step": 21940 + }, + { + "epoch": 0.64, + "grad_norm": 1.3342516552120969, + "learning_rate": 3.084521750633046e-06, + "loss": 0.2992, + "step": 21941 + }, + { + "epoch": 0.64, + "grad_norm": 1.4015578998744267, + "learning_rate": 3.084087886051988e-06, + "loss": 0.2897, + "step": 21942 + }, + { + "epoch": 0.64, + "grad_norm": 1.3825889360928696, + "learning_rate": 3.0836540383788754e-06, + "loss": 0.2806, + "step": 21943 + }, + { + "epoch": 0.64, + "grad_norm": 1.314087881162191, + "learning_rate": 3.0832202076175384e-06, + "loss": 0.297, + "step": 21944 + }, + { + "epoch": 0.64, + "grad_norm": 1.2116584601721208, + "learning_rate": 3.082786393771803e-06, + "loss": 0.303, + "step": 21945 + }, + { + "epoch": 0.64, + "grad_norm": 1.4091142648768735, + "learning_rate": 3.0823525968454993e-06, + "loss": 0.2963, + "step": 21946 + }, + { + "epoch": 0.64, + "grad_norm": 1.2269218347910351, + "learning_rate": 3.0819188168424553e-06, + "loss": 0.2769, + "step": 21947 + }, + { + "epoch": 0.64, + "grad_norm": 2.4189718833423948, + "learning_rate": 3.0814850537665e-06, + "loss": 0.2778, + "step": 21948 + }, + { + "epoch": 0.64, + "grad_norm": 1.418059690923051, + "learning_rate": 3.081051307621459e-06, + "loss": 0.2797, + "step": 21949 + }, + { + "epoch": 0.64, + "grad_norm": 1.261710302398691, + "learning_rate": 3.0806175784111615e-06, + "loss": 0.304, + "step": 21950 + }, + { + "epoch": 0.64, + "grad_norm": 1.4126030625875876, + "learning_rate": 3.080183866139435e-06, + "loss": 0.3025, + "step": 21951 + }, + { + "epoch": 0.64, + "grad_norm": 4.048144320305836, + "learning_rate": 3.0797501708101087e-06, + "loss": 0.2706, + "step": 21952 + }, + { + "epoch": 0.64, + "grad_norm": 1.2476515455096013, + "learning_rate": 3.079316492427007e-06, + "loss": 0.2891, + "step": 21953 + }, + { + "epoch": 0.64, + "grad_norm": 1.4075337840899336, + "learning_rate": 3.078882830993959e-06, + "loss": 0.2878, + "step": 21954 + }, + { + "epoch": 0.64, + "grad_norm": 1.5362565840365132, + "learning_rate": 3.0784491865147914e-06, + "loss": 0.2993, + "step": 21955 + }, + { + "epoch": 0.64, + "grad_norm": 1.5382470180065302, + "learning_rate": 3.0780155589933315e-06, + "loss": 0.2623, + "step": 21956 + }, + { + "epoch": 0.64, + "grad_norm": 1.625466622824643, + "learning_rate": 3.0775819484334045e-06, + "loss": 0.2867, + "step": 21957 + }, + { + "epoch": 0.64, + "grad_norm": 1.2574929546426719, + "learning_rate": 3.077148354838841e-06, + "loss": 0.289, + "step": 21958 + }, + { + "epoch": 0.64, + "grad_norm": 1.2345142804552998, + "learning_rate": 3.076714778213462e-06, + "loss": 0.2655, + "step": 21959 + }, + { + "epoch": 0.64, + "grad_norm": 1.4226240755472976, + "learning_rate": 3.076281218561097e-06, + "loss": 0.3118, + "step": 21960 + }, + { + "epoch": 0.64, + "grad_norm": 1.5480755268660398, + "learning_rate": 3.0758476758855705e-06, + "loss": 0.2931, + "step": 21961 + }, + { + "epoch": 0.64, + "grad_norm": 1.2516767901043822, + "learning_rate": 3.0754141501907104e-06, + "loss": 0.3032, + "step": 21962 + }, + { + "epoch": 0.64, + "grad_norm": 1.5278458528502668, + "learning_rate": 3.074980641480343e-06, + "loss": 0.3204, + "step": 21963 + }, + { + "epoch": 0.64, + "grad_norm": 1.3213726519221725, + "learning_rate": 3.074547149758291e-06, + "loss": 0.2798, + "step": 21964 + }, + { + "epoch": 0.64, + "grad_norm": 1.5313612440062236, + "learning_rate": 3.0741136750283816e-06, + "loss": 0.2926, + "step": 21965 + }, + { + "epoch": 0.64, + "grad_norm": 1.3474025414444788, + "learning_rate": 3.0736802172944414e-06, + "loss": 0.3257, + "step": 21966 + }, + { + "epoch": 0.64, + "grad_norm": 1.4648049276936697, + "learning_rate": 3.0732467765602953e-06, + "loss": 0.274, + "step": 21967 + }, + { + "epoch": 0.64, + "grad_norm": 1.5868423960341278, + "learning_rate": 3.0728133528297664e-06, + "loss": 0.3018, + "step": 21968 + }, + { + "epoch": 0.64, + "grad_norm": 1.2360452029664157, + "learning_rate": 3.0723799461066816e-06, + "loss": 0.2798, + "step": 21969 + }, + { + "epoch": 0.64, + "grad_norm": 1.2260887773620586, + "learning_rate": 3.071946556394865e-06, + "loss": 0.2825, + "step": 21970 + }, + { + "epoch": 0.64, + "grad_norm": 1.7121819622522798, + "learning_rate": 3.071513183698142e-06, + "loss": 0.3095, + "step": 21971 + }, + { + "epoch": 0.64, + "grad_norm": 1.314090040916435, + "learning_rate": 3.0710798280203358e-06, + "loss": 0.2939, + "step": 21972 + }, + { + "epoch": 0.64, + "grad_norm": 2.7499529594501864, + "learning_rate": 3.0706464893652722e-06, + "loss": 0.2812, + "step": 21973 + }, + { + "epoch": 0.64, + "grad_norm": 1.9500345218464377, + "learning_rate": 3.0702131677367737e-06, + "loss": 0.2911, + "step": 21974 + }, + { + "epoch": 0.64, + "grad_norm": 1.3327963761835238, + "learning_rate": 3.0697798631386666e-06, + "loss": 0.3299, + "step": 21975 + }, + { + "epoch": 0.64, + "grad_norm": 1.2554883360387816, + "learning_rate": 3.069346575574773e-06, + "loss": 0.2885, + "step": 21976 + }, + { + "epoch": 0.64, + "grad_norm": 1.442580918170559, + "learning_rate": 3.0689133050489173e-06, + "loss": 0.3105, + "step": 21977 + }, + { + "epoch": 0.64, + "grad_norm": 1.3368991124055767, + "learning_rate": 3.0684800515649228e-06, + "loss": 0.2798, + "step": 21978 + }, + { + "epoch": 0.64, + "grad_norm": 1.5378569755422762, + "learning_rate": 3.068046815126613e-06, + "loss": 0.296, + "step": 21979 + }, + { + "epoch": 0.64, + "grad_norm": 1.3094914456787126, + "learning_rate": 3.0676135957378127e-06, + "loss": 0.2922, + "step": 21980 + }, + { + "epoch": 0.64, + "grad_norm": 1.3565117398675715, + "learning_rate": 3.0671803934023436e-06, + "loss": 0.2975, + "step": 21981 + }, + { + "epoch": 0.64, + "grad_norm": 1.4965361993476691, + "learning_rate": 3.066747208124028e-06, + "loss": 0.2998, + "step": 21982 + }, + { + "epoch": 0.64, + "grad_norm": 1.3367466237048211, + "learning_rate": 3.0663140399066903e-06, + "loss": 0.315, + "step": 21983 + }, + { + "epoch": 0.64, + "grad_norm": 1.3262206215811343, + "learning_rate": 3.0658808887541535e-06, + "loss": 0.2798, + "step": 21984 + }, + { + "epoch": 0.64, + "grad_norm": 1.2451853965289186, + "learning_rate": 3.0654477546702382e-06, + "loss": 0.2902, + "step": 21985 + }, + { + "epoch": 0.64, + "grad_norm": 1.309445453801315, + "learning_rate": 3.0650146376587705e-06, + "loss": 0.3011, + "step": 21986 + }, + { + "epoch": 0.64, + "grad_norm": 0.9518630708650594, + "learning_rate": 3.064581537723568e-06, + "loss": 0.5653, + "step": 21987 + }, + { + "epoch": 0.64, + "grad_norm": 2.082319106539535, + "learning_rate": 3.0641484548684542e-06, + "loss": 0.2763, + "step": 21988 + }, + { + "epoch": 0.64, + "grad_norm": 1.4848097283919874, + "learning_rate": 3.0637153890972515e-06, + "loss": 0.2772, + "step": 21989 + }, + { + "epoch": 0.64, + "grad_norm": 1.2196475577004442, + "learning_rate": 3.0632823404137835e-06, + "loss": 0.2721, + "step": 21990 + }, + { + "epoch": 0.64, + "grad_norm": 1.5369759557247353, + "learning_rate": 3.0628493088218696e-06, + "loss": 0.2668, + "step": 21991 + }, + { + "epoch": 0.64, + "grad_norm": 1.2868601270717253, + "learning_rate": 3.0624162943253324e-06, + "loss": 0.2783, + "step": 21992 + }, + { + "epoch": 0.64, + "grad_norm": 1.2296251359335175, + "learning_rate": 3.0619832969279916e-06, + "loss": 0.297, + "step": 21993 + }, + { + "epoch": 0.64, + "grad_norm": 1.2912994527002197, + "learning_rate": 3.0615503166336715e-06, + "loss": 0.2545, + "step": 21994 + }, + { + "epoch": 0.64, + "grad_norm": 1.262884886714288, + "learning_rate": 3.0611173534461902e-06, + "loss": 0.2966, + "step": 21995 + }, + { + "epoch": 0.64, + "grad_norm": 1.2961819479314982, + "learning_rate": 3.06068440736937e-06, + "loss": 0.2865, + "step": 21996 + }, + { + "epoch": 0.64, + "grad_norm": 1.2025972259543607, + "learning_rate": 3.0602514784070312e-06, + "loss": 0.2739, + "step": 21997 + }, + { + "epoch": 0.64, + "grad_norm": 1.2598241467697033, + "learning_rate": 3.059818566562994e-06, + "loss": 0.2992, + "step": 21998 + }, + { + "epoch": 0.64, + "grad_norm": 1.3325331249807155, + "learning_rate": 3.059385671841082e-06, + "loss": 0.274, + "step": 21999 + }, + { + "epoch": 0.64, + "grad_norm": 1.3633172833774005, + "learning_rate": 3.058952794245111e-06, + "loss": 0.2815, + "step": 22000 + }, + { + "epoch": 0.64, + "grad_norm": 1.233441129699162, + "learning_rate": 3.058519933778903e-06, + "loss": 0.2857, + "step": 22001 + }, + { + "epoch": 0.64, + "grad_norm": 1.3303890047520575, + "learning_rate": 3.0580870904462788e-06, + "loss": 0.3017, + "step": 22002 + }, + { + "epoch": 0.64, + "grad_norm": 1.3950491050709286, + "learning_rate": 3.0576542642510577e-06, + "loss": 0.2853, + "step": 22003 + }, + { + "epoch": 0.64, + "grad_norm": 4.743040728408528, + "learning_rate": 3.057221455197059e-06, + "loss": 0.3044, + "step": 22004 + }, + { + "epoch": 0.64, + "grad_norm": 1.4680831400935261, + "learning_rate": 3.0567886632881023e-06, + "loss": 0.2839, + "step": 22005 + }, + { + "epoch": 0.64, + "grad_norm": 1.4023907593989056, + "learning_rate": 3.056355888528007e-06, + "loss": 0.302, + "step": 22006 + }, + { + "epoch": 0.64, + "grad_norm": 1.3237662402936776, + "learning_rate": 3.055923130920594e-06, + "loss": 0.3018, + "step": 22007 + }, + { + "epoch": 0.64, + "grad_norm": 1.3181546242407074, + "learning_rate": 3.0554903904696792e-06, + "loss": 0.2843, + "step": 22008 + }, + { + "epoch": 0.64, + "grad_norm": 2.9219075927044256, + "learning_rate": 3.055057667179083e-06, + "loss": 0.2635, + "step": 22009 + }, + { + "epoch": 0.64, + "grad_norm": 1.2354619709666148, + "learning_rate": 3.054624961052626e-06, + "loss": 0.2788, + "step": 22010 + }, + { + "epoch": 0.64, + "grad_norm": 1.3084324131221348, + "learning_rate": 3.054192272094125e-06, + "loss": 0.2958, + "step": 22011 + }, + { + "epoch": 0.64, + "grad_norm": 1.3696744483809908, + "learning_rate": 3.0537596003073987e-06, + "loss": 0.2884, + "step": 22012 + }, + { + "epoch": 0.64, + "grad_norm": 1.3551353066515173, + "learning_rate": 3.0533269456962643e-06, + "loss": 0.2993, + "step": 22013 + }, + { + "epoch": 0.64, + "grad_norm": 1.3618969892852835, + "learning_rate": 3.0528943082645446e-06, + "loss": 0.2859, + "step": 22014 + }, + { + "epoch": 0.64, + "grad_norm": 1.6742101837965928, + "learning_rate": 3.0524616880160516e-06, + "loss": 0.2671, + "step": 22015 + }, + { + "epoch": 0.64, + "grad_norm": 1.4066970680033388, + "learning_rate": 3.052029084954605e-06, + "loss": 0.2867, + "step": 22016 + }, + { + "epoch": 0.64, + "grad_norm": 1.8791243562040962, + "learning_rate": 3.051596499084026e-06, + "loss": 0.2863, + "step": 22017 + }, + { + "epoch": 0.64, + "grad_norm": 1.213195320872055, + "learning_rate": 3.0511639304081275e-06, + "loss": 0.302, + "step": 22018 + }, + { + "epoch": 0.64, + "grad_norm": 1.4491828264993074, + "learning_rate": 3.050731378930729e-06, + "loss": 0.2701, + "step": 22019 + }, + { + "epoch": 0.64, + "grad_norm": 1.2179105978483316, + "learning_rate": 3.0502988446556474e-06, + "loss": 0.2696, + "step": 22020 + }, + { + "epoch": 0.64, + "grad_norm": 1.4930418460098482, + "learning_rate": 3.049866327586701e-06, + "loss": 0.2938, + "step": 22021 + }, + { + "epoch": 0.64, + "grad_norm": 1.3005419596576966, + "learning_rate": 3.049433827727706e-06, + "loss": 0.2813, + "step": 22022 + }, + { + "epoch": 0.64, + "grad_norm": 1.2975099307732512, + "learning_rate": 3.0490013450824786e-06, + "loss": 0.2975, + "step": 22023 + }, + { + "epoch": 0.64, + "grad_norm": 1.6443754137011448, + "learning_rate": 3.048568879654836e-06, + "loss": 0.3065, + "step": 22024 + }, + { + "epoch": 0.64, + "grad_norm": 1.5369130516788219, + "learning_rate": 3.048136431448594e-06, + "loss": 0.2886, + "step": 22025 + }, + { + "epoch": 0.64, + "grad_norm": 1.2402339064329206, + "learning_rate": 3.0477040004675717e-06, + "loss": 0.2875, + "step": 22026 + }, + { + "epoch": 0.64, + "grad_norm": 1.4843271041593291, + "learning_rate": 3.0472715867155816e-06, + "loss": 0.2729, + "step": 22027 + }, + { + "epoch": 0.64, + "grad_norm": 1.2645163301303188, + "learning_rate": 3.046839190196441e-06, + "loss": 0.282, + "step": 22028 + }, + { + "epoch": 0.64, + "grad_norm": 1.2420173618310952, + "learning_rate": 3.0464068109139673e-06, + "loss": 0.2914, + "step": 22029 + }, + { + "epoch": 0.64, + "grad_norm": 1.2459189424906165, + "learning_rate": 3.0459744488719757e-06, + "loss": 0.2815, + "step": 22030 + }, + { + "epoch": 0.64, + "grad_norm": 1.2031566080585534, + "learning_rate": 3.0455421040742807e-06, + "loss": 0.2807, + "step": 22031 + }, + { + "epoch": 0.64, + "grad_norm": 1.255771412984364, + "learning_rate": 3.0451097765246973e-06, + "loss": 0.2812, + "step": 22032 + }, + { + "epoch": 0.64, + "grad_norm": 1.2972817337039115, + "learning_rate": 3.044677466227042e-06, + "loss": 0.2996, + "step": 22033 + }, + { + "epoch": 0.64, + "grad_norm": 1.4793657354772445, + "learning_rate": 3.0442451731851304e-06, + "loss": 0.3122, + "step": 22034 + }, + { + "epoch": 0.64, + "grad_norm": 1.3945894817773141, + "learning_rate": 3.0438128974027774e-06, + "loss": 0.3017, + "step": 22035 + }, + { + "epoch": 0.64, + "grad_norm": 1.278879804567381, + "learning_rate": 3.043380638883796e-06, + "loss": 0.303, + "step": 22036 + }, + { + "epoch": 0.64, + "grad_norm": 1.210654399885652, + "learning_rate": 3.042948397632003e-06, + "loss": 0.2917, + "step": 22037 + }, + { + "epoch": 0.64, + "grad_norm": 1.350328137493836, + "learning_rate": 3.0425161736512114e-06, + "loss": 0.3088, + "step": 22038 + }, + { + "epoch": 0.64, + "grad_norm": 1.2217984755767541, + "learning_rate": 3.042083966945238e-06, + "loss": 0.2926, + "step": 22039 + }, + { + "epoch": 0.64, + "grad_norm": 1.3185462490225692, + "learning_rate": 3.0416517775178932e-06, + "loss": 0.3004, + "step": 22040 + }, + { + "epoch": 0.64, + "grad_norm": 1.3324450379430162, + "learning_rate": 3.0412196053729944e-06, + "loss": 0.2857, + "step": 22041 + }, + { + "epoch": 0.64, + "grad_norm": 1.2541245302083528, + "learning_rate": 3.0407874505143554e-06, + "loss": 0.2768, + "step": 22042 + }, + { + "epoch": 0.64, + "grad_norm": 1.335661767843341, + "learning_rate": 3.0403553129457875e-06, + "loss": 0.3049, + "step": 22043 + }, + { + "epoch": 0.64, + "grad_norm": 3.7747947802315207, + "learning_rate": 3.0399231926711052e-06, + "loss": 0.309, + "step": 22044 + }, + { + "epoch": 0.64, + "grad_norm": 1.3177791409316753, + "learning_rate": 3.0394910896941245e-06, + "loss": 0.2837, + "step": 22045 + }, + { + "epoch": 0.64, + "grad_norm": 1.290526603937613, + "learning_rate": 3.039059004018654e-06, + "loss": 0.2956, + "step": 22046 + }, + { + "epoch": 0.64, + "grad_norm": 1.612171034731407, + "learning_rate": 3.038626935648511e-06, + "loss": 0.3015, + "step": 22047 + }, + { + "epoch": 0.64, + "grad_norm": 1.374250495710844, + "learning_rate": 3.0381948845875063e-06, + "loss": 0.2824, + "step": 22048 + }, + { + "epoch": 0.64, + "grad_norm": 1.4566241956308708, + "learning_rate": 3.0377628508394546e-06, + "loss": 0.3126, + "step": 22049 + }, + { + "epoch": 0.64, + "grad_norm": 1.3569501206871524, + "learning_rate": 3.0373308344081665e-06, + "loss": 0.2794, + "step": 22050 + }, + { + "epoch": 0.64, + "grad_norm": 1.2194896784040659, + "learning_rate": 3.036898835297456e-06, + "loss": 0.2704, + "step": 22051 + }, + { + "epoch": 0.64, + "grad_norm": 1.3630220127037878, + "learning_rate": 3.036466853511135e-06, + "loss": 0.2576, + "step": 22052 + }, + { + "epoch": 0.64, + "grad_norm": 1.4288772498202347, + "learning_rate": 3.0360348890530163e-06, + "loss": 0.2803, + "step": 22053 + }, + { + "epoch": 0.64, + "grad_norm": 2.253551731896847, + "learning_rate": 3.0356029419269105e-06, + "loss": 0.3034, + "step": 22054 + }, + { + "epoch": 0.64, + "grad_norm": 1.195095838710004, + "learning_rate": 3.0351710121366307e-06, + "loss": 0.2857, + "step": 22055 + }, + { + "epoch": 0.64, + "grad_norm": 1.3172805194706312, + "learning_rate": 3.0347390996859883e-06, + "loss": 0.2816, + "step": 22056 + }, + { + "epoch": 0.64, + "grad_norm": 1.3434029523560376, + "learning_rate": 3.0343072045787956e-06, + "loss": 0.2832, + "step": 22057 + }, + { + "epoch": 0.64, + "grad_norm": 1.4084178065302229, + "learning_rate": 3.033875326818865e-06, + "loss": 0.3021, + "step": 22058 + }, + { + "epoch": 0.64, + "grad_norm": 1.2311026386448594, + "learning_rate": 3.033443466410004e-06, + "loss": 0.2694, + "step": 22059 + }, + { + "epoch": 0.64, + "grad_norm": 1.5220504419321819, + "learning_rate": 3.0330116233560274e-06, + "loss": 0.2779, + "step": 22060 + }, + { + "epoch": 0.64, + "grad_norm": 1.3141273082533518, + "learning_rate": 3.0325797976607453e-06, + "loss": 0.2874, + "step": 22061 + }, + { + "epoch": 0.64, + "grad_norm": 1.237573346521645, + "learning_rate": 3.0321479893279683e-06, + "loss": 0.2927, + "step": 22062 + }, + { + "epoch": 0.64, + "grad_norm": 1.3304413610860646, + "learning_rate": 3.0317161983615063e-06, + "loss": 0.2911, + "step": 22063 + }, + { + "epoch": 0.64, + "grad_norm": 2.1205459564228897, + "learning_rate": 3.0312844247651714e-06, + "loss": 0.3028, + "step": 22064 + }, + { + "epoch": 0.64, + "grad_norm": 1.002194351285664, + "learning_rate": 3.030852668542773e-06, + "loss": 0.5513, + "step": 22065 + }, + { + "epoch": 0.64, + "grad_norm": 1.4824836692176362, + "learning_rate": 3.0304209296981223e-06, + "loss": 0.2689, + "step": 22066 + }, + { + "epoch": 0.64, + "grad_norm": 1.4803735934406206, + "learning_rate": 3.029989208235028e-06, + "loss": 0.308, + "step": 22067 + }, + { + "epoch": 0.64, + "grad_norm": 1.3284284964368385, + "learning_rate": 3.0295575041573006e-06, + "loss": 0.298, + "step": 22068 + }, + { + "epoch": 0.64, + "grad_norm": 2.0141866181069554, + "learning_rate": 3.02912581746875e-06, + "loss": 0.281, + "step": 22069 + }, + { + "epoch": 0.64, + "grad_norm": 4.5741033513784926, + "learning_rate": 3.0286941481731884e-06, + "loss": 0.2999, + "step": 22070 + }, + { + "epoch": 0.64, + "grad_norm": 1.6050586747790834, + "learning_rate": 3.0282624962744207e-06, + "loss": 0.3119, + "step": 22071 + }, + { + "epoch": 0.64, + "grad_norm": 1.4205727640854702, + "learning_rate": 3.027830861776259e-06, + "loss": 0.3196, + "step": 22072 + }, + { + "epoch": 0.64, + "grad_norm": 1.5389714869801796, + "learning_rate": 3.0273992446825114e-06, + "loss": 0.2968, + "step": 22073 + }, + { + "epoch": 0.64, + "grad_norm": 1.7158632177559157, + "learning_rate": 3.0269676449969865e-06, + "loss": 0.3065, + "step": 22074 + }, + { + "epoch": 0.64, + "grad_norm": 1.2653876560190847, + "learning_rate": 3.0265360627234948e-06, + "loss": 0.3025, + "step": 22075 + }, + { + "epoch": 0.64, + "grad_norm": 1.3440146534974196, + "learning_rate": 3.0261044978658437e-06, + "loss": 0.2854, + "step": 22076 + }, + { + "epoch": 0.64, + "grad_norm": 1.19103744704202, + "learning_rate": 3.0256729504278436e-06, + "loss": 0.3056, + "step": 22077 + }, + { + "epoch": 0.64, + "grad_norm": 1.6851730073049844, + "learning_rate": 3.0252414204133006e-06, + "loss": 0.2898, + "step": 22078 + }, + { + "epoch": 0.64, + "grad_norm": 1.299900043535516, + "learning_rate": 3.024809907826024e-06, + "loss": 0.2778, + "step": 22079 + }, + { + "epoch": 0.64, + "grad_norm": 1.2994415791565894, + "learning_rate": 3.0243784126698216e-06, + "loss": 0.2705, + "step": 22080 + }, + { + "epoch": 0.64, + "grad_norm": 1.2218632567349057, + "learning_rate": 3.023946934948503e-06, + "loss": 0.2784, + "step": 22081 + }, + { + "epoch": 0.64, + "grad_norm": 1.5193624521409073, + "learning_rate": 3.0235154746658733e-06, + "loss": 0.3106, + "step": 22082 + }, + { + "epoch": 0.64, + "grad_norm": 1.2805560289621174, + "learning_rate": 3.0230840318257416e-06, + "loss": 0.2819, + "step": 22083 + }, + { + "epoch": 0.64, + "grad_norm": 1.3995715856165716, + "learning_rate": 3.022652606431915e-06, + "loss": 0.2803, + "step": 22084 + }, + { + "epoch": 0.64, + "grad_norm": 1.47040485490917, + "learning_rate": 3.0222211984882028e-06, + "loss": 0.2898, + "step": 22085 + }, + { + "epoch": 0.64, + "grad_norm": 0.9832982401569269, + "learning_rate": 3.0217898079984087e-06, + "loss": 0.5906, + "step": 22086 + }, + { + "epoch": 0.64, + "grad_norm": 1.1772068883644686, + "learning_rate": 3.0213584349663424e-06, + "loss": 0.2819, + "step": 22087 + }, + { + "epoch": 0.64, + "grad_norm": 1.595322036200694, + "learning_rate": 3.020927079395809e-06, + "loss": 0.2788, + "step": 22088 + }, + { + "epoch": 0.64, + "grad_norm": 1.3817208958715563, + "learning_rate": 3.0204957412906173e-06, + "loss": 0.3041, + "step": 22089 + }, + { + "epoch": 0.64, + "grad_norm": 1.275075937697333, + "learning_rate": 3.0200644206545716e-06, + "loss": 0.2842, + "step": 22090 + }, + { + "epoch": 0.64, + "grad_norm": 0.9506475646256795, + "learning_rate": 3.0196331174914795e-06, + "loss": 0.6419, + "step": 22091 + }, + { + "epoch": 0.64, + "grad_norm": 1.323706222842068, + "learning_rate": 3.019201831805147e-06, + "loss": 0.2987, + "step": 22092 + }, + { + "epoch": 0.64, + "grad_norm": 1.2628572916742802, + "learning_rate": 3.0187705635993803e-06, + "loss": 0.2821, + "step": 22093 + }, + { + "epoch": 0.64, + "grad_norm": 1.792932006812025, + "learning_rate": 3.0183393128779863e-06, + "loss": 0.2993, + "step": 22094 + }, + { + "epoch": 0.64, + "grad_norm": 1.2184086493392472, + "learning_rate": 3.0179080796447692e-06, + "loss": 0.2592, + "step": 22095 + }, + { + "epoch": 0.64, + "grad_norm": 1.2366429632599243, + "learning_rate": 3.0174768639035346e-06, + "loss": 0.289, + "step": 22096 + }, + { + "epoch": 0.64, + "grad_norm": 1.2217921843078086, + "learning_rate": 3.017045665658089e-06, + "loss": 0.2971, + "step": 22097 + }, + { + "epoch": 0.64, + "grad_norm": 1.2958815085250612, + "learning_rate": 3.01661448491224e-06, + "loss": 0.2701, + "step": 22098 + }, + { + "epoch": 0.64, + "grad_norm": 1.4624417757112056, + "learning_rate": 3.016183321669788e-06, + "loss": 0.325, + "step": 22099 + }, + { + "epoch": 0.64, + "grad_norm": 1.2192038000012921, + "learning_rate": 3.0157521759345406e-06, + "loss": 0.2667, + "step": 22100 + }, + { + "epoch": 0.64, + "grad_norm": 1.2239369382813645, + "learning_rate": 3.015321047710301e-06, + "loss": 0.2715, + "step": 22101 + }, + { + "epoch": 0.64, + "grad_norm": 1.6208845580726263, + "learning_rate": 3.014889937000876e-06, + "loss": 0.2642, + "step": 22102 + }, + { + "epoch": 0.64, + "grad_norm": 1.2530372061009287, + "learning_rate": 3.0144588438100693e-06, + "loss": 0.2817, + "step": 22103 + }, + { + "epoch": 0.64, + "grad_norm": 1.342366816424942, + "learning_rate": 3.0140277681416864e-06, + "loss": 0.2929, + "step": 22104 + }, + { + "epoch": 0.64, + "grad_norm": 1.4377383096405592, + "learning_rate": 3.013596709999529e-06, + "loss": 0.2991, + "step": 22105 + }, + { + "epoch": 0.64, + "grad_norm": 1.3927720645074204, + "learning_rate": 3.0131656693874025e-06, + "loss": 0.2992, + "step": 22106 + }, + { + "epoch": 0.64, + "grad_norm": 1.4907667946603824, + "learning_rate": 3.012734646309111e-06, + "loss": 0.2993, + "step": 22107 + }, + { + "epoch": 0.64, + "grad_norm": 1.4716412859444712, + "learning_rate": 3.01230364076846e-06, + "loss": 0.2895, + "step": 22108 + }, + { + "epoch": 0.64, + "grad_norm": 1.413904328576822, + "learning_rate": 3.0118726527692497e-06, + "loss": 0.2757, + "step": 22109 + }, + { + "epoch": 0.64, + "grad_norm": 1.6622170682309092, + "learning_rate": 3.011441682315286e-06, + "loss": 0.285, + "step": 22110 + }, + { + "epoch": 0.64, + "grad_norm": 0.9063919634986374, + "learning_rate": 3.011010729410371e-06, + "loss": 0.5707, + "step": 22111 + }, + { + "epoch": 0.64, + "grad_norm": 1.6719095697446351, + "learning_rate": 3.0105797940583083e-06, + "loss": 0.3082, + "step": 22112 + }, + { + "epoch": 0.64, + "grad_norm": 1.4490923796077584, + "learning_rate": 3.010148876262902e-06, + "loss": 0.2939, + "step": 22113 + }, + { + "epoch": 0.64, + "grad_norm": 1.4617481435646351, + "learning_rate": 3.009717976027954e-06, + "loss": 0.2948, + "step": 22114 + }, + { + "epoch": 0.64, + "grad_norm": 1.3665397048875119, + "learning_rate": 3.0092870933572656e-06, + "loss": 0.3023, + "step": 22115 + }, + { + "epoch": 0.64, + "grad_norm": 0.8691029432711241, + "learning_rate": 3.0088562282546407e-06, + "loss": 0.5428, + "step": 22116 + }, + { + "epoch": 0.64, + "grad_norm": 1.2171705531620431, + "learning_rate": 3.0084253807238837e-06, + "loss": 0.2724, + "step": 22117 + }, + { + "epoch": 0.64, + "grad_norm": 1.3398332004021403, + "learning_rate": 3.007994550768793e-06, + "loss": 0.2835, + "step": 22118 + }, + { + "epoch": 0.64, + "grad_norm": 1.5486163978540781, + "learning_rate": 3.0075637383931726e-06, + "loss": 0.3089, + "step": 22119 + }, + { + "epoch": 0.64, + "grad_norm": 1.432085260878728, + "learning_rate": 3.0071329436008243e-06, + "loss": 0.2834, + "step": 22120 + }, + { + "epoch": 0.64, + "grad_norm": 1.3856842897071682, + "learning_rate": 3.0067021663955516e-06, + "loss": 0.297, + "step": 22121 + }, + { + "epoch": 0.64, + "grad_norm": 0.9000960517155095, + "learning_rate": 3.0062714067811526e-06, + "loss": 0.54, + "step": 22122 + }, + { + "epoch": 0.64, + "grad_norm": 0.9952844063897772, + "learning_rate": 3.0058406647614307e-06, + "loss": 0.5699, + "step": 22123 + }, + { + "epoch": 0.64, + "grad_norm": 1.2626716927670445, + "learning_rate": 3.005409940340187e-06, + "loss": 0.2895, + "step": 22124 + }, + { + "epoch": 0.64, + "grad_norm": 1.19540580477752, + "learning_rate": 3.0049792335212242e-06, + "loss": 0.2827, + "step": 22125 + }, + { + "epoch": 0.64, + "grad_norm": 1.3784042566797898, + "learning_rate": 3.0045485443083413e-06, + "loss": 0.3087, + "step": 22126 + }, + { + "epoch": 0.64, + "grad_norm": 1.2794815627346534, + "learning_rate": 3.0041178727053398e-06, + "loss": 0.2987, + "step": 22127 + }, + { + "epoch": 0.64, + "grad_norm": 1.2845274156883382, + "learning_rate": 3.003687218716019e-06, + "loss": 0.2908, + "step": 22128 + }, + { + "epoch": 0.64, + "grad_norm": 1.5087724402219025, + "learning_rate": 3.0032565823441808e-06, + "loss": 0.3031, + "step": 22129 + }, + { + "epoch": 0.64, + "grad_norm": 1.273515977598066, + "learning_rate": 3.0028259635936254e-06, + "loss": 0.2973, + "step": 22130 + }, + { + "epoch": 0.64, + "grad_norm": 1.4228159698439167, + "learning_rate": 3.0023953624681536e-06, + "loss": 0.2868, + "step": 22131 + }, + { + "epoch": 0.64, + "grad_norm": 1.4496124651495992, + "learning_rate": 3.001964778971565e-06, + "loss": 0.294, + "step": 22132 + }, + { + "epoch": 0.64, + "grad_norm": 1.4422328743570783, + "learning_rate": 3.0015342131076586e-06, + "loss": 0.2741, + "step": 22133 + }, + { + "epoch": 0.64, + "grad_norm": 1.3547039694112972, + "learning_rate": 3.001103664880235e-06, + "loss": 0.2977, + "step": 22134 + }, + { + "epoch": 0.64, + "grad_norm": 1.323489541743394, + "learning_rate": 3.000673134293094e-06, + "loss": 0.285, + "step": 22135 + }, + { + "epoch": 0.64, + "grad_norm": 1.3441198731946478, + "learning_rate": 3.000242621350036e-06, + "loss": 0.2801, + "step": 22136 + }, + { + "epoch": 0.64, + "grad_norm": 1.585332266793936, + "learning_rate": 2.9998121260548575e-06, + "loss": 0.2789, + "step": 22137 + }, + { + "epoch": 0.64, + "grad_norm": 1.2671821455163823, + "learning_rate": 2.999381648411359e-06, + "loss": 0.2844, + "step": 22138 + }, + { + "epoch": 0.64, + "grad_norm": 1.455669098510183, + "learning_rate": 2.9989511884233403e-06, + "loss": 0.2971, + "step": 22139 + }, + { + "epoch": 0.64, + "grad_norm": 1.5224938026102404, + "learning_rate": 2.9985207460946e-06, + "loss": 0.2775, + "step": 22140 + }, + { + "epoch": 0.64, + "grad_norm": 1.274790738559613, + "learning_rate": 2.9980903214289354e-06, + "loss": 0.309, + "step": 22141 + }, + { + "epoch": 0.64, + "grad_norm": 1.9524154373997629, + "learning_rate": 2.997659914430146e-06, + "loss": 0.2858, + "step": 22142 + }, + { + "epoch": 0.64, + "grad_norm": 1.3818795443390306, + "learning_rate": 2.99722952510203e-06, + "loss": 0.2877, + "step": 22143 + }, + { + "epoch": 0.64, + "grad_norm": 1.2331871375401977, + "learning_rate": 2.9967991534483863e-06, + "loss": 0.2829, + "step": 22144 + }, + { + "epoch": 0.64, + "grad_norm": 1.31063585567402, + "learning_rate": 2.9963687994730122e-06, + "loss": 0.2894, + "step": 22145 + }, + { + "epoch": 0.64, + "grad_norm": 1.4292246799137773, + "learning_rate": 2.995938463179705e-06, + "loss": 0.2725, + "step": 22146 + }, + { + "epoch": 0.64, + "grad_norm": 1.3357618163605625, + "learning_rate": 2.9955081445722634e-06, + "loss": 0.2831, + "step": 22147 + }, + { + "epoch": 0.64, + "grad_norm": 1.2394893785140602, + "learning_rate": 2.9950778436544847e-06, + "loss": 0.2873, + "step": 22148 + }, + { + "epoch": 0.64, + "grad_norm": 1.4034937354127492, + "learning_rate": 2.994647560430167e-06, + "loss": 0.2802, + "step": 22149 + }, + { + "epoch": 0.64, + "grad_norm": 1.3535935118058366, + "learning_rate": 2.994217294903107e-06, + "loss": 0.3233, + "step": 22150 + }, + { + "epoch": 0.64, + "grad_norm": 1.322242883203165, + "learning_rate": 2.9937870470771e-06, + "loss": 0.2775, + "step": 22151 + }, + { + "epoch": 0.64, + "grad_norm": 1.3669633472335097, + "learning_rate": 2.9933568169559453e-06, + "loss": 0.2947, + "step": 22152 + }, + { + "epoch": 0.64, + "grad_norm": 1.5690801464404585, + "learning_rate": 2.9929266045434395e-06, + "loss": 0.2762, + "step": 22153 + }, + { + "epoch": 0.64, + "grad_norm": 1.3053751121727455, + "learning_rate": 2.9924964098433786e-06, + "loss": 0.2826, + "step": 22154 + }, + { + "epoch": 0.64, + "grad_norm": 1.4087894620383496, + "learning_rate": 2.99206623285956e-06, + "loss": 0.2906, + "step": 22155 + }, + { + "epoch": 0.64, + "grad_norm": 8.880460260612285, + "learning_rate": 2.9916360735957777e-06, + "loss": 0.3265, + "step": 22156 + }, + { + "epoch": 0.64, + "grad_norm": 1.316437954577682, + "learning_rate": 2.9912059320558294e-06, + "loss": 0.2779, + "step": 22157 + }, + { + "epoch": 0.64, + "grad_norm": 1.2395136718549506, + "learning_rate": 2.990775808243511e-06, + "loss": 0.2838, + "step": 22158 + }, + { + "epoch": 0.64, + "grad_norm": 1.370894679525223, + "learning_rate": 2.9903457021626193e-06, + "loss": 0.3048, + "step": 22159 + }, + { + "epoch": 0.64, + "grad_norm": 1.2778701358678548, + "learning_rate": 2.989915613816948e-06, + "loss": 0.3055, + "step": 22160 + }, + { + "epoch": 0.64, + "grad_norm": 1.4709125401848373, + "learning_rate": 2.9894855432102942e-06, + "loss": 0.2735, + "step": 22161 + }, + { + "epoch": 0.64, + "grad_norm": 1.5403033961242223, + "learning_rate": 2.9890554903464526e-06, + "loss": 0.2803, + "step": 22162 + }, + { + "epoch": 0.64, + "grad_norm": 1.4575452683558743, + "learning_rate": 2.988625455229219e-06, + "loss": 0.299, + "step": 22163 + }, + { + "epoch": 0.64, + "grad_norm": 1.8620003737065554, + "learning_rate": 2.988195437862388e-06, + "loss": 0.2995, + "step": 22164 + }, + { + "epoch": 0.64, + "grad_norm": 1.2508910740400783, + "learning_rate": 2.987765438249754e-06, + "loss": 0.283, + "step": 22165 + }, + { + "epoch": 0.64, + "grad_norm": 1.2429583536080602, + "learning_rate": 2.9873354563951125e-06, + "loss": 0.2793, + "step": 22166 + }, + { + "epoch": 0.64, + "grad_norm": 1.2013003343238005, + "learning_rate": 2.986905492302258e-06, + "loss": 0.2733, + "step": 22167 + }, + { + "epoch": 0.64, + "grad_norm": 1.3135121135222911, + "learning_rate": 2.9864755459749856e-06, + "loss": 0.2867, + "step": 22168 + }, + { + "epoch": 0.64, + "grad_norm": 1.5453317218501819, + "learning_rate": 2.9860456174170884e-06, + "loss": 0.2964, + "step": 22169 + }, + { + "epoch": 0.64, + "grad_norm": 1.7730373191860485, + "learning_rate": 2.98561570663236e-06, + "loss": 0.292, + "step": 22170 + }, + { + "epoch": 0.64, + "grad_norm": 1.350910855500896, + "learning_rate": 2.985185813624596e-06, + "loss": 0.3109, + "step": 22171 + }, + { + "epoch": 0.64, + "grad_norm": 1.2886863673725246, + "learning_rate": 2.98475593839759e-06, + "loss": 0.2829, + "step": 22172 + }, + { + "epoch": 0.64, + "grad_norm": 1.3778091781699429, + "learning_rate": 2.9843260809551344e-06, + "loss": 0.3237, + "step": 22173 + }, + { + "epoch": 0.64, + "grad_norm": 1.2954469789362015, + "learning_rate": 2.9838962413010242e-06, + "loss": 0.2968, + "step": 22174 + }, + { + "epoch": 0.64, + "grad_norm": 1.515022679211732, + "learning_rate": 2.9834664194390516e-06, + "loss": 0.2815, + "step": 22175 + }, + { + "epoch": 0.64, + "grad_norm": 1.2509754811837022, + "learning_rate": 2.9830366153730105e-06, + "loss": 0.2732, + "step": 22176 + }, + { + "epoch": 0.64, + "grad_norm": 1.2849261404015029, + "learning_rate": 2.9826068291066933e-06, + "loss": 0.2859, + "step": 22177 + }, + { + "epoch": 0.64, + "grad_norm": 1.299887281085765, + "learning_rate": 2.982177060643894e-06, + "loss": 0.2865, + "step": 22178 + }, + { + "epoch": 0.64, + "grad_norm": 2.5831541256555797, + "learning_rate": 2.981747309988403e-06, + "loss": 0.282, + "step": 22179 + }, + { + "epoch": 0.64, + "grad_norm": 1.2994368605679725, + "learning_rate": 2.9813175771440166e-06, + "loss": 0.269, + "step": 22180 + }, + { + "epoch": 0.64, + "grad_norm": 1.2803090282699692, + "learning_rate": 2.980887862114523e-06, + "loss": 0.2913, + "step": 22181 + }, + { + "epoch": 0.64, + "grad_norm": 1.274358775780585, + "learning_rate": 2.980458164903717e-06, + "loss": 0.2812, + "step": 22182 + }, + { + "epoch": 0.64, + "grad_norm": 1.4416514717735116, + "learning_rate": 2.9800284855153926e-06, + "loss": 0.2706, + "step": 22183 + }, + { + "epoch": 0.64, + "grad_norm": 1.2350677389166282, + "learning_rate": 2.9795988239533363e-06, + "loss": 0.2861, + "step": 22184 + }, + { + "epoch": 0.64, + "grad_norm": 1.4042396077074497, + "learning_rate": 2.979169180221343e-06, + "loss": 0.2927, + "step": 22185 + }, + { + "epoch": 0.64, + "grad_norm": 2.1931032850299648, + "learning_rate": 2.9787395543232054e-06, + "loss": 0.2728, + "step": 22186 + }, + { + "epoch": 0.64, + "grad_norm": 1.2519753654742924, + "learning_rate": 2.978309946262713e-06, + "loss": 0.2829, + "step": 22187 + }, + { + "epoch": 0.64, + "grad_norm": 1.3709281447709571, + "learning_rate": 2.9778803560436563e-06, + "loss": 0.2718, + "step": 22188 + }, + { + "epoch": 0.64, + "grad_norm": 2.298980956094876, + "learning_rate": 2.9774507836698296e-06, + "loss": 0.2891, + "step": 22189 + }, + { + "epoch": 0.64, + "grad_norm": 1.2953170705103503, + "learning_rate": 2.9770212291450212e-06, + "loss": 0.2867, + "step": 22190 + }, + { + "epoch": 0.64, + "grad_norm": 1.3214858088671115, + "learning_rate": 2.9765916924730242e-06, + "loss": 0.3272, + "step": 22191 + }, + { + "epoch": 0.64, + "grad_norm": 1.36325777859072, + "learning_rate": 2.976162173657626e-06, + "loss": 0.2808, + "step": 22192 + }, + { + "epoch": 0.64, + "grad_norm": 1.3637730798288599, + "learning_rate": 2.9757326727026203e-06, + "loss": 0.2907, + "step": 22193 + }, + { + "epoch": 0.64, + "grad_norm": 1.2715934120316947, + "learning_rate": 2.9753031896117964e-06, + "loss": 0.2964, + "step": 22194 + }, + { + "epoch": 0.64, + "grad_norm": 1.4669261189441858, + "learning_rate": 2.974873724388945e-06, + "loss": 0.2829, + "step": 22195 + }, + { + "epoch": 0.64, + "grad_norm": 1.2686423123782489, + "learning_rate": 2.9744442770378533e-06, + "loss": 0.2978, + "step": 22196 + }, + { + "epoch": 0.64, + "grad_norm": 1.3616543581883516, + "learning_rate": 2.9740148475623143e-06, + "loss": 0.3002, + "step": 22197 + }, + { + "epoch": 0.64, + "grad_norm": 1.3515565018103788, + "learning_rate": 2.973585435966117e-06, + "loss": 0.2913, + "step": 22198 + }, + { + "epoch": 0.64, + "grad_norm": 1.3358058240399207, + "learning_rate": 2.9731560422530513e-06, + "loss": 0.2768, + "step": 22199 + }, + { + "epoch": 0.64, + "grad_norm": 1.3840915223239763, + "learning_rate": 2.972726666426905e-06, + "loss": 0.2705, + "step": 22200 + }, + { + "epoch": 0.64, + "grad_norm": 1.355718069055313, + "learning_rate": 2.9722973084914687e-06, + "loss": 0.3113, + "step": 22201 + }, + { + "epoch": 0.64, + "grad_norm": 1.2097126563915224, + "learning_rate": 2.971867968450531e-06, + "loss": 0.2845, + "step": 22202 + }, + { + "epoch": 0.64, + "grad_norm": 1.2577762185858257, + "learning_rate": 2.9714386463078802e-06, + "loss": 0.2975, + "step": 22203 + }, + { + "epoch": 0.64, + "grad_norm": 1.3270962360038576, + "learning_rate": 2.971009342067308e-06, + "loss": 0.2996, + "step": 22204 + }, + { + "epoch": 0.64, + "grad_norm": 1.4783209957809285, + "learning_rate": 2.9705800557325994e-06, + "loss": 0.2621, + "step": 22205 + }, + { + "epoch": 0.64, + "grad_norm": 1.6886872878984702, + "learning_rate": 2.970150787307544e-06, + "loss": 0.2819, + "step": 22206 + }, + { + "epoch": 0.64, + "grad_norm": 1.3941369693242842, + "learning_rate": 2.969721536795931e-06, + "loss": 0.3175, + "step": 22207 + }, + { + "epoch": 0.64, + "grad_norm": 1.2608304601226463, + "learning_rate": 2.9692923042015487e-06, + "loss": 0.27, + "step": 22208 + }, + { + "epoch": 0.64, + "grad_norm": 1.3739710298375345, + "learning_rate": 2.9688630895281834e-06, + "loss": 0.2634, + "step": 22209 + }, + { + "epoch": 0.64, + "grad_norm": 1.235664135216923, + "learning_rate": 2.9684338927796235e-06, + "loss": 0.2754, + "step": 22210 + }, + { + "epoch": 0.64, + "grad_norm": 1.3812956993587584, + "learning_rate": 2.96800471395966e-06, + "loss": 0.307, + "step": 22211 + }, + { + "epoch": 0.64, + "grad_norm": 1.2747127072225275, + "learning_rate": 2.967575553072074e-06, + "loss": 0.2847, + "step": 22212 + }, + { + "epoch": 0.64, + "grad_norm": 1.6308584482193709, + "learning_rate": 2.9671464101206572e-06, + "loss": 0.2897, + "step": 22213 + }, + { + "epoch": 0.64, + "grad_norm": 1.255800788294994, + "learning_rate": 2.9667172851091968e-06, + "loss": 0.283, + "step": 22214 + }, + { + "epoch": 0.64, + "grad_norm": 1.1873268058918587, + "learning_rate": 2.9662881780414775e-06, + "loss": 0.289, + "step": 22215 + }, + { + "epoch": 0.64, + "grad_norm": 1.2467885611397795, + "learning_rate": 2.9658590889212886e-06, + "loss": 0.2944, + "step": 22216 + }, + { + "epoch": 0.64, + "grad_norm": 1.2909828816374584, + "learning_rate": 2.9654300177524155e-06, + "loss": 0.2869, + "step": 22217 + }, + { + "epoch": 0.64, + "grad_norm": 1.3295885933686031, + "learning_rate": 2.9650009645386453e-06, + "loss": 0.2813, + "step": 22218 + }, + { + "epoch": 0.64, + "grad_norm": 1.4474748227290675, + "learning_rate": 2.9645719292837638e-06, + "loss": 0.3123, + "step": 22219 + }, + { + "epoch": 0.64, + "grad_norm": 1.2649755215220853, + "learning_rate": 2.964142911991557e-06, + "loss": 0.3019, + "step": 22220 + }, + { + "epoch": 0.64, + "grad_norm": 1.2686668366102511, + "learning_rate": 2.963713912665812e-06, + "loss": 0.275, + "step": 22221 + }, + { + "epoch": 0.64, + "grad_norm": 1.2806873700796446, + "learning_rate": 2.9632849313103153e-06, + "loss": 0.2768, + "step": 22222 + }, + { + "epoch": 0.64, + "grad_norm": 6.835452423236295, + "learning_rate": 2.9628559679288505e-06, + "loss": 0.2927, + "step": 22223 + }, + { + "epoch": 0.64, + "grad_norm": 1.3243889343819883, + "learning_rate": 2.962427022525205e-06, + "loss": 0.3099, + "step": 22224 + }, + { + "epoch": 0.64, + "grad_norm": 1.2841594562129772, + "learning_rate": 2.961998095103163e-06, + "loss": 0.2841, + "step": 22225 + }, + { + "epoch": 0.64, + "grad_norm": 1.625658711351436, + "learning_rate": 2.961569185666511e-06, + "loss": 0.2801, + "step": 22226 + }, + { + "epoch": 0.64, + "grad_norm": 1.3618929099885846, + "learning_rate": 2.9611402942190336e-06, + "loss": 0.2867, + "step": 22227 + }, + { + "epoch": 0.64, + "grad_norm": 1.2053489811329363, + "learning_rate": 2.960711420764515e-06, + "loss": 0.2733, + "step": 22228 + }, + { + "epoch": 0.64, + "grad_norm": 1.488998065659824, + "learning_rate": 2.960282565306741e-06, + "loss": 0.319, + "step": 22229 + }, + { + "epoch": 0.64, + "grad_norm": 0.9901001885386808, + "learning_rate": 2.9598537278494967e-06, + "loss": 0.5669, + "step": 22230 + }, + { + "epoch": 0.64, + "grad_norm": 1.2619244344395968, + "learning_rate": 2.9594249083965653e-06, + "loss": 0.2686, + "step": 22231 + }, + { + "epoch": 0.64, + "grad_norm": 1.5390889922547284, + "learning_rate": 2.958996106951732e-06, + "loss": 0.3189, + "step": 22232 + }, + { + "epoch": 0.64, + "grad_norm": 1.3910837948301258, + "learning_rate": 2.9585673235187807e-06, + "loss": 0.2834, + "step": 22233 + }, + { + "epoch": 0.64, + "grad_norm": 1.2628904254632929, + "learning_rate": 2.9581385581014945e-06, + "loss": 0.2765, + "step": 22234 + }, + { + "epoch": 0.64, + "grad_norm": 1.4523744788295474, + "learning_rate": 2.9577098107036594e-06, + "loss": 0.2867, + "step": 22235 + }, + { + "epoch": 0.64, + "grad_norm": 1.3352636043097317, + "learning_rate": 2.9572810813290574e-06, + "loss": 0.3029, + "step": 22236 + }, + { + "epoch": 0.64, + "grad_norm": 1.2835012560641288, + "learning_rate": 2.9568523699814722e-06, + "loss": 0.2814, + "step": 22237 + }, + { + "epoch": 0.65, + "grad_norm": 1.3296160213562813, + "learning_rate": 2.956423676664688e-06, + "loss": 0.277, + "step": 22238 + }, + { + "epoch": 0.65, + "grad_norm": 1.4500490981814398, + "learning_rate": 2.955995001382489e-06, + "loss": 0.2777, + "step": 22239 + }, + { + "epoch": 0.65, + "grad_norm": 1.2629779683690407, + "learning_rate": 2.955566344138655e-06, + "loss": 0.2793, + "step": 22240 + }, + { + "epoch": 0.65, + "grad_norm": 1.8100611022542705, + "learning_rate": 2.955137704936971e-06, + "loss": 0.2709, + "step": 22241 + }, + { + "epoch": 0.65, + "grad_norm": 1.4929235418083715, + "learning_rate": 2.9547090837812196e-06, + "loss": 0.2756, + "step": 22242 + }, + { + "epoch": 0.65, + "grad_norm": 2.252409775641979, + "learning_rate": 2.9542804806751824e-06, + "loss": 0.2699, + "step": 22243 + }, + { + "epoch": 0.65, + "grad_norm": 2.165760873379883, + "learning_rate": 2.9538518956226424e-06, + "loss": 0.3081, + "step": 22244 + }, + { + "epoch": 0.65, + "grad_norm": 1.5102288639742871, + "learning_rate": 2.9534233286273828e-06, + "loss": 0.2812, + "step": 22245 + }, + { + "epoch": 0.65, + "grad_norm": 1.569192833531395, + "learning_rate": 2.952994779693186e-06, + "loss": 0.2895, + "step": 22246 + }, + { + "epoch": 0.65, + "grad_norm": 1.261353928651513, + "learning_rate": 2.952566248823832e-06, + "loss": 0.2881, + "step": 22247 + }, + { + "epoch": 0.65, + "grad_norm": 1.5451024582660728, + "learning_rate": 2.9521377360231034e-06, + "loss": 0.2939, + "step": 22248 + }, + { + "epoch": 0.65, + "grad_norm": 1.3244418818481019, + "learning_rate": 2.9517092412947823e-06, + "loss": 0.2752, + "step": 22249 + }, + { + "epoch": 0.65, + "grad_norm": 1.3344153285694742, + "learning_rate": 2.95128076464265e-06, + "loss": 0.2912, + "step": 22250 + }, + { + "epoch": 0.65, + "grad_norm": 1.4467730077384666, + "learning_rate": 2.9508523060704875e-06, + "loss": 0.3082, + "step": 22251 + }, + { + "epoch": 0.65, + "grad_norm": 2.2415204903100503, + "learning_rate": 2.950423865582076e-06, + "loss": 0.309, + "step": 22252 + }, + { + "epoch": 0.65, + "grad_norm": 1.6017913449683747, + "learning_rate": 2.9499954431811962e-06, + "loss": 0.2739, + "step": 22253 + }, + { + "epoch": 0.65, + "grad_norm": 1.3267829603017878, + "learning_rate": 2.949567038871631e-06, + "loss": 0.2854, + "step": 22254 + }, + { + "epoch": 0.65, + "grad_norm": 1.2103134430332156, + "learning_rate": 2.9491386526571577e-06, + "loss": 0.2828, + "step": 22255 + }, + { + "epoch": 0.65, + "grad_norm": 2.494154482911259, + "learning_rate": 2.9487102845415593e-06, + "loss": 0.2843, + "step": 22256 + }, + { + "epoch": 0.65, + "grad_norm": 1.394266616345653, + "learning_rate": 2.948281934528615e-06, + "loss": 0.2832, + "step": 22257 + }, + { + "epoch": 0.65, + "grad_norm": 1.614428876703857, + "learning_rate": 2.947853602622106e-06, + "loss": 0.3231, + "step": 22258 + }, + { + "epoch": 0.65, + "grad_norm": 1.2671801098701567, + "learning_rate": 2.9474252888258116e-06, + "loss": 0.3022, + "step": 22259 + }, + { + "epoch": 0.65, + "grad_norm": 1.5683052189975053, + "learning_rate": 2.946996993143511e-06, + "loss": 0.3114, + "step": 22260 + }, + { + "epoch": 0.65, + "grad_norm": 1.2470215146311776, + "learning_rate": 2.9465687155789847e-06, + "loss": 0.331, + "step": 22261 + }, + { + "epoch": 0.65, + "grad_norm": 0.9745199654607796, + "learning_rate": 2.946140456136013e-06, + "loss": 0.5645, + "step": 22262 + }, + { + "epoch": 0.65, + "grad_norm": 1.4295196378108745, + "learning_rate": 2.945712214818375e-06, + "loss": 0.2904, + "step": 22263 + }, + { + "epoch": 0.65, + "grad_norm": 1.2538048912555584, + "learning_rate": 2.9452839916298487e-06, + "loss": 0.2828, + "step": 22264 + }, + { + "epoch": 0.65, + "grad_norm": 1.3818802256316332, + "learning_rate": 2.944855786574214e-06, + "loss": 0.2899, + "step": 22265 + }, + { + "epoch": 0.65, + "grad_norm": 1.569720732808852, + "learning_rate": 2.9444275996552494e-06, + "loss": 0.3172, + "step": 22266 + }, + { + "epoch": 0.65, + "grad_norm": 1.702457369033704, + "learning_rate": 2.9439994308767362e-06, + "loss": 0.2666, + "step": 22267 + }, + { + "epoch": 0.65, + "grad_norm": 0.9247804806067157, + "learning_rate": 2.9435712802424493e-06, + "loss": 0.5883, + "step": 22268 + }, + { + "epoch": 0.65, + "grad_norm": 1.3684128684296164, + "learning_rate": 2.943143147756169e-06, + "loss": 0.2938, + "step": 22269 + }, + { + "epoch": 0.65, + "grad_norm": 1.2846852667303224, + "learning_rate": 2.9427150334216725e-06, + "loss": 0.2812, + "step": 22270 + }, + { + "epoch": 0.65, + "grad_norm": 1.3297333015943276, + "learning_rate": 2.9422869372427386e-06, + "loss": 0.3024, + "step": 22271 + }, + { + "epoch": 0.65, + "grad_norm": 3.3749357690750283, + "learning_rate": 2.9418588592231454e-06, + "loss": 0.2779, + "step": 22272 + }, + { + "epoch": 0.65, + "grad_norm": 2.980714966141867, + "learning_rate": 2.9414307993666724e-06, + "loss": 0.3084, + "step": 22273 + }, + { + "epoch": 0.65, + "grad_norm": 1.1765783398209815, + "learning_rate": 2.941002757677093e-06, + "loss": 0.274, + "step": 22274 + }, + { + "epoch": 0.65, + "grad_norm": 2.282397481385649, + "learning_rate": 2.940574734158188e-06, + "loss": 0.2935, + "step": 22275 + }, + { + "epoch": 0.65, + "grad_norm": 1.2512293386136044, + "learning_rate": 2.9401467288137337e-06, + "loss": 0.2986, + "step": 22276 + }, + { + "epoch": 0.65, + "grad_norm": 1.6889722335492945, + "learning_rate": 2.939718741647508e-06, + "loss": 0.3439, + "step": 22277 + }, + { + "epoch": 0.65, + "grad_norm": 2.270432771003732, + "learning_rate": 2.939290772663287e-06, + "loss": 0.2907, + "step": 22278 + }, + { + "epoch": 0.65, + "grad_norm": 1.2682070174605664, + "learning_rate": 2.938862821864847e-06, + "loss": 0.2921, + "step": 22279 + }, + { + "epoch": 0.65, + "grad_norm": 2.3332371551511244, + "learning_rate": 2.9384348892559657e-06, + "loss": 0.2775, + "step": 22280 + }, + { + "epoch": 0.65, + "grad_norm": 1.2232454852386632, + "learning_rate": 2.9380069748404195e-06, + "loss": 0.2736, + "step": 22281 + }, + { + "epoch": 0.65, + "grad_norm": 1.5053712899148006, + "learning_rate": 2.9375790786219854e-06, + "loss": 0.2916, + "step": 22282 + }, + { + "epoch": 0.65, + "grad_norm": 1.4816384489580297, + "learning_rate": 2.9371512006044377e-06, + "loss": 0.3023, + "step": 22283 + }, + { + "epoch": 0.65, + "grad_norm": 1.3362597180788562, + "learning_rate": 2.936723340791554e-06, + "loss": 0.3051, + "step": 22284 + }, + { + "epoch": 0.65, + "grad_norm": 1.3190938228197462, + "learning_rate": 2.9362954991871096e-06, + "loss": 0.3202, + "step": 22285 + }, + { + "epoch": 0.65, + "grad_norm": 1.4971167492683763, + "learning_rate": 2.935867675794881e-06, + "loss": 0.285, + "step": 22286 + }, + { + "epoch": 0.65, + "grad_norm": 1.3132317689354327, + "learning_rate": 2.9354398706186427e-06, + "loss": 0.2699, + "step": 22287 + }, + { + "epoch": 0.65, + "grad_norm": 1.2624396087434164, + "learning_rate": 2.93501208366217e-06, + "loss": 0.2783, + "step": 22288 + }, + { + "epoch": 0.65, + "grad_norm": 1.2506788314964985, + "learning_rate": 2.9345843149292386e-06, + "loss": 0.2933, + "step": 22289 + }, + { + "epoch": 0.65, + "grad_norm": 1.3297933336967753, + "learning_rate": 2.9341565644236247e-06, + "loss": 0.2911, + "step": 22290 + }, + { + "epoch": 0.65, + "grad_norm": 1.444217670613511, + "learning_rate": 2.933728832149101e-06, + "loss": 0.2661, + "step": 22291 + }, + { + "epoch": 0.65, + "grad_norm": 1.5157127114928604, + "learning_rate": 2.933301118109443e-06, + "loss": 0.2822, + "step": 22292 + }, + { + "epoch": 0.65, + "grad_norm": 1.332047820147327, + "learning_rate": 2.9328734223084253e-06, + "loss": 0.2782, + "step": 22293 + }, + { + "epoch": 0.65, + "grad_norm": 1.2618552212217193, + "learning_rate": 2.9324457447498245e-06, + "loss": 0.2895, + "step": 22294 + }, + { + "epoch": 0.65, + "grad_norm": 1.3586722162745375, + "learning_rate": 2.932018085437412e-06, + "loss": 0.3043, + "step": 22295 + }, + { + "epoch": 0.65, + "grad_norm": 1.5402756125842667, + "learning_rate": 2.9315904443749634e-06, + "loss": 0.3165, + "step": 22296 + }, + { + "epoch": 0.65, + "grad_norm": 1.232981868911181, + "learning_rate": 2.931162821566251e-06, + "loss": 0.2948, + "step": 22297 + }, + { + "epoch": 0.65, + "grad_norm": 1.278197032023029, + "learning_rate": 2.9307352170150496e-06, + "loss": 0.2882, + "step": 22298 + }, + { + "epoch": 0.65, + "grad_norm": 1.3189420790445838, + "learning_rate": 2.9303076307251327e-06, + "loss": 0.303, + "step": 22299 + }, + { + "epoch": 0.65, + "grad_norm": 0.9660880905288554, + "learning_rate": 2.9298800627002743e-06, + "loss": 0.6217, + "step": 22300 + }, + { + "epoch": 0.65, + "grad_norm": 1.5042686963284986, + "learning_rate": 2.9294525129442475e-06, + "loss": 0.2844, + "step": 22301 + }, + { + "epoch": 0.65, + "grad_norm": 1.3606766393878917, + "learning_rate": 2.929024981460825e-06, + "loss": 0.2893, + "step": 22302 + }, + { + "epoch": 0.65, + "grad_norm": 1.2844642300915383, + "learning_rate": 2.92859746825378e-06, + "loss": 0.3011, + "step": 22303 + }, + { + "epoch": 0.65, + "grad_norm": 1.4721735086542773, + "learning_rate": 2.9281699733268853e-06, + "loss": 0.2801, + "step": 22304 + }, + { + "epoch": 0.65, + "grad_norm": 1.240756113757828, + "learning_rate": 2.9277424966839142e-06, + "loss": 0.2918, + "step": 22305 + }, + { + "epoch": 0.65, + "grad_norm": 1.961386167547274, + "learning_rate": 2.9273150383286374e-06, + "loss": 0.275, + "step": 22306 + }, + { + "epoch": 0.65, + "grad_norm": 1.679322310884105, + "learning_rate": 2.926887598264829e-06, + "loss": 0.2868, + "step": 22307 + }, + { + "epoch": 0.65, + "grad_norm": 1.4121174214111947, + "learning_rate": 2.9264601764962603e-06, + "loss": 0.2851, + "step": 22308 + }, + { + "epoch": 0.65, + "grad_norm": 1.420463044216967, + "learning_rate": 2.9260327730267046e-06, + "loss": 0.2814, + "step": 22309 + }, + { + "epoch": 0.65, + "grad_norm": 1.456935052759888, + "learning_rate": 2.9256053878599317e-06, + "loss": 0.3226, + "step": 22310 + }, + { + "epoch": 0.65, + "grad_norm": 1.2822249017300356, + "learning_rate": 2.925178020999714e-06, + "loss": 0.286, + "step": 22311 + }, + { + "epoch": 0.65, + "grad_norm": 1.3752223247973443, + "learning_rate": 2.9247506724498238e-06, + "loss": 0.2767, + "step": 22312 + }, + { + "epoch": 0.65, + "grad_norm": 1.2795809412503552, + "learning_rate": 2.924323342214033e-06, + "loss": 0.2951, + "step": 22313 + }, + { + "epoch": 0.65, + "grad_norm": 1.3935695816861253, + "learning_rate": 2.9238960302961095e-06, + "loss": 0.3004, + "step": 22314 + }, + { + "epoch": 0.65, + "grad_norm": 0.9412014183797345, + "learning_rate": 2.9234687366998277e-06, + "loss": 0.564, + "step": 22315 + }, + { + "epoch": 0.65, + "grad_norm": 1.3021879867789368, + "learning_rate": 2.923041461428957e-06, + "loss": 0.2895, + "step": 22316 + }, + { + "epoch": 0.65, + "grad_norm": 1.3269871176734966, + "learning_rate": 2.9226142044872685e-06, + "loss": 0.2835, + "step": 22317 + }, + { + "epoch": 0.65, + "grad_norm": 1.4520532887084463, + "learning_rate": 2.922186965878534e-06, + "loss": 0.2938, + "step": 22318 + }, + { + "epoch": 0.65, + "grad_norm": 1.426672908921729, + "learning_rate": 2.9217597456065206e-06, + "loss": 0.293, + "step": 22319 + }, + { + "epoch": 0.65, + "grad_norm": 1.3156902103464343, + "learning_rate": 2.9213325436750017e-06, + "loss": 0.2813, + "step": 22320 + }, + { + "epoch": 0.65, + "grad_norm": 1.3003369790910881, + "learning_rate": 2.9209053600877456e-06, + "loss": 0.2776, + "step": 22321 + }, + { + "epoch": 0.65, + "grad_norm": 1.5855377351971551, + "learning_rate": 2.920478194848524e-06, + "loss": 0.2903, + "step": 22322 + }, + { + "epoch": 0.65, + "grad_norm": 1.698707302428713, + "learning_rate": 2.9200510479611043e-06, + "loss": 0.2818, + "step": 22323 + }, + { + "epoch": 0.65, + "grad_norm": 1.3617542196995596, + "learning_rate": 2.9196239194292585e-06, + "loss": 0.2833, + "step": 22324 + }, + { + "epoch": 0.65, + "grad_norm": 0.9326074301023943, + "learning_rate": 2.9191968092567536e-06, + "loss": 0.5032, + "step": 22325 + }, + { + "epoch": 0.65, + "grad_norm": 1.441879947129607, + "learning_rate": 2.9187697174473597e-06, + "loss": 0.2887, + "step": 22326 + }, + { + "epoch": 0.65, + "grad_norm": 1.0207027063520637, + "learning_rate": 2.9183426440048456e-06, + "loss": 0.5533, + "step": 22327 + }, + { + "epoch": 0.65, + "grad_norm": 1.210473328675921, + "learning_rate": 2.9179155889329812e-06, + "loss": 0.2858, + "step": 22328 + }, + { + "epoch": 0.65, + "grad_norm": 1.3687592353670694, + "learning_rate": 2.9174885522355344e-06, + "loss": 0.2853, + "step": 22329 + }, + { + "epoch": 0.65, + "grad_norm": 1.3864531932396638, + "learning_rate": 2.9170615339162767e-06, + "loss": 0.2839, + "step": 22330 + }, + { + "epoch": 0.65, + "grad_norm": 1.3066159620430025, + "learning_rate": 2.9166345339789727e-06, + "loss": 0.2874, + "step": 22331 + }, + { + "epoch": 0.65, + "grad_norm": 1.376455726226639, + "learning_rate": 2.916207552427391e-06, + "loss": 0.2726, + "step": 22332 + }, + { + "epoch": 0.65, + "grad_norm": 1.6533220369640518, + "learning_rate": 2.915780589265301e-06, + "loss": 0.3051, + "step": 22333 + }, + { + "epoch": 0.65, + "grad_norm": 1.2050811174898097, + "learning_rate": 2.91535364449647e-06, + "loss": 0.2927, + "step": 22334 + }, + { + "epoch": 0.65, + "grad_norm": 1.2142022068541172, + "learning_rate": 2.9149267181246675e-06, + "loss": 0.2671, + "step": 22335 + }, + { + "epoch": 0.65, + "grad_norm": 1.317281413828228, + "learning_rate": 2.914499810153659e-06, + "loss": 0.2928, + "step": 22336 + }, + { + "epoch": 0.65, + "grad_norm": 1.6424956663444776, + "learning_rate": 2.914072920587213e-06, + "loss": 0.29, + "step": 22337 + }, + { + "epoch": 0.65, + "grad_norm": 1.7188688158931935, + "learning_rate": 2.9136460494290963e-06, + "loss": 0.2955, + "step": 22338 + }, + { + "epoch": 0.65, + "grad_norm": 1.4093564600875945, + "learning_rate": 2.913219196683079e-06, + "loss": 0.3028, + "step": 22339 + }, + { + "epoch": 0.65, + "grad_norm": 1.3277797149334012, + "learning_rate": 2.9127923623529225e-06, + "loss": 0.2738, + "step": 22340 + }, + { + "epoch": 0.65, + "grad_norm": 1.2792138507723105, + "learning_rate": 2.9123655464423973e-06, + "loss": 0.2789, + "step": 22341 + }, + { + "epoch": 0.65, + "grad_norm": 1.3735389072700084, + "learning_rate": 2.9119387489552685e-06, + "loss": 0.288, + "step": 22342 + }, + { + "epoch": 0.65, + "grad_norm": 3.9499165324208874, + "learning_rate": 2.911511969895304e-06, + "loss": 0.3251, + "step": 22343 + }, + { + "epoch": 0.65, + "grad_norm": 1.3465432032062057, + "learning_rate": 2.91108520926627e-06, + "loss": 0.2831, + "step": 22344 + }, + { + "epoch": 0.65, + "grad_norm": 1.362217914834295, + "learning_rate": 2.910658467071932e-06, + "loss": 0.2745, + "step": 22345 + }, + { + "epoch": 0.65, + "grad_norm": 2.2234602537870525, + "learning_rate": 2.9102317433160553e-06, + "loss": 0.3158, + "step": 22346 + }, + { + "epoch": 0.65, + "grad_norm": 1.0002827750443586, + "learning_rate": 2.9098050380024097e-06, + "loss": 0.5959, + "step": 22347 + }, + { + "epoch": 0.65, + "grad_norm": 1.432237204508315, + "learning_rate": 2.9093783511347553e-06, + "loss": 0.2775, + "step": 22348 + }, + { + "epoch": 0.65, + "grad_norm": 1.2664757527707697, + "learning_rate": 2.90895168271686e-06, + "loss": 0.2997, + "step": 22349 + }, + { + "epoch": 0.65, + "grad_norm": 1.4961467119227287, + "learning_rate": 2.9085250327524896e-06, + "loss": 0.2933, + "step": 22350 + }, + { + "epoch": 0.65, + "grad_norm": 1.4074302863023163, + "learning_rate": 2.908098401245411e-06, + "loss": 0.298, + "step": 22351 + }, + { + "epoch": 0.65, + "grad_norm": 1.779875421929844, + "learning_rate": 2.9076717881993855e-06, + "loss": 0.2678, + "step": 22352 + }, + { + "epoch": 0.65, + "grad_norm": 1.2650774183803937, + "learning_rate": 2.907245193618179e-06, + "loss": 0.2847, + "step": 22353 + }, + { + "epoch": 0.65, + "grad_norm": 1.3744505281218349, + "learning_rate": 2.9068186175055566e-06, + "loss": 0.2852, + "step": 22354 + }, + { + "epoch": 0.65, + "grad_norm": 1.434233984730897, + "learning_rate": 2.906392059865284e-06, + "loss": 0.2874, + "step": 22355 + }, + { + "epoch": 0.65, + "grad_norm": 1.2019409337721794, + "learning_rate": 2.905965520701125e-06, + "loss": 0.2891, + "step": 22356 + }, + { + "epoch": 0.65, + "grad_norm": 1.3406390369802164, + "learning_rate": 2.905539000016843e-06, + "loss": 0.2742, + "step": 22357 + }, + { + "epoch": 0.65, + "grad_norm": 1.2875494751507528, + "learning_rate": 2.905112497816204e-06, + "loss": 0.2967, + "step": 22358 + }, + { + "epoch": 0.65, + "grad_norm": 1.2858941584665797, + "learning_rate": 2.9046860141029693e-06, + "loss": 0.2877, + "step": 22359 + }, + { + "epoch": 0.65, + "grad_norm": 1.2681644697821643, + "learning_rate": 2.9042595488809033e-06, + "loss": 0.2689, + "step": 22360 + }, + { + "epoch": 0.65, + "grad_norm": 1.801039432307802, + "learning_rate": 2.9038331021537704e-06, + "loss": 0.3143, + "step": 22361 + }, + { + "epoch": 0.65, + "grad_norm": 1.3356656056869431, + "learning_rate": 2.903406673925333e-06, + "loss": 0.3009, + "step": 22362 + }, + { + "epoch": 0.65, + "grad_norm": 1.2561574337197043, + "learning_rate": 2.9029802641993555e-06, + "loss": 0.2939, + "step": 22363 + }, + { + "epoch": 0.65, + "grad_norm": 1.5559814902969702, + "learning_rate": 2.9025538729796e-06, + "loss": 0.2797, + "step": 22364 + }, + { + "epoch": 0.65, + "grad_norm": 1.326978710738205, + "learning_rate": 2.9021275002698303e-06, + "loss": 0.2601, + "step": 22365 + }, + { + "epoch": 0.65, + "grad_norm": 1.2475436797452024, + "learning_rate": 2.9017011460738103e-06, + "loss": 0.2693, + "step": 22366 + }, + { + "epoch": 0.65, + "grad_norm": 1.320208276368861, + "learning_rate": 2.9012748103952992e-06, + "loss": 0.2802, + "step": 22367 + }, + { + "epoch": 0.65, + "grad_norm": 1.9745716148367558, + "learning_rate": 2.900848493238061e-06, + "loss": 0.3013, + "step": 22368 + }, + { + "epoch": 0.65, + "grad_norm": 1.498949503994863, + "learning_rate": 2.9004221946058586e-06, + "loss": 0.2842, + "step": 22369 + }, + { + "epoch": 0.65, + "grad_norm": 1.3168300840352098, + "learning_rate": 2.899995914502453e-06, + "loss": 0.3245, + "step": 22370 + }, + { + "epoch": 0.65, + "grad_norm": 1.4696729108209168, + "learning_rate": 2.8995696529316077e-06, + "loss": 0.2885, + "step": 22371 + }, + { + "epoch": 0.65, + "grad_norm": 1.2991882481353627, + "learning_rate": 2.899143409897082e-06, + "loss": 0.2883, + "step": 22372 + }, + { + "epoch": 0.65, + "grad_norm": 1.4986887017944628, + "learning_rate": 2.8987171854026404e-06, + "loss": 0.2712, + "step": 22373 + }, + { + "epoch": 0.65, + "grad_norm": 1.3984742723353996, + "learning_rate": 2.898290979452042e-06, + "loss": 0.2882, + "step": 22374 + }, + { + "epoch": 0.65, + "grad_norm": 1.3452601773150568, + "learning_rate": 2.8978647920490517e-06, + "loss": 0.3014, + "step": 22375 + }, + { + "epoch": 0.65, + "grad_norm": 1.3590884449669107, + "learning_rate": 2.8974386231974256e-06, + "loss": 0.2921, + "step": 22376 + }, + { + "epoch": 0.65, + "grad_norm": 1.3029639215040276, + "learning_rate": 2.897012472900927e-06, + "loss": 0.2735, + "step": 22377 + }, + { + "epoch": 0.65, + "grad_norm": 1.5771495643139295, + "learning_rate": 2.8965863411633167e-06, + "loss": 0.2681, + "step": 22378 + }, + { + "epoch": 0.65, + "grad_norm": 1.291072123700661, + "learning_rate": 2.896160227988357e-06, + "loss": 0.2688, + "step": 22379 + }, + { + "epoch": 0.65, + "grad_norm": 1.2388642747175636, + "learning_rate": 2.895734133379805e-06, + "loss": 0.2921, + "step": 22380 + }, + { + "epoch": 0.65, + "grad_norm": 1.243895889292177, + "learning_rate": 2.895308057341423e-06, + "loss": 0.2728, + "step": 22381 + }, + { + "epoch": 0.65, + "grad_norm": 1.3165235322925397, + "learning_rate": 2.89488199987697e-06, + "loss": 0.3037, + "step": 22382 + }, + { + "epoch": 0.65, + "grad_norm": 1.0105109708803843, + "learning_rate": 2.894455960990207e-06, + "loss": 0.5795, + "step": 22383 + }, + { + "epoch": 0.65, + "grad_norm": 1.3258068509516228, + "learning_rate": 2.894029940684894e-06, + "loss": 0.2987, + "step": 22384 + }, + { + "epoch": 0.65, + "grad_norm": 1.2757573558670263, + "learning_rate": 2.8936039389647907e-06, + "loss": 0.2837, + "step": 22385 + }, + { + "epoch": 0.65, + "grad_norm": 0.9001145478009438, + "learning_rate": 2.8931779558336548e-06, + "loss": 0.6317, + "step": 22386 + }, + { + "epoch": 0.65, + "grad_norm": 0.9641942578385717, + "learning_rate": 2.8927519912952474e-06, + "loss": 0.6038, + "step": 22387 + }, + { + "epoch": 0.65, + "grad_norm": 1.2615964047585653, + "learning_rate": 2.8923260453533264e-06, + "loss": 0.3134, + "step": 22388 + }, + { + "epoch": 0.65, + "grad_norm": 1.2763385672486123, + "learning_rate": 2.8919001180116514e-06, + "loss": 0.3196, + "step": 22389 + }, + { + "epoch": 0.65, + "grad_norm": 1.6330787434450347, + "learning_rate": 2.891474209273982e-06, + "loss": 0.2953, + "step": 22390 + }, + { + "epoch": 0.65, + "grad_norm": 1.338035526194284, + "learning_rate": 2.891048319144075e-06, + "loss": 0.2692, + "step": 22391 + }, + { + "epoch": 0.65, + "grad_norm": 1.520231603834399, + "learning_rate": 2.8906224476256907e-06, + "loss": 0.2805, + "step": 22392 + }, + { + "epoch": 0.65, + "grad_norm": 1.5342381897032114, + "learning_rate": 2.8901965947225865e-06, + "loss": 0.3079, + "step": 22393 + }, + { + "epoch": 0.65, + "grad_norm": 1.4145215626548038, + "learning_rate": 2.8897707604385226e-06, + "loss": 0.2851, + "step": 22394 + }, + { + "epoch": 0.65, + "grad_norm": 1.3131324797480968, + "learning_rate": 2.8893449447772535e-06, + "loss": 0.3111, + "step": 22395 + }, + { + "epoch": 0.65, + "grad_norm": 1.1896137305311367, + "learning_rate": 2.8889191477425384e-06, + "loss": 0.2883, + "step": 22396 + }, + { + "epoch": 0.65, + "grad_norm": 1.492142364612637, + "learning_rate": 2.8884933693381355e-06, + "loss": 0.2878, + "step": 22397 + }, + { + "epoch": 0.65, + "grad_norm": 1.4462818013809982, + "learning_rate": 2.888067609567802e-06, + "loss": 0.2937, + "step": 22398 + }, + { + "epoch": 0.65, + "grad_norm": 1.3055813757205659, + "learning_rate": 2.887641868435295e-06, + "loss": 0.2915, + "step": 22399 + }, + { + "epoch": 0.65, + "grad_norm": 1.233905603340707, + "learning_rate": 2.887216145944372e-06, + "loss": 0.277, + "step": 22400 + }, + { + "epoch": 0.65, + "grad_norm": 1.3705491217146104, + "learning_rate": 2.88679044209879e-06, + "loss": 0.2817, + "step": 22401 + }, + { + "epoch": 0.65, + "grad_norm": 1.3508005268586547, + "learning_rate": 2.886364756902308e-06, + "loss": 0.3169, + "step": 22402 + }, + { + "epoch": 0.65, + "grad_norm": 1.5513538489539107, + "learning_rate": 2.885939090358678e-06, + "loss": 0.2936, + "step": 22403 + }, + { + "epoch": 0.65, + "grad_norm": 1.505062894857913, + "learning_rate": 2.8855134424716597e-06, + "loss": 0.2909, + "step": 22404 + }, + { + "epoch": 0.65, + "grad_norm": 1.3805284753619425, + "learning_rate": 2.8850878132450084e-06, + "loss": 0.3055, + "step": 22405 + }, + { + "epoch": 0.65, + "grad_norm": 1.600734666429152, + "learning_rate": 2.88466220268248e-06, + "loss": 0.2882, + "step": 22406 + }, + { + "epoch": 0.65, + "grad_norm": 1.414756829605715, + "learning_rate": 2.8842366107878337e-06, + "loss": 0.309, + "step": 22407 + }, + { + "epoch": 0.65, + "grad_norm": 1.279240167005513, + "learning_rate": 2.883811037564821e-06, + "loss": 0.2907, + "step": 22408 + }, + { + "epoch": 0.65, + "grad_norm": 1.3018786711927748, + "learning_rate": 2.883385483017199e-06, + "loss": 0.3233, + "step": 22409 + }, + { + "epoch": 0.65, + "grad_norm": 1.2963624604163029, + "learning_rate": 2.8829599471487236e-06, + "loss": 0.2979, + "step": 22410 + }, + { + "epoch": 0.65, + "grad_norm": 1.3223182135109572, + "learning_rate": 2.8825344299631508e-06, + "loss": 0.3097, + "step": 22411 + }, + { + "epoch": 0.65, + "grad_norm": 1.2590342218159074, + "learning_rate": 2.8821089314642344e-06, + "loss": 0.2734, + "step": 22412 + }, + { + "epoch": 0.65, + "grad_norm": 1.3067152701695242, + "learning_rate": 2.8816834516557323e-06, + "loss": 0.2905, + "step": 22413 + }, + { + "epoch": 0.65, + "grad_norm": 1.2477897359485406, + "learning_rate": 2.881257990541395e-06, + "loss": 0.2827, + "step": 22414 + }, + { + "epoch": 0.65, + "grad_norm": 1.2632232404004855, + "learning_rate": 2.88083254812498e-06, + "loss": 0.2737, + "step": 22415 + }, + { + "epoch": 0.65, + "grad_norm": 1.380966851264602, + "learning_rate": 2.8804071244102405e-06, + "loss": 0.2805, + "step": 22416 + }, + { + "epoch": 0.65, + "grad_norm": 1.4420447236147271, + "learning_rate": 2.8799817194009327e-06, + "loss": 0.3078, + "step": 22417 + }, + { + "epoch": 0.65, + "grad_norm": 1.3862762054321098, + "learning_rate": 2.879556333100809e-06, + "loss": 0.2965, + "step": 22418 + }, + { + "epoch": 0.65, + "grad_norm": 1.628963893707137, + "learning_rate": 2.8791309655136248e-06, + "loss": 0.3134, + "step": 22419 + }, + { + "epoch": 0.65, + "grad_norm": 0.9697695671836223, + "learning_rate": 2.8787056166431327e-06, + "loss": 0.5416, + "step": 22420 + }, + { + "epoch": 0.65, + "grad_norm": 1.4021443216365865, + "learning_rate": 2.878280286493089e-06, + "loss": 0.2916, + "step": 22421 + }, + { + "epoch": 0.65, + "grad_norm": 1.3473759223369937, + "learning_rate": 2.8778549750672437e-06, + "loss": 0.2663, + "step": 22422 + }, + { + "epoch": 0.65, + "grad_norm": 1.5228559040426564, + "learning_rate": 2.8774296823693513e-06, + "loss": 0.2942, + "step": 22423 + }, + { + "epoch": 0.65, + "grad_norm": 0.9858237334848994, + "learning_rate": 2.8770044084031666e-06, + "loss": 0.5855, + "step": 22424 + }, + { + "epoch": 0.65, + "grad_norm": 1.4391329788283718, + "learning_rate": 2.876579153172441e-06, + "loss": 0.2801, + "step": 22425 + }, + { + "epoch": 0.65, + "grad_norm": 0.8893515199551505, + "learning_rate": 2.876153916680927e-06, + "loss": 0.5222, + "step": 22426 + }, + { + "epoch": 0.65, + "grad_norm": 1.330465172364797, + "learning_rate": 2.8757286989323795e-06, + "loss": 0.3129, + "step": 22427 + }, + { + "epoch": 0.65, + "grad_norm": 1.5125039813856012, + "learning_rate": 2.8753034999305495e-06, + "loss": 0.3185, + "step": 22428 + }, + { + "epoch": 0.65, + "grad_norm": 1.5470642432703516, + "learning_rate": 2.8748783196791896e-06, + "loss": 0.3123, + "step": 22429 + }, + { + "epoch": 0.65, + "grad_norm": 1.2869806139522897, + "learning_rate": 2.8744531581820535e-06, + "loss": 0.3023, + "step": 22430 + }, + { + "epoch": 0.65, + "grad_norm": 1.380007604662424, + "learning_rate": 2.8740280154428902e-06, + "loss": 0.3088, + "step": 22431 + }, + { + "epoch": 0.65, + "grad_norm": 1.2891024780653975, + "learning_rate": 2.8736028914654534e-06, + "loss": 0.3145, + "step": 22432 + }, + { + "epoch": 0.65, + "grad_norm": 1.3027930641872427, + "learning_rate": 2.873177786253495e-06, + "loss": 0.2769, + "step": 22433 + }, + { + "epoch": 0.65, + "grad_norm": 1.7859306631566916, + "learning_rate": 2.872752699810766e-06, + "loss": 0.2991, + "step": 22434 + }, + { + "epoch": 0.65, + "grad_norm": 1.8102095079223297, + "learning_rate": 2.8723276321410177e-06, + "loss": 0.3024, + "step": 22435 + }, + { + "epoch": 0.65, + "grad_norm": 1.6975189207691774, + "learning_rate": 2.871902583248004e-06, + "loss": 0.3025, + "step": 22436 + }, + { + "epoch": 0.65, + "grad_norm": 1.272620192279718, + "learning_rate": 2.8714775531354716e-06, + "loss": 0.2994, + "step": 22437 + }, + { + "epoch": 0.65, + "grad_norm": 1.4336012818030504, + "learning_rate": 2.8710525418071733e-06, + "loss": 0.3096, + "step": 22438 + }, + { + "epoch": 0.65, + "grad_norm": 1.422317942206694, + "learning_rate": 2.8706275492668602e-06, + "loss": 0.2721, + "step": 22439 + }, + { + "epoch": 0.65, + "grad_norm": 1.3503855465651244, + "learning_rate": 2.870202575518284e-06, + "loss": 0.3118, + "step": 22440 + }, + { + "epoch": 0.65, + "grad_norm": 1.0110223879253812, + "learning_rate": 2.869777620565192e-06, + "loss": 0.5867, + "step": 22441 + }, + { + "epoch": 0.65, + "grad_norm": 1.8550843619468962, + "learning_rate": 2.869352684411336e-06, + "loss": 0.3096, + "step": 22442 + }, + { + "epoch": 0.65, + "grad_norm": 1.4516563920899233, + "learning_rate": 2.8689277670604665e-06, + "loss": 0.2949, + "step": 22443 + }, + { + "epoch": 0.65, + "grad_norm": 1.906692152084219, + "learning_rate": 2.868502868516333e-06, + "loss": 0.3047, + "step": 22444 + }, + { + "epoch": 0.65, + "grad_norm": 1.3612312688724786, + "learning_rate": 2.8680779887826854e-06, + "loss": 0.2965, + "step": 22445 + }, + { + "epoch": 0.65, + "grad_norm": 1.4837800153231044, + "learning_rate": 2.867653127863273e-06, + "loss": 0.2674, + "step": 22446 + }, + { + "epoch": 0.65, + "grad_norm": 1.4324015264826617, + "learning_rate": 2.8672282857618454e-06, + "loss": 0.2837, + "step": 22447 + }, + { + "epoch": 0.65, + "grad_norm": 1.393549345579498, + "learning_rate": 2.8668034624821518e-06, + "loss": 0.2946, + "step": 22448 + }, + { + "epoch": 0.65, + "grad_norm": 1.168100679469039, + "learning_rate": 2.866378658027944e-06, + "loss": 0.2923, + "step": 22449 + }, + { + "epoch": 0.65, + "grad_norm": 0.931993740307179, + "learning_rate": 2.8659538724029655e-06, + "loss": 0.5461, + "step": 22450 + }, + { + "epoch": 0.65, + "grad_norm": 1.2473520004261185, + "learning_rate": 2.8655291056109678e-06, + "loss": 0.258, + "step": 22451 + }, + { + "epoch": 0.65, + "grad_norm": 1.3336113562534406, + "learning_rate": 2.865104357655699e-06, + "loss": 0.2999, + "step": 22452 + }, + { + "epoch": 0.65, + "grad_norm": 2.84406836739457, + "learning_rate": 2.864679628540909e-06, + "loss": 0.2895, + "step": 22453 + }, + { + "epoch": 0.65, + "grad_norm": 1.563693402738861, + "learning_rate": 2.8642549182703437e-06, + "loss": 0.315, + "step": 22454 + }, + { + "epoch": 0.65, + "grad_norm": 1.6194589641747563, + "learning_rate": 2.8638302268477535e-06, + "loss": 0.2927, + "step": 22455 + }, + { + "epoch": 0.65, + "grad_norm": 1.5036779451944817, + "learning_rate": 2.8634055542768853e-06, + "loss": 0.2934, + "step": 22456 + }, + { + "epoch": 0.65, + "grad_norm": 1.3102824069031316, + "learning_rate": 2.8629809005614885e-06, + "loss": 0.2833, + "step": 22457 + }, + { + "epoch": 0.65, + "grad_norm": 1.313769101391196, + "learning_rate": 2.862556265705306e-06, + "loss": 0.298, + "step": 22458 + }, + { + "epoch": 0.65, + "grad_norm": 1.7016390020924, + "learning_rate": 2.86213164971209e-06, + "loss": 0.2814, + "step": 22459 + }, + { + "epoch": 0.65, + "grad_norm": 1.279323231778604, + "learning_rate": 2.8617070525855846e-06, + "loss": 0.2782, + "step": 22460 + }, + { + "epoch": 0.65, + "grad_norm": 1.5381475326686567, + "learning_rate": 2.8612824743295388e-06, + "loss": 0.2853, + "step": 22461 + }, + { + "epoch": 0.65, + "grad_norm": 1.3333376759988826, + "learning_rate": 2.8608579149476977e-06, + "loss": 0.305, + "step": 22462 + }, + { + "epoch": 0.65, + "grad_norm": 1.2737933480754968, + "learning_rate": 2.8604333744438104e-06, + "loss": 0.3041, + "step": 22463 + }, + { + "epoch": 0.65, + "grad_norm": 1.3591473868941368, + "learning_rate": 2.860008852821624e-06, + "loss": 0.2901, + "step": 22464 + }, + { + "epoch": 0.65, + "grad_norm": 1.6862054175824082, + "learning_rate": 2.8595843500848815e-06, + "loss": 0.2857, + "step": 22465 + }, + { + "epoch": 0.65, + "grad_norm": 1.725461145004442, + "learning_rate": 2.8591598662373306e-06, + "loss": 0.301, + "step": 22466 + }, + { + "epoch": 0.65, + "grad_norm": 1.1910599121098338, + "learning_rate": 2.8587354012827195e-06, + "loss": 0.279, + "step": 22467 + }, + { + "epoch": 0.65, + "grad_norm": 1.441336556498204, + "learning_rate": 2.85831095522479e-06, + "loss": 0.2971, + "step": 22468 + }, + { + "epoch": 0.65, + "grad_norm": 1.368664125758069, + "learning_rate": 2.857886528067291e-06, + "loss": 0.2958, + "step": 22469 + }, + { + "epoch": 0.65, + "grad_norm": 1.4552230843081846, + "learning_rate": 2.857462119813966e-06, + "loss": 0.2958, + "step": 22470 + }, + { + "epoch": 0.65, + "grad_norm": 1.330199897392509, + "learning_rate": 2.8570377304685627e-06, + "loss": 0.2667, + "step": 22471 + }, + { + "epoch": 0.65, + "grad_norm": 1.3457639508865136, + "learning_rate": 2.8566133600348246e-06, + "loss": 0.2885, + "step": 22472 + }, + { + "epoch": 0.65, + "grad_norm": 1.2838385418425766, + "learning_rate": 2.856189008516497e-06, + "loss": 0.2873, + "step": 22473 + }, + { + "epoch": 0.65, + "grad_norm": 1.3517560123537566, + "learning_rate": 2.8557646759173262e-06, + "loss": 0.3103, + "step": 22474 + }, + { + "epoch": 0.65, + "grad_norm": 1.1535040208963467, + "learning_rate": 2.855340362241055e-06, + "loss": 0.2807, + "step": 22475 + }, + { + "epoch": 0.65, + "grad_norm": 1.2765958317733261, + "learning_rate": 2.854916067491431e-06, + "loss": 0.2909, + "step": 22476 + }, + { + "epoch": 0.65, + "grad_norm": 1.3646252788105357, + "learning_rate": 2.8544917916721947e-06, + "loss": 0.2828, + "step": 22477 + }, + { + "epoch": 0.65, + "grad_norm": 1.2820512544077434, + "learning_rate": 2.8540675347870928e-06, + "loss": 0.2823, + "step": 22478 + }, + { + "epoch": 0.65, + "grad_norm": 1.2706928838478362, + "learning_rate": 2.8536432968398675e-06, + "loss": 0.2814, + "step": 22479 + }, + { + "epoch": 0.65, + "grad_norm": 1.316064279074361, + "learning_rate": 2.853219077834265e-06, + "loss": 0.2737, + "step": 22480 + }, + { + "epoch": 0.65, + "grad_norm": 2.7240115769334747, + "learning_rate": 2.852794877774027e-06, + "loss": 0.2965, + "step": 22481 + }, + { + "epoch": 0.65, + "grad_norm": 1.4165888829881021, + "learning_rate": 2.852370696662899e-06, + "loss": 0.3327, + "step": 22482 + }, + { + "epoch": 0.65, + "grad_norm": 1.4158112730283308, + "learning_rate": 2.8519465345046226e-06, + "loss": 0.2948, + "step": 22483 + }, + { + "epoch": 0.65, + "grad_norm": 1.423479782940747, + "learning_rate": 2.8515223913029423e-06, + "loss": 0.2995, + "step": 22484 + }, + { + "epoch": 0.65, + "grad_norm": 1.2142929349919585, + "learning_rate": 2.851098267061603e-06, + "loss": 0.2922, + "step": 22485 + }, + { + "epoch": 0.65, + "grad_norm": 1.302820857726874, + "learning_rate": 2.8506741617843426e-06, + "loss": 0.2594, + "step": 22486 + }, + { + "epoch": 0.65, + "grad_norm": 1.3422171131195195, + "learning_rate": 2.8502500754749067e-06, + "loss": 0.2804, + "step": 22487 + }, + { + "epoch": 0.65, + "grad_norm": 1.6007189448436845, + "learning_rate": 2.8498260081370378e-06, + "loss": 0.3126, + "step": 22488 + }, + { + "epoch": 0.65, + "grad_norm": 1.3476315259811267, + "learning_rate": 2.849401959774478e-06, + "loss": 0.3019, + "step": 22489 + }, + { + "epoch": 0.65, + "grad_norm": 1.3359107757394215, + "learning_rate": 2.8489779303909704e-06, + "loss": 0.2964, + "step": 22490 + }, + { + "epoch": 0.65, + "grad_norm": 1.3074236014434455, + "learning_rate": 2.848553919990257e-06, + "loss": 0.2599, + "step": 22491 + }, + { + "epoch": 0.65, + "grad_norm": 1.3501804254652843, + "learning_rate": 2.84812992857608e-06, + "loss": 0.2917, + "step": 22492 + }, + { + "epoch": 0.65, + "grad_norm": 1.3971824257209244, + "learning_rate": 2.8477059561521782e-06, + "loss": 0.2868, + "step": 22493 + }, + { + "epoch": 0.65, + "grad_norm": 1.3117004032250739, + "learning_rate": 2.847282002722296e-06, + "loss": 0.292, + "step": 22494 + }, + { + "epoch": 0.65, + "grad_norm": 0.938741472733363, + "learning_rate": 2.8468580682901746e-06, + "loss": 0.5517, + "step": 22495 + }, + { + "epoch": 0.65, + "grad_norm": 1.5971593007060898, + "learning_rate": 2.846434152859554e-06, + "loss": 0.3043, + "step": 22496 + }, + { + "epoch": 0.65, + "grad_norm": 1.6432542516012738, + "learning_rate": 2.846010256434175e-06, + "loss": 0.2852, + "step": 22497 + }, + { + "epoch": 0.65, + "grad_norm": 1.357286592952371, + "learning_rate": 2.8455863790177807e-06, + "loss": 0.2813, + "step": 22498 + }, + { + "epoch": 0.65, + "grad_norm": 1.4800018742174255, + "learning_rate": 2.8451625206141093e-06, + "loss": 0.2899, + "step": 22499 + }, + { + "epoch": 0.65, + "grad_norm": 1.3087647982708275, + "learning_rate": 2.844738681226903e-06, + "loss": 0.334, + "step": 22500 + }, + { + "epoch": 0.65, + "grad_norm": 1.401632862964526, + "learning_rate": 2.8443148608599012e-06, + "loss": 0.3011, + "step": 22501 + }, + { + "epoch": 0.65, + "grad_norm": 1.2486446361322763, + "learning_rate": 2.843891059516846e-06, + "loss": 0.2863, + "step": 22502 + }, + { + "epoch": 0.65, + "grad_norm": 1.7019363580613698, + "learning_rate": 2.843467277201477e-06, + "loss": 0.3017, + "step": 22503 + }, + { + "epoch": 0.65, + "grad_norm": 1.393797079268599, + "learning_rate": 2.843043513917531e-06, + "loss": 0.2886, + "step": 22504 + }, + { + "epoch": 0.65, + "grad_norm": 1.1797418805767639, + "learning_rate": 2.842619769668751e-06, + "loss": 0.2753, + "step": 22505 + }, + { + "epoch": 0.65, + "grad_norm": 1.3179453947776436, + "learning_rate": 2.8421960444588743e-06, + "loss": 0.299, + "step": 22506 + }, + { + "epoch": 0.65, + "grad_norm": 1.3250381660103199, + "learning_rate": 2.8417723382916418e-06, + "loss": 0.2823, + "step": 22507 + }, + { + "epoch": 0.65, + "grad_norm": 1.2610130106670336, + "learning_rate": 2.8413486511707925e-06, + "loss": 0.2838, + "step": 22508 + }, + { + "epoch": 0.65, + "grad_norm": 0.9656055217184508, + "learning_rate": 2.840924983100065e-06, + "loss": 0.5576, + "step": 22509 + }, + { + "epoch": 0.65, + "grad_norm": 1.3148141768132977, + "learning_rate": 2.8405013340831995e-06, + "loss": 0.294, + "step": 22510 + }, + { + "epoch": 0.65, + "grad_norm": 1.4679718184048296, + "learning_rate": 2.8400777041239325e-06, + "loss": 0.2716, + "step": 22511 + }, + { + "epoch": 0.65, + "grad_norm": 1.307037545822666, + "learning_rate": 2.8396540932260065e-06, + "loss": 0.2906, + "step": 22512 + }, + { + "epoch": 0.65, + "grad_norm": 1.883012482611998, + "learning_rate": 2.839230501393155e-06, + "loss": 0.2935, + "step": 22513 + }, + { + "epoch": 0.65, + "grad_norm": 1.3457142188215518, + "learning_rate": 2.8388069286291176e-06, + "loss": 0.2918, + "step": 22514 + }, + { + "epoch": 0.65, + "grad_norm": 1.4025354245017991, + "learning_rate": 2.838383374937634e-06, + "loss": 0.2793, + "step": 22515 + }, + { + "epoch": 0.65, + "grad_norm": 1.289076063136364, + "learning_rate": 2.8379598403224407e-06, + "loss": 0.2959, + "step": 22516 + }, + { + "epoch": 0.65, + "grad_norm": 1.210477559997183, + "learning_rate": 2.8375363247872756e-06, + "loss": 0.2823, + "step": 22517 + }, + { + "epoch": 0.65, + "grad_norm": 1.3882346506496375, + "learning_rate": 2.8371128283358764e-06, + "loss": 0.2734, + "step": 22518 + }, + { + "epoch": 0.65, + "grad_norm": 1.2653946671791423, + "learning_rate": 2.8366893509719805e-06, + "loss": 0.2902, + "step": 22519 + }, + { + "epoch": 0.65, + "grad_norm": 1.4286713811584388, + "learning_rate": 2.8362658926993276e-06, + "loss": 0.2885, + "step": 22520 + }, + { + "epoch": 0.65, + "grad_norm": 1.2752021854729916, + "learning_rate": 2.83584245352165e-06, + "loss": 0.3015, + "step": 22521 + }, + { + "epoch": 0.65, + "grad_norm": 1.3569970409168135, + "learning_rate": 2.8354190334426877e-06, + "loss": 0.3239, + "step": 22522 + }, + { + "epoch": 0.65, + "grad_norm": 1.3481943346355945, + "learning_rate": 2.8349956324661754e-06, + "loss": 0.2853, + "step": 22523 + }, + { + "epoch": 0.65, + "grad_norm": 1.3714685785292877, + "learning_rate": 2.8345722505958507e-06, + "loss": 0.3061, + "step": 22524 + }, + { + "epoch": 0.65, + "grad_norm": 1.4663860127243766, + "learning_rate": 2.83414888783545e-06, + "loss": 0.2987, + "step": 22525 + }, + { + "epoch": 0.65, + "grad_norm": 1.5076140642959768, + "learning_rate": 2.833725544188709e-06, + "loss": 0.3025, + "step": 22526 + }, + { + "epoch": 0.65, + "grad_norm": 1.2241153746696491, + "learning_rate": 2.833302219659364e-06, + "loss": 0.2918, + "step": 22527 + }, + { + "epoch": 0.65, + "grad_norm": 1.3919183547867529, + "learning_rate": 2.8328789142511516e-06, + "loss": 0.3031, + "step": 22528 + }, + { + "epoch": 0.65, + "grad_norm": 1.3984933831139954, + "learning_rate": 2.8324556279678064e-06, + "loss": 0.2816, + "step": 22529 + }, + { + "epoch": 0.65, + "grad_norm": 1.5475880102728727, + "learning_rate": 2.8320323608130646e-06, + "loss": 0.3083, + "step": 22530 + }, + { + "epoch": 0.65, + "grad_norm": 1.3674336512599914, + "learning_rate": 2.831609112790663e-06, + "loss": 0.3072, + "step": 22531 + }, + { + "epoch": 0.65, + "grad_norm": 1.4969374516617553, + "learning_rate": 2.831185883904333e-06, + "loss": 0.2984, + "step": 22532 + }, + { + "epoch": 0.65, + "grad_norm": 1.3579368866985637, + "learning_rate": 2.830762674157812e-06, + "loss": 0.285, + "step": 22533 + }, + { + "epoch": 0.65, + "grad_norm": 1.248347929100741, + "learning_rate": 2.8303394835548337e-06, + "loss": 0.2695, + "step": 22534 + }, + { + "epoch": 0.65, + "grad_norm": 1.3769497072145018, + "learning_rate": 2.8299163120991343e-06, + "loss": 0.2876, + "step": 22535 + }, + { + "epoch": 0.65, + "grad_norm": 1.256937860461662, + "learning_rate": 2.829493159794447e-06, + "loss": 0.2975, + "step": 22536 + }, + { + "epoch": 0.65, + "grad_norm": 1.3119360731819203, + "learning_rate": 2.829070026644507e-06, + "loss": 0.2923, + "step": 22537 + }, + { + "epoch": 0.65, + "grad_norm": 2.1214104999446293, + "learning_rate": 2.828646912653048e-06, + "loss": 0.3023, + "step": 22538 + }, + { + "epoch": 0.65, + "grad_norm": 1.3254709386774377, + "learning_rate": 2.828223817823806e-06, + "loss": 0.285, + "step": 22539 + }, + { + "epoch": 0.65, + "grad_norm": 1.434475876107743, + "learning_rate": 2.827800742160511e-06, + "loss": 0.3006, + "step": 22540 + }, + { + "epoch": 0.65, + "grad_norm": 1.6165596293066338, + "learning_rate": 2.8273776856668983e-06, + "loss": 0.2816, + "step": 22541 + }, + { + "epoch": 0.65, + "grad_norm": 1.5189803381594746, + "learning_rate": 2.8269546483467016e-06, + "loss": 0.2856, + "step": 22542 + }, + { + "epoch": 0.65, + "grad_norm": 1.5056782759142022, + "learning_rate": 2.826531630203655e-06, + "loss": 0.2788, + "step": 22543 + }, + { + "epoch": 0.65, + "grad_norm": 0.9932307565796924, + "learning_rate": 2.82610863124149e-06, + "loss": 0.5301, + "step": 22544 + }, + { + "epoch": 0.65, + "grad_norm": 1.2198840917618068, + "learning_rate": 2.8256856514639407e-06, + "loss": 0.2899, + "step": 22545 + }, + { + "epoch": 0.65, + "grad_norm": 1.494064078072282, + "learning_rate": 2.8252626908747392e-06, + "loss": 0.3025, + "step": 22546 + }, + { + "epoch": 0.65, + "grad_norm": 1.5335881007029597, + "learning_rate": 2.824839749477619e-06, + "loss": 0.2873, + "step": 22547 + }, + { + "epoch": 0.65, + "grad_norm": 1.333574327421076, + "learning_rate": 2.824416827276315e-06, + "loss": 0.2997, + "step": 22548 + }, + { + "epoch": 0.65, + "grad_norm": 1.8783637693938176, + "learning_rate": 2.823993924274554e-06, + "loss": 0.2986, + "step": 22549 + }, + { + "epoch": 0.65, + "grad_norm": 1.2693573004586645, + "learning_rate": 2.823571040476072e-06, + "loss": 0.2895, + "step": 22550 + }, + { + "epoch": 0.65, + "grad_norm": 3.819827818271199, + "learning_rate": 2.8231481758845985e-06, + "loss": 0.282, + "step": 22551 + }, + { + "epoch": 0.65, + "grad_norm": 1.4104265392086879, + "learning_rate": 2.8227253305038667e-06, + "loss": 0.2835, + "step": 22552 + }, + { + "epoch": 0.65, + "grad_norm": 1.239913093050551, + "learning_rate": 2.8223025043376073e-06, + "loss": 0.297, + "step": 22553 + }, + { + "epoch": 0.65, + "grad_norm": 1.3426313914355308, + "learning_rate": 2.821879697389553e-06, + "loss": 0.2768, + "step": 22554 + }, + { + "epoch": 0.65, + "grad_norm": 1.4397727113192398, + "learning_rate": 2.821456909663434e-06, + "loss": 0.3218, + "step": 22555 + }, + { + "epoch": 0.65, + "grad_norm": 1.6910013862939584, + "learning_rate": 2.8210341411629825e-06, + "loss": 0.2928, + "step": 22556 + }, + { + "epoch": 0.65, + "grad_norm": 1.2379817634832373, + "learning_rate": 2.8206113918919287e-06, + "loss": 0.2936, + "step": 22557 + }, + { + "epoch": 0.65, + "grad_norm": 1.3905530324143645, + "learning_rate": 2.8201886618540053e-06, + "loss": 0.2894, + "step": 22558 + }, + { + "epoch": 0.65, + "grad_norm": 1.2581566007358762, + "learning_rate": 2.8197659510529395e-06, + "loss": 0.2858, + "step": 22559 + }, + { + "epoch": 0.65, + "grad_norm": 1.2944050881701263, + "learning_rate": 2.819343259492463e-06, + "loss": 0.2932, + "step": 22560 + }, + { + "epoch": 0.65, + "grad_norm": 1.2295309459460542, + "learning_rate": 2.818920587176307e-06, + "loss": 0.2855, + "step": 22561 + }, + { + "epoch": 0.65, + "grad_norm": 1.2899722578416675, + "learning_rate": 2.818497934108201e-06, + "loss": 0.3046, + "step": 22562 + }, + { + "epoch": 0.65, + "grad_norm": 1.3097782271041332, + "learning_rate": 2.8180753002918735e-06, + "loss": 0.2735, + "step": 22563 + }, + { + "epoch": 0.65, + "grad_norm": 0.952175653171225, + "learning_rate": 2.817652685731057e-06, + "loss": 0.589, + "step": 22564 + }, + { + "epoch": 0.65, + "grad_norm": 1.424126646440629, + "learning_rate": 2.8172300904294796e-06, + "loss": 0.2843, + "step": 22565 + }, + { + "epoch": 0.65, + "grad_norm": 1.393752424415213, + "learning_rate": 2.816807514390871e-06, + "loss": 0.3006, + "step": 22566 + }, + { + "epoch": 0.65, + "grad_norm": 1.0135880110512216, + "learning_rate": 2.8163849576189615e-06, + "loss": 0.5842, + "step": 22567 + }, + { + "epoch": 0.65, + "grad_norm": 1.3073387073033151, + "learning_rate": 2.8159624201174782e-06, + "loss": 0.2804, + "step": 22568 + }, + { + "epoch": 0.65, + "grad_norm": 0.9730727696955693, + "learning_rate": 2.8155399018901497e-06, + "loss": 0.5814, + "step": 22569 + }, + { + "epoch": 0.65, + "grad_norm": 1.5236481711162386, + "learning_rate": 2.815117402940707e-06, + "loss": 0.2882, + "step": 22570 + }, + { + "epoch": 0.65, + "grad_norm": 0.9487534339974519, + "learning_rate": 2.814694923272876e-06, + "loss": 0.5238, + "step": 22571 + }, + { + "epoch": 0.65, + "grad_norm": 1.6199751833831735, + "learning_rate": 2.8142724628903872e-06, + "loss": 0.2939, + "step": 22572 + }, + { + "epoch": 0.65, + "grad_norm": 1.2190991700523812, + "learning_rate": 2.8138500217969684e-06, + "loss": 0.2853, + "step": 22573 + }, + { + "epoch": 0.65, + "grad_norm": 2.449031689012091, + "learning_rate": 2.8134275999963474e-06, + "loss": 0.2904, + "step": 22574 + }, + { + "epoch": 0.65, + "grad_norm": 1.3676911965241885, + "learning_rate": 2.8130051974922533e-06, + "loss": 0.3002, + "step": 22575 + }, + { + "epoch": 0.65, + "grad_norm": 1.2053820532899342, + "learning_rate": 2.8125828142884114e-06, + "loss": 0.3155, + "step": 22576 + }, + { + "epoch": 0.65, + "grad_norm": 1.2762590148574517, + "learning_rate": 2.8121604503885525e-06, + "loss": 0.2841, + "step": 22577 + }, + { + "epoch": 0.65, + "grad_norm": 1.2836830493860318, + "learning_rate": 2.811738105796399e-06, + "loss": 0.2774, + "step": 22578 + }, + { + "epoch": 0.65, + "grad_norm": 1.2723527712355944, + "learning_rate": 2.8113157805156823e-06, + "loss": 0.3343, + "step": 22579 + }, + { + "epoch": 0.65, + "grad_norm": 1.3473559938427788, + "learning_rate": 2.810893474550127e-06, + "loss": 0.2838, + "step": 22580 + }, + { + "epoch": 0.65, + "grad_norm": 1.314619931283803, + "learning_rate": 2.8104711879034617e-06, + "loss": 0.2976, + "step": 22581 + }, + { + "epoch": 0.65, + "grad_norm": 1.4378299875594664, + "learning_rate": 2.8100489205794122e-06, + "loss": 0.2909, + "step": 22582 + }, + { + "epoch": 0.66, + "grad_norm": 1.2838319470916675, + "learning_rate": 2.8096266725817057e-06, + "loss": 0.2797, + "step": 22583 + }, + { + "epoch": 0.66, + "grad_norm": 1.368932294368463, + "learning_rate": 2.8092044439140674e-06, + "loss": 0.3024, + "step": 22584 + }, + { + "epoch": 0.66, + "grad_norm": 1.4034024412824355, + "learning_rate": 2.8087822345802246e-06, + "loss": 0.2966, + "step": 22585 + }, + { + "epoch": 0.66, + "grad_norm": 1.1848598816157956, + "learning_rate": 2.8083600445839043e-06, + "loss": 0.2542, + "step": 22586 + }, + { + "epoch": 0.66, + "grad_norm": 1.3618631248387327, + "learning_rate": 2.80793787392883e-06, + "loss": 0.2988, + "step": 22587 + }, + { + "epoch": 0.66, + "grad_norm": 1.319851783492016, + "learning_rate": 2.807515722618728e-06, + "loss": 0.2867, + "step": 22588 + }, + { + "epoch": 0.66, + "grad_norm": 1.2706267919407066, + "learning_rate": 2.8070935906573234e-06, + "loss": 0.296, + "step": 22589 + }, + { + "epoch": 0.66, + "grad_norm": 1.3743662186015155, + "learning_rate": 2.806671478048342e-06, + "loss": 0.2853, + "step": 22590 + }, + { + "epoch": 0.66, + "grad_norm": 1.6118520543873498, + "learning_rate": 2.8062493847955095e-06, + "loss": 0.2854, + "step": 22591 + }, + { + "epoch": 0.66, + "grad_norm": 1.5266653259907823, + "learning_rate": 2.8058273109025514e-06, + "loss": 0.2999, + "step": 22592 + }, + { + "epoch": 0.66, + "grad_norm": 1.293507711095169, + "learning_rate": 2.805405256373191e-06, + "loss": 0.2693, + "step": 22593 + }, + { + "epoch": 0.66, + "grad_norm": 1.3261864375493353, + "learning_rate": 2.804983221211155e-06, + "loss": 0.2884, + "step": 22594 + }, + { + "epoch": 0.66, + "grad_norm": 1.2907898524323107, + "learning_rate": 2.8045612054201656e-06, + "loss": 0.2873, + "step": 22595 + }, + { + "epoch": 0.66, + "grad_norm": 1.469285385486255, + "learning_rate": 2.8041392090039465e-06, + "loss": 0.2979, + "step": 22596 + }, + { + "epoch": 0.66, + "grad_norm": 1.322690724366199, + "learning_rate": 2.8037172319662243e-06, + "loss": 0.3031, + "step": 22597 + }, + { + "epoch": 0.66, + "grad_norm": 1.419076422337895, + "learning_rate": 2.8032952743107217e-06, + "loss": 0.2801, + "step": 22598 + }, + { + "epoch": 0.66, + "grad_norm": 1.4060091230993619, + "learning_rate": 2.8028733360411626e-06, + "loss": 0.2803, + "step": 22599 + }, + { + "epoch": 0.66, + "grad_norm": 1.2873134428478907, + "learning_rate": 2.8024514171612705e-06, + "loss": 0.2873, + "step": 22600 + }, + { + "epoch": 0.66, + "grad_norm": 1.2799133590518712, + "learning_rate": 2.802029517674769e-06, + "loss": 0.3012, + "step": 22601 + }, + { + "epoch": 0.66, + "grad_norm": 2.0543132516563447, + "learning_rate": 2.801607637585382e-06, + "loss": 0.2813, + "step": 22602 + }, + { + "epoch": 0.66, + "grad_norm": 1.356674357775465, + "learning_rate": 2.801185776896833e-06, + "loss": 0.2894, + "step": 22603 + }, + { + "epoch": 0.66, + "grad_norm": 1.6233526760760457, + "learning_rate": 2.8007639356128424e-06, + "loss": 0.3146, + "step": 22604 + }, + { + "epoch": 0.66, + "grad_norm": 1.2991464737657101, + "learning_rate": 2.8003421137371367e-06, + "loss": 0.2876, + "step": 22605 + }, + { + "epoch": 0.66, + "grad_norm": 1.354602153669069, + "learning_rate": 2.7999203112734343e-06, + "loss": 0.2832, + "step": 22606 + }, + { + "epoch": 0.66, + "grad_norm": 1.4668695109325394, + "learning_rate": 2.7994985282254588e-06, + "loss": 0.2941, + "step": 22607 + }, + { + "epoch": 0.66, + "grad_norm": 1.2941778859763204, + "learning_rate": 2.799076764596934e-06, + "loss": 0.286, + "step": 22608 + }, + { + "epoch": 0.66, + "grad_norm": 1.4366179865304034, + "learning_rate": 2.7986550203915807e-06, + "loss": 0.3102, + "step": 22609 + }, + { + "epoch": 0.66, + "grad_norm": 1.3312947101198442, + "learning_rate": 2.7982332956131213e-06, + "loss": 0.2788, + "step": 22610 + }, + { + "epoch": 0.66, + "grad_norm": 1.627718974116068, + "learning_rate": 2.797811590265278e-06, + "loss": 0.2647, + "step": 22611 + }, + { + "epoch": 0.66, + "grad_norm": 1.3354878312924885, + "learning_rate": 2.7973899043517714e-06, + "loss": 0.3068, + "step": 22612 + }, + { + "epoch": 0.66, + "grad_norm": 1.2123731450182702, + "learning_rate": 2.7969682378763254e-06, + "loss": 0.2673, + "step": 22613 + }, + { + "epoch": 0.66, + "grad_norm": 1.243406251554518, + "learning_rate": 2.796546590842657e-06, + "loss": 0.2754, + "step": 22614 + }, + { + "epoch": 0.66, + "grad_norm": 1.6389065505051645, + "learning_rate": 2.7961249632544896e-06, + "loss": 0.323, + "step": 22615 + }, + { + "epoch": 0.66, + "grad_norm": 1.2901595965738255, + "learning_rate": 2.795703355115544e-06, + "loss": 0.3213, + "step": 22616 + }, + { + "epoch": 0.66, + "grad_norm": 1.304035971985282, + "learning_rate": 2.79528176642954e-06, + "loss": 0.2868, + "step": 22617 + }, + { + "epoch": 0.66, + "grad_norm": 1.3622399524587163, + "learning_rate": 2.7948601972001997e-06, + "loss": 0.3091, + "step": 22618 + }, + { + "epoch": 0.66, + "grad_norm": 1.260413847633661, + "learning_rate": 2.7944386474312423e-06, + "loss": 0.2981, + "step": 22619 + }, + { + "epoch": 0.66, + "grad_norm": 1.5264945268970853, + "learning_rate": 2.7940171171263887e-06, + "loss": 0.2669, + "step": 22620 + }, + { + "epoch": 0.66, + "grad_norm": 1.3124828230540435, + "learning_rate": 2.7935956062893576e-06, + "loss": 0.3101, + "step": 22621 + }, + { + "epoch": 0.66, + "grad_norm": 1.240887845498421, + "learning_rate": 2.793174114923872e-06, + "loss": 0.2984, + "step": 22622 + }, + { + "epoch": 0.66, + "grad_norm": 1.3608638445237349, + "learning_rate": 2.7927526430336484e-06, + "loss": 0.2859, + "step": 22623 + }, + { + "epoch": 0.66, + "grad_norm": 1.4252392869596167, + "learning_rate": 2.792331190622406e-06, + "loss": 0.2977, + "step": 22624 + }, + { + "epoch": 0.66, + "grad_norm": 1.2761485069400353, + "learning_rate": 2.7919097576938657e-06, + "loss": 0.2792, + "step": 22625 + }, + { + "epoch": 0.66, + "grad_norm": 1.2624389648774121, + "learning_rate": 2.791488344251746e-06, + "loss": 0.267, + "step": 22626 + }, + { + "epoch": 0.66, + "grad_norm": 1.513569700404196, + "learning_rate": 2.7910669502997667e-06, + "loss": 0.3209, + "step": 22627 + }, + { + "epoch": 0.66, + "grad_norm": 1.2750165033980385, + "learning_rate": 2.790645575841646e-06, + "loss": 0.3041, + "step": 22628 + }, + { + "epoch": 0.66, + "grad_norm": 2.2008825733377, + "learning_rate": 2.7902242208811025e-06, + "loss": 0.2819, + "step": 22629 + }, + { + "epoch": 0.66, + "grad_norm": 2.0123289344866544, + "learning_rate": 2.789802885421856e-06, + "loss": 0.2861, + "step": 22630 + }, + { + "epoch": 0.66, + "grad_norm": 1.1722310950919743, + "learning_rate": 2.789381569467623e-06, + "loss": 0.259, + "step": 22631 + }, + { + "epoch": 0.66, + "grad_norm": 1.3179160710810016, + "learning_rate": 2.7889602730221233e-06, + "loss": 0.2957, + "step": 22632 + }, + { + "epoch": 0.66, + "grad_norm": 1.4140871262879495, + "learning_rate": 2.788538996089072e-06, + "loss": 0.2919, + "step": 22633 + }, + { + "epoch": 0.66, + "grad_norm": 1.3679347163634419, + "learning_rate": 2.788117738672188e-06, + "loss": 0.2944, + "step": 22634 + }, + { + "epoch": 0.66, + "grad_norm": 1.2145386970458012, + "learning_rate": 2.7876965007751897e-06, + "loss": 0.2819, + "step": 22635 + }, + { + "epoch": 0.66, + "grad_norm": 1.3346548603829955, + "learning_rate": 2.7872752824017946e-06, + "loss": 0.2815, + "step": 22636 + }, + { + "epoch": 0.66, + "grad_norm": 1.3174371461795702, + "learning_rate": 2.78685408355572e-06, + "loss": 0.285, + "step": 22637 + }, + { + "epoch": 0.66, + "grad_norm": 1.447820768553391, + "learning_rate": 2.7864329042406823e-06, + "loss": 0.2983, + "step": 22638 + }, + { + "epoch": 0.66, + "grad_norm": 1.523199672981974, + "learning_rate": 2.7860117444603983e-06, + "loss": 0.2963, + "step": 22639 + }, + { + "epoch": 0.66, + "grad_norm": 1.6432480436918822, + "learning_rate": 2.7855906042185865e-06, + "loss": 0.2681, + "step": 22640 + }, + { + "epoch": 0.66, + "grad_norm": 1.3447891509314724, + "learning_rate": 2.785169483518963e-06, + "loss": 0.3013, + "step": 22641 + }, + { + "epoch": 0.66, + "grad_norm": 1.5787190844714447, + "learning_rate": 2.784748382365242e-06, + "loss": 0.2977, + "step": 22642 + }, + { + "epoch": 0.66, + "grad_norm": 1.1993138206170812, + "learning_rate": 2.7843273007611405e-06, + "loss": 0.2697, + "step": 22643 + }, + { + "epoch": 0.66, + "grad_norm": 1.3876644991082474, + "learning_rate": 2.7839062387103753e-06, + "loss": 0.2681, + "step": 22644 + }, + { + "epoch": 0.66, + "grad_norm": 1.6008908285448264, + "learning_rate": 2.783485196216663e-06, + "loss": 0.2828, + "step": 22645 + }, + { + "epoch": 0.66, + "grad_norm": 1.5761106233730258, + "learning_rate": 2.7830641732837175e-06, + "loss": 0.3064, + "step": 22646 + }, + { + "epoch": 0.66, + "grad_norm": 1.2545731444896222, + "learning_rate": 2.7826431699152556e-06, + "loss": 0.2798, + "step": 22647 + }, + { + "epoch": 0.66, + "grad_norm": 1.461740946609946, + "learning_rate": 2.7822221861149924e-06, + "loss": 0.3033, + "step": 22648 + }, + { + "epoch": 0.66, + "grad_norm": 1.3200366023523629, + "learning_rate": 2.7818012218866447e-06, + "loss": 0.3065, + "step": 22649 + }, + { + "epoch": 0.66, + "grad_norm": 1.0505944286174902, + "learning_rate": 2.781380277233924e-06, + "loss": 0.6319, + "step": 22650 + }, + { + "epoch": 0.66, + "grad_norm": 1.5411449708400495, + "learning_rate": 2.780959352160547e-06, + "loss": 0.3097, + "step": 22651 + }, + { + "epoch": 0.66, + "grad_norm": 1.3563467227742518, + "learning_rate": 2.7805384466702278e-06, + "loss": 0.2917, + "step": 22652 + }, + { + "epoch": 0.66, + "grad_norm": 1.2491403486247845, + "learning_rate": 2.7801175607666826e-06, + "loss": 0.2694, + "step": 22653 + }, + { + "epoch": 0.66, + "grad_norm": 0.9777081274291786, + "learning_rate": 2.7796966944536237e-06, + "loss": 0.6157, + "step": 22654 + }, + { + "epoch": 0.66, + "grad_norm": 1.2915916232223181, + "learning_rate": 2.779275847734766e-06, + "loss": 0.2839, + "step": 22655 + }, + { + "epoch": 0.66, + "grad_norm": 1.5609322952947324, + "learning_rate": 2.778855020613824e-06, + "loss": 0.3168, + "step": 22656 + }, + { + "epoch": 0.66, + "grad_norm": 1.4163148760600777, + "learning_rate": 2.778434213094511e-06, + "loss": 0.2766, + "step": 22657 + }, + { + "epoch": 0.66, + "grad_norm": 1.5461907558559285, + "learning_rate": 2.7780134251805423e-06, + "loss": 0.2962, + "step": 22658 + }, + { + "epoch": 0.66, + "grad_norm": 1.320907978259287, + "learning_rate": 2.7775926568756283e-06, + "loss": 0.2853, + "step": 22659 + }, + { + "epoch": 0.66, + "grad_norm": 1.2854148785160702, + "learning_rate": 2.777171908183486e-06, + "loss": 0.2845, + "step": 22660 + }, + { + "epoch": 0.66, + "grad_norm": 1.461509032948653, + "learning_rate": 2.7767511791078238e-06, + "loss": 0.295, + "step": 22661 + }, + { + "epoch": 0.66, + "grad_norm": 1.2607734074570813, + "learning_rate": 2.7763304696523576e-06, + "loss": 0.261, + "step": 22662 + }, + { + "epoch": 0.66, + "grad_norm": 1.4445939514683905, + "learning_rate": 2.775909779820799e-06, + "loss": 0.2997, + "step": 22663 + }, + { + "epoch": 0.66, + "grad_norm": 1.3119531110139615, + "learning_rate": 2.775489109616861e-06, + "loss": 0.2908, + "step": 22664 + }, + { + "epoch": 0.66, + "grad_norm": 1.3257449183696852, + "learning_rate": 2.7750684590442566e-06, + "loss": 0.296, + "step": 22665 + }, + { + "epoch": 0.66, + "grad_norm": 1.486687206402072, + "learning_rate": 2.774647828106698e-06, + "loss": 0.3043, + "step": 22666 + }, + { + "epoch": 0.66, + "grad_norm": 1.1828568125889591, + "learning_rate": 2.774227216807897e-06, + "loss": 0.2729, + "step": 22667 + }, + { + "epoch": 0.66, + "grad_norm": 1.3341238186614777, + "learning_rate": 2.773806625151568e-06, + "loss": 0.28, + "step": 22668 + }, + { + "epoch": 0.66, + "grad_norm": 1.2805768494180885, + "learning_rate": 2.773386053141417e-06, + "loss": 0.2888, + "step": 22669 + }, + { + "epoch": 0.66, + "grad_norm": 1.3971840999647815, + "learning_rate": 2.7729655007811594e-06, + "loss": 0.2881, + "step": 22670 + }, + { + "epoch": 0.66, + "grad_norm": 1.3747505701524823, + "learning_rate": 2.7725449680745065e-06, + "loss": 0.2945, + "step": 22671 + }, + { + "epoch": 0.66, + "grad_norm": 1.2899021319505208, + "learning_rate": 2.772124455025168e-06, + "loss": 0.2893, + "step": 22672 + }, + { + "epoch": 0.66, + "grad_norm": 1.262390917241729, + "learning_rate": 2.771703961636857e-06, + "loss": 0.2632, + "step": 22673 + }, + { + "epoch": 0.66, + "grad_norm": 1.0017977511167595, + "learning_rate": 2.7712834879132823e-06, + "loss": 0.543, + "step": 22674 + }, + { + "epoch": 0.66, + "grad_norm": 1.30847282152602, + "learning_rate": 2.7708630338581554e-06, + "loss": 0.2811, + "step": 22675 + }, + { + "epoch": 0.66, + "grad_norm": 1.7269111199582854, + "learning_rate": 2.7704425994751883e-06, + "loss": 0.3256, + "step": 22676 + }, + { + "epoch": 0.66, + "grad_norm": 1.4068647568726453, + "learning_rate": 2.770022184768091e-06, + "loss": 0.2985, + "step": 22677 + }, + { + "epoch": 0.66, + "grad_norm": 1.3666184817073619, + "learning_rate": 2.7696017897405704e-06, + "loss": 0.2843, + "step": 22678 + }, + { + "epoch": 0.66, + "grad_norm": 1.3463954429288734, + "learning_rate": 2.7691814143963392e-06, + "loss": 0.3194, + "step": 22679 + }, + { + "epoch": 0.66, + "grad_norm": 1.4991214506951094, + "learning_rate": 2.7687610587391066e-06, + "loss": 0.2843, + "step": 22680 + }, + { + "epoch": 0.66, + "grad_norm": 1.3113173584032307, + "learning_rate": 2.768340722772582e-06, + "loss": 0.2944, + "step": 22681 + }, + { + "epoch": 0.66, + "grad_norm": 1.3342001445400764, + "learning_rate": 2.767920406500476e-06, + "loss": 0.2764, + "step": 22682 + }, + { + "epoch": 0.66, + "grad_norm": 1.3019234851421073, + "learning_rate": 2.7675001099264974e-06, + "loss": 0.2757, + "step": 22683 + }, + { + "epoch": 0.66, + "grad_norm": 1.4081270267114911, + "learning_rate": 2.7670798330543547e-06, + "loss": 0.2948, + "step": 22684 + }, + { + "epoch": 0.66, + "grad_norm": 1.3371028517169543, + "learning_rate": 2.766659575887759e-06, + "loss": 0.2994, + "step": 22685 + }, + { + "epoch": 0.66, + "grad_norm": 1.3319539946898944, + "learning_rate": 2.766239338430415e-06, + "loss": 0.2873, + "step": 22686 + }, + { + "epoch": 0.66, + "grad_norm": 1.3800285132270893, + "learning_rate": 2.7658191206860346e-06, + "loss": 0.2965, + "step": 22687 + }, + { + "epoch": 0.66, + "grad_norm": 2.3183209644641742, + "learning_rate": 2.7653989226583266e-06, + "loss": 0.29, + "step": 22688 + }, + { + "epoch": 0.66, + "grad_norm": 1.5163366748039917, + "learning_rate": 2.7649787443509962e-06, + "loss": 0.2979, + "step": 22689 + }, + { + "epoch": 0.66, + "grad_norm": 1.5253118247986286, + "learning_rate": 2.764558585767753e-06, + "loss": 0.2741, + "step": 22690 + }, + { + "epoch": 0.66, + "grad_norm": 1.6082585777974945, + "learning_rate": 2.764138446912305e-06, + "loss": 0.274, + "step": 22691 + }, + { + "epoch": 0.66, + "grad_norm": 1.5330593787259936, + "learning_rate": 2.7637183277883607e-06, + "loss": 0.3113, + "step": 22692 + }, + { + "epoch": 0.66, + "grad_norm": 1.2602406096295273, + "learning_rate": 2.7632982283996257e-06, + "loss": 0.3143, + "step": 22693 + }, + { + "epoch": 0.66, + "grad_norm": 1.3481832681740755, + "learning_rate": 2.76287814874981e-06, + "loss": 0.2932, + "step": 22694 + }, + { + "epoch": 0.66, + "grad_norm": 0.9227471235280461, + "learning_rate": 2.7624580888426185e-06, + "loss": 0.6094, + "step": 22695 + }, + { + "epoch": 0.66, + "grad_norm": 1.2848318690279235, + "learning_rate": 2.762038048681761e-06, + "loss": 0.2718, + "step": 22696 + }, + { + "epoch": 0.66, + "grad_norm": 1.0144792574448611, + "learning_rate": 2.7616180282709414e-06, + "loss": 0.5605, + "step": 22697 + }, + { + "epoch": 0.66, + "grad_norm": 1.6300499450514478, + "learning_rate": 2.7611980276138673e-06, + "loss": 0.3181, + "step": 22698 + }, + { + "epoch": 0.66, + "grad_norm": 1.6083010241156765, + "learning_rate": 2.7607780467142454e-06, + "loss": 0.3159, + "step": 22699 + }, + { + "epoch": 0.66, + "grad_norm": 1.1888782614711275, + "learning_rate": 2.760358085575782e-06, + "loss": 0.3128, + "step": 22700 + }, + { + "epoch": 0.66, + "grad_norm": 1.2694901229514026, + "learning_rate": 2.7599381442021833e-06, + "loss": 0.3111, + "step": 22701 + }, + { + "epoch": 0.66, + "grad_norm": 1.1991107519452595, + "learning_rate": 2.759518222597155e-06, + "loss": 0.2712, + "step": 22702 + }, + { + "epoch": 0.66, + "grad_norm": 1.3003760786932665, + "learning_rate": 2.759098320764404e-06, + "loss": 0.2979, + "step": 22703 + }, + { + "epoch": 0.66, + "grad_norm": 1.2403236998119653, + "learning_rate": 2.7586784387076366e-06, + "loss": 0.2808, + "step": 22704 + }, + { + "epoch": 0.66, + "grad_norm": 1.3006241255551283, + "learning_rate": 2.758258576430555e-06, + "loss": 0.2896, + "step": 22705 + }, + { + "epoch": 0.66, + "grad_norm": 1.1428332533450243, + "learning_rate": 2.757838733936866e-06, + "loss": 0.275, + "step": 22706 + }, + { + "epoch": 0.66, + "grad_norm": 1.365517882607999, + "learning_rate": 2.757418911230276e-06, + "loss": 0.2775, + "step": 22707 + }, + { + "epoch": 0.66, + "grad_norm": 1.3136161683932368, + "learning_rate": 2.7569991083144883e-06, + "loss": 0.3168, + "step": 22708 + }, + { + "epoch": 0.66, + "grad_norm": 1.5070699363516111, + "learning_rate": 2.756579325193208e-06, + "loss": 0.2863, + "step": 22709 + }, + { + "epoch": 0.66, + "grad_norm": 1.2264055321168341, + "learning_rate": 2.756159561870141e-06, + "loss": 0.287, + "step": 22710 + }, + { + "epoch": 0.66, + "grad_norm": 1.2332121884983134, + "learning_rate": 2.75573981834899e-06, + "loss": 0.2809, + "step": 22711 + }, + { + "epoch": 0.66, + "grad_norm": 1.4624332674160578, + "learning_rate": 2.7553200946334603e-06, + "loss": 0.305, + "step": 22712 + }, + { + "epoch": 0.66, + "grad_norm": 1.2464645824351017, + "learning_rate": 2.7549003907272574e-06, + "loss": 0.2732, + "step": 22713 + }, + { + "epoch": 0.66, + "grad_norm": 1.3118764469451378, + "learning_rate": 2.754480706634082e-06, + "loss": 0.3065, + "step": 22714 + }, + { + "epoch": 0.66, + "grad_norm": 1.247014593352671, + "learning_rate": 2.7540610423576384e-06, + "loss": 0.2708, + "step": 22715 + }, + { + "epoch": 0.66, + "grad_norm": 1.3344037093837848, + "learning_rate": 2.7536413979016318e-06, + "loss": 0.2791, + "step": 22716 + }, + { + "epoch": 0.66, + "grad_norm": 1.447861165181736, + "learning_rate": 2.7532217732697663e-06, + "loss": 0.2967, + "step": 22717 + }, + { + "epoch": 0.66, + "grad_norm": 1.3348146143477042, + "learning_rate": 2.752802168465742e-06, + "loss": 0.2949, + "step": 22718 + }, + { + "epoch": 0.66, + "grad_norm": 1.3274775379899058, + "learning_rate": 2.7523825834932634e-06, + "loss": 0.2787, + "step": 22719 + }, + { + "epoch": 0.66, + "grad_norm": 1.1951233294315033, + "learning_rate": 2.751963018356033e-06, + "loss": 0.2728, + "step": 22720 + }, + { + "epoch": 0.66, + "grad_norm": 1.2459851248162344, + "learning_rate": 2.751543473057754e-06, + "loss": 0.2787, + "step": 22721 + }, + { + "epoch": 0.66, + "grad_norm": 0.9026707782432981, + "learning_rate": 2.751123947602129e-06, + "loss": 0.5639, + "step": 22722 + }, + { + "epoch": 0.66, + "grad_norm": 1.3794683726351131, + "learning_rate": 2.7507044419928625e-06, + "loss": 0.3004, + "step": 22723 + }, + { + "epoch": 0.66, + "grad_norm": 1.591793098324997, + "learning_rate": 2.750284956233652e-06, + "loss": 0.2883, + "step": 22724 + }, + { + "epoch": 0.66, + "grad_norm": 1.3246516725127169, + "learning_rate": 2.7498654903282016e-06, + "loss": 0.278, + "step": 22725 + }, + { + "epoch": 0.66, + "grad_norm": 1.4005254212763274, + "learning_rate": 2.749446044280213e-06, + "loss": 0.2707, + "step": 22726 + }, + { + "epoch": 0.66, + "grad_norm": 1.3347062835316708, + "learning_rate": 2.7490266180933884e-06, + "loss": 0.3286, + "step": 22727 + }, + { + "epoch": 0.66, + "grad_norm": 1.3631957937411436, + "learning_rate": 2.7486072117714287e-06, + "loss": 0.3098, + "step": 22728 + }, + { + "epoch": 0.66, + "grad_norm": 1.2716134257441165, + "learning_rate": 2.748187825318035e-06, + "loss": 0.2808, + "step": 22729 + }, + { + "epoch": 0.66, + "grad_norm": 1.2959072434873269, + "learning_rate": 2.7477684587369093e-06, + "loss": 0.2853, + "step": 22730 + }, + { + "epoch": 0.66, + "grad_norm": 1.3431513305759561, + "learning_rate": 2.747349112031752e-06, + "loss": 0.2749, + "step": 22731 + }, + { + "epoch": 0.66, + "grad_norm": 1.2612299781575467, + "learning_rate": 2.7469297852062647e-06, + "loss": 0.2837, + "step": 22732 + }, + { + "epoch": 0.66, + "grad_norm": 1.6323268824163344, + "learning_rate": 2.746510478264146e-06, + "loss": 0.2735, + "step": 22733 + }, + { + "epoch": 0.66, + "grad_norm": 1.3110283769106703, + "learning_rate": 2.746091191209097e-06, + "loss": 0.2938, + "step": 22734 + }, + { + "epoch": 0.66, + "grad_norm": 1.2171801716185, + "learning_rate": 2.7456719240448187e-06, + "loss": 0.2737, + "step": 22735 + }, + { + "epoch": 0.66, + "grad_norm": 0.925826227245557, + "learning_rate": 2.7452526767750102e-06, + "loss": 0.5375, + "step": 22736 + }, + { + "epoch": 0.66, + "grad_norm": 1.3909726324501381, + "learning_rate": 2.7448334494033724e-06, + "loss": 0.3144, + "step": 22737 + }, + { + "epoch": 0.66, + "grad_norm": 1.3479068355051251, + "learning_rate": 2.7444142419336038e-06, + "loss": 0.3006, + "step": 22738 + }, + { + "epoch": 0.66, + "grad_norm": 1.3049981718527728, + "learning_rate": 2.743995054369405e-06, + "loss": 0.2978, + "step": 22739 + }, + { + "epoch": 0.66, + "grad_norm": 1.2293975269403983, + "learning_rate": 2.743575886714477e-06, + "loss": 0.2749, + "step": 22740 + }, + { + "epoch": 0.66, + "grad_norm": 0.8838185109452887, + "learning_rate": 2.743156738972514e-06, + "loss": 0.5235, + "step": 22741 + }, + { + "epoch": 0.66, + "grad_norm": 1.3930701693906147, + "learning_rate": 2.7427376111472183e-06, + "loss": 0.2846, + "step": 22742 + }, + { + "epoch": 0.66, + "grad_norm": 1.4906968016712054, + "learning_rate": 2.742318503242288e-06, + "loss": 0.2896, + "step": 22743 + }, + { + "epoch": 0.66, + "grad_norm": 1.2445933190802085, + "learning_rate": 2.741899415261422e-06, + "loss": 0.2751, + "step": 22744 + }, + { + "epoch": 0.66, + "grad_norm": 1.2738111611416847, + "learning_rate": 2.7414803472083213e-06, + "loss": 0.2875, + "step": 22745 + }, + { + "epoch": 0.66, + "grad_norm": 1.3243284506060493, + "learning_rate": 2.7410612990866792e-06, + "loss": 0.2934, + "step": 22746 + }, + { + "epoch": 0.66, + "grad_norm": 1.2827829458169697, + "learning_rate": 2.7406422709001956e-06, + "loss": 0.2795, + "step": 22747 + }, + { + "epoch": 0.66, + "grad_norm": 1.2598934688894368, + "learning_rate": 2.7402232626525698e-06, + "loss": 0.2729, + "step": 22748 + }, + { + "epoch": 0.66, + "grad_norm": 1.2009503972590527, + "learning_rate": 2.7398042743474983e-06, + "loss": 0.3004, + "step": 22749 + }, + { + "epoch": 0.66, + "grad_norm": 1.317429159293623, + "learning_rate": 2.73938530598868e-06, + "loss": 0.28, + "step": 22750 + }, + { + "epoch": 0.66, + "grad_norm": 1.2704450829012792, + "learning_rate": 2.7389663575798125e-06, + "loss": 0.3496, + "step": 22751 + }, + { + "epoch": 0.66, + "grad_norm": 1.356518733058989, + "learning_rate": 2.7385474291245906e-06, + "loss": 0.2878, + "step": 22752 + }, + { + "epoch": 0.66, + "grad_norm": 1.2270199209696269, + "learning_rate": 2.7381285206267123e-06, + "loss": 0.2906, + "step": 22753 + }, + { + "epoch": 0.66, + "grad_norm": 1.3593908575960134, + "learning_rate": 2.737709632089875e-06, + "loss": 0.2977, + "step": 22754 + }, + { + "epoch": 0.66, + "grad_norm": 1.1976973897928884, + "learning_rate": 2.7372907635177752e-06, + "loss": 0.2803, + "step": 22755 + }, + { + "epoch": 0.66, + "grad_norm": 1.275555220323819, + "learning_rate": 2.73687191491411e-06, + "loss": 0.3146, + "step": 22756 + }, + { + "epoch": 0.66, + "grad_norm": 2.4807747258100674, + "learning_rate": 2.7364530862825743e-06, + "loss": 0.2993, + "step": 22757 + }, + { + "epoch": 0.66, + "grad_norm": 1.3255903441479173, + "learning_rate": 2.7360342776268665e-06, + "loss": 0.2875, + "step": 22758 + }, + { + "epoch": 0.66, + "grad_norm": 1.2725104198686874, + "learning_rate": 2.7356154889506826e-06, + "loss": 0.2895, + "step": 22759 + }, + { + "epoch": 0.66, + "grad_norm": 0.9605359665789831, + "learning_rate": 2.735196720257715e-06, + "loss": 0.5836, + "step": 22760 + }, + { + "epoch": 0.66, + "grad_norm": 1.2976130541184032, + "learning_rate": 2.734777971551662e-06, + "loss": 0.3022, + "step": 22761 + }, + { + "epoch": 0.66, + "grad_norm": 1.3104260088144792, + "learning_rate": 2.7343592428362174e-06, + "loss": 0.2767, + "step": 22762 + }, + { + "epoch": 0.66, + "grad_norm": 1.3856568512908125, + "learning_rate": 2.7339405341150792e-06, + "loss": 0.278, + "step": 22763 + }, + { + "epoch": 0.66, + "grad_norm": 1.3415062001054119, + "learning_rate": 2.73352184539194e-06, + "loss": 0.2892, + "step": 22764 + }, + { + "epoch": 0.66, + "grad_norm": 1.2195681669774094, + "learning_rate": 2.733103176670496e-06, + "loss": 0.2617, + "step": 22765 + }, + { + "epoch": 0.66, + "grad_norm": 1.3495365812027387, + "learning_rate": 2.7326845279544414e-06, + "loss": 0.3014, + "step": 22766 + }, + { + "epoch": 0.66, + "grad_norm": 1.5241654126218553, + "learning_rate": 2.7322658992474706e-06, + "loss": 0.3139, + "step": 22767 + }, + { + "epoch": 0.66, + "grad_norm": 1.3372411411827407, + "learning_rate": 2.731847290553281e-06, + "loss": 0.2781, + "step": 22768 + }, + { + "epoch": 0.66, + "grad_norm": 1.3027102297456443, + "learning_rate": 2.7314287018755615e-06, + "loss": 0.3077, + "step": 22769 + }, + { + "epoch": 0.66, + "grad_norm": 1.366957836752573, + "learning_rate": 2.7310101332180093e-06, + "loss": 0.2904, + "step": 22770 + }, + { + "epoch": 0.66, + "grad_norm": 1.2990116755387242, + "learning_rate": 2.730591584584318e-06, + "loss": 0.2814, + "step": 22771 + }, + { + "epoch": 0.66, + "grad_norm": 1.1478819815160182, + "learning_rate": 2.7301730559781815e-06, + "loss": 0.2722, + "step": 22772 + }, + { + "epoch": 0.66, + "grad_norm": 1.249553992763908, + "learning_rate": 2.7297545474032945e-06, + "loss": 0.3065, + "step": 22773 + }, + { + "epoch": 0.66, + "grad_norm": 2.7612652206482253, + "learning_rate": 2.729336058863347e-06, + "loss": 0.2811, + "step": 22774 + }, + { + "epoch": 0.66, + "grad_norm": 1.310382638181152, + "learning_rate": 2.7289175903620336e-06, + "loss": 0.2943, + "step": 22775 + }, + { + "epoch": 0.66, + "grad_norm": 1.3606023640604732, + "learning_rate": 2.728499141903048e-06, + "loss": 0.3099, + "step": 22776 + }, + { + "epoch": 0.66, + "grad_norm": 1.2417697765556523, + "learning_rate": 2.7280807134900823e-06, + "loss": 0.3, + "step": 22777 + }, + { + "epoch": 0.66, + "grad_norm": 1.9084200460857992, + "learning_rate": 2.7276623051268314e-06, + "loss": 0.2851, + "step": 22778 + }, + { + "epoch": 0.66, + "grad_norm": 4.695452909041842, + "learning_rate": 2.7272439168169844e-06, + "loss": 0.3256, + "step": 22779 + }, + { + "epoch": 0.66, + "grad_norm": 1.971689889795221, + "learning_rate": 2.7268255485642347e-06, + "loss": 0.2821, + "step": 22780 + }, + { + "epoch": 0.66, + "grad_norm": 1.2998076244643564, + "learning_rate": 2.7264072003722753e-06, + "loss": 0.2878, + "step": 22781 + }, + { + "epoch": 0.66, + "grad_norm": 1.3790435499581544, + "learning_rate": 2.725988872244797e-06, + "loss": 0.2905, + "step": 22782 + }, + { + "epoch": 0.66, + "grad_norm": 1.2672058145415084, + "learning_rate": 2.7255705641854923e-06, + "loss": 0.2997, + "step": 22783 + }, + { + "epoch": 0.66, + "grad_norm": 1.332741877749623, + "learning_rate": 2.725152276198053e-06, + "loss": 0.2983, + "step": 22784 + }, + { + "epoch": 0.66, + "grad_norm": 1.247578493321835, + "learning_rate": 2.7247340082861697e-06, + "loss": 0.3006, + "step": 22785 + }, + { + "epoch": 0.66, + "grad_norm": 1.324471199497301, + "learning_rate": 2.7243157604535344e-06, + "loss": 0.2875, + "step": 22786 + }, + { + "epoch": 0.66, + "grad_norm": 1.2192897294565743, + "learning_rate": 2.7238975327038385e-06, + "loss": 0.282, + "step": 22787 + }, + { + "epoch": 0.66, + "grad_norm": 1.4691169779976403, + "learning_rate": 2.7234793250407708e-06, + "loss": 0.2844, + "step": 22788 + }, + { + "epoch": 0.66, + "grad_norm": 1.3702676782784349, + "learning_rate": 2.723061137468024e-06, + "loss": 0.2873, + "step": 22789 + }, + { + "epoch": 0.66, + "grad_norm": 1.2262580100741054, + "learning_rate": 2.7226429699892874e-06, + "loss": 0.2955, + "step": 22790 + }, + { + "epoch": 0.66, + "grad_norm": 1.4176220033750377, + "learning_rate": 2.7222248226082522e-06, + "loss": 0.3005, + "step": 22791 + }, + { + "epoch": 0.66, + "grad_norm": 0.9345299887708133, + "learning_rate": 2.7218066953286075e-06, + "loss": 0.5506, + "step": 22792 + }, + { + "epoch": 0.66, + "grad_norm": 1.3614109797232847, + "learning_rate": 2.721388588154045e-06, + "loss": 0.2782, + "step": 22793 + }, + { + "epoch": 0.66, + "grad_norm": 1.308608778825674, + "learning_rate": 2.7209705010882527e-06, + "loss": 0.2869, + "step": 22794 + }, + { + "epoch": 0.66, + "grad_norm": 1.204685895679119, + "learning_rate": 2.7205524341349233e-06, + "loss": 0.2684, + "step": 22795 + }, + { + "epoch": 0.66, + "grad_norm": 1.2166315400901713, + "learning_rate": 2.7201343872977413e-06, + "loss": 0.2719, + "step": 22796 + }, + { + "epoch": 0.66, + "grad_norm": 1.2545658004142917, + "learning_rate": 2.7197163605803993e-06, + "loss": 0.2921, + "step": 22797 + }, + { + "epoch": 0.66, + "grad_norm": 1.1818795748436943, + "learning_rate": 2.7192983539865857e-06, + "loss": 0.2665, + "step": 22798 + }, + { + "epoch": 0.66, + "grad_norm": 1.4105827544269278, + "learning_rate": 2.7188803675199893e-06, + "loss": 0.3106, + "step": 22799 + }, + { + "epoch": 0.66, + "grad_norm": 1.343872177216123, + "learning_rate": 2.7184624011842985e-06, + "loss": 0.2912, + "step": 22800 + }, + { + "epoch": 0.66, + "grad_norm": 1.6020257135715503, + "learning_rate": 2.718044454983205e-06, + "loss": 0.2993, + "step": 22801 + }, + { + "epoch": 0.66, + "grad_norm": 1.306251485141152, + "learning_rate": 2.717626528920393e-06, + "loss": 0.2906, + "step": 22802 + }, + { + "epoch": 0.66, + "grad_norm": 1.313635400293, + "learning_rate": 2.7172086229995507e-06, + "loss": 0.2828, + "step": 22803 + }, + { + "epoch": 0.66, + "grad_norm": 1.4971636648470354, + "learning_rate": 2.7167907372243686e-06, + "loss": 0.2701, + "step": 22804 + }, + { + "epoch": 0.66, + "grad_norm": 1.4615224640310225, + "learning_rate": 2.7163728715985354e-06, + "loss": 0.284, + "step": 22805 + }, + { + "epoch": 0.66, + "grad_norm": 1.2235053624404395, + "learning_rate": 2.715955026125735e-06, + "loss": 0.2636, + "step": 22806 + }, + { + "epoch": 0.66, + "grad_norm": 1.302057278424086, + "learning_rate": 2.715537200809657e-06, + "loss": 0.283, + "step": 22807 + }, + { + "epoch": 0.66, + "grad_norm": 1.6323063801615123, + "learning_rate": 2.715119395653988e-06, + "loss": 0.3028, + "step": 22808 + }, + { + "epoch": 0.66, + "grad_norm": 1.486252170251838, + "learning_rate": 2.714701610662416e-06, + "loss": 0.2851, + "step": 22809 + }, + { + "epoch": 0.66, + "grad_norm": 1.365090646344414, + "learning_rate": 2.714283845838628e-06, + "loss": 0.2634, + "step": 22810 + }, + { + "epoch": 0.66, + "grad_norm": 1.3287981979940173, + "learning_rate": 2.7138661011863097e-06, + "loss": 0.283, + "step": 22811 + }, + { + "epoch": 0.66, + "grad_norm": 1.319883386133728, + "learning_rate": 2.713448376709149e-06, + "loss": 0.2886, + "step": 22812 + }, + { + "epoch": 0.66, + "grad_norm": 1.2402721442630704, + "learning_rate": 2.7130306724108313e-06, + "loss": 0.2847, + "step": 22813 + }, + { + "epoch": 0.66, + "grad_norm": 1.330833764478463, + "learning_rate": 2.712612988295046e-06, + "loss": 0.2911, + "step": 22814 + }, + { + "epoch": 0.66, + "grad_norm": 1.3094789677389524, + "learning_rate": 2.7121953243654736e-06, + "loss": 0.2788, + "step": 22815 + }, + { + "epoch": 0.66, + "grad_norm": 1.828567075903296, + "learning_rate": 2.7117776806258036e-06, + "loss": 0.301, + "step": 22816 + }, + { + "epoch": 0.66, + "grad_norm": 1.3463523138948204, + "learning_rate": 2.71136005707972e-06, + "loss": 0.2779, + "step": 22817 + }, + { + "epoch": 0.66, + "grad_norm": 1.2274479676010077, + "learning_rate": 2.7109424537309097e-06, + "loss": 0.2659, + "step": 22818 + }, + { + "epoch": 0.66, + "grad_norm": 1.4994046339842821, + "learning_rate": 2.7105248705830576e-06, + "loss": 0.2704, + "step": 22819 + }, + { + "epoch": 0.66, + "grad_norm": 1.23218815488436, + "learning_rate": 2.710107307639849e-06, + "loss": 0.283, + "step": 22820 + }, + { + "epoch": 0.66, + "grad_norm": 2.686903536882981, + "learning_rate": 2.709689764904969e-06, + "loss": 0.2892, + "step": 22821 + }, + { + "epoch": 0.66, + "grad_norm": 1.3790901508481654, + "learning_rate": 2.709272242382103e-06, + "loss": 0.2684, + "step": 22822 + }, + { + "epoch": 0.66, + "grad_norm": 1.3661264525487755, + "learning_rate": 2.7088547400749355e-06, + "loss": 0.2974, + "step": 22823 + }, + { + "epoch": 0.66, + "grad_norm": 1.2800351421255034, + "learning_rate": 2.708437257987149e-06, + "loss": 0.3117, + "step": 22824 + }, + { + "epoch": 0.66, + "grad_norm": 1.302162615861825, + "learning_rate": 2.7080197961224284e-06, + "loss": 0.3038, + "step": 22825 + }, + { + "epoch": 0.66, + "grad_norm": 1.5728390563814616, + "learning_rate": 2.7076023544844592e-06, + "loss": 0.2965, + "step": 22826 + }, + { + "epoch": 0.66, + "grad_norm": 1.2267278935659607, + "learning_rate": 2.707184933076925e-06, + "loss": 0.2787, + "step": 22827 + }, + { + "epoch": 0.66, + "grad_norm": 1.238399122851216, + "learning_rate": 2.706767531903508e-06, + "loss": 0.2675, + "step": 22828 + }, + { + "epoch": 0.66, + "grad_norm": 1.4636245959335135, + "learning_rate": 2.706350150967896e-06, + "loss": 0.2849, + "step": 22829 + }, + { + "epoch": 0.66, + "grad_norm": 1.4603877582496196, + "learning_rate": 2.7059327902737673e-06, + "loss": 0.309, + "step": 22830 + }, + { + "epoch": 0.66, + "grad_norm": 1.3045630960511074, + "learning_rate": 2.705515449824807e-06, + "loss": 0.2659, + "step": 22831 + }, + { + "epoch": 0.66, + "grad_norm": 0.8643401823253143, + "learning_rate": 2.7050981296246983e-06, + "loss": 0.5636, + "step": 22832 + }, + { + "epoch": 0.66, + "grad_norm": 1.275382365701056, + "learning_rate": 2.7046808296771253e-06, + "loss": 0.28, + "step": 22833 + }, + { + "epoch": 0.66, + "grad_norm": 1.3005984883996022, + "learning_rate": 2.7042635499857684e-06, + "loss": 0.296, + "step": 22834 + }, + { + "epoch": 0.66, + "grad_norm": 1.6187271659519946, + "learning_rate": 2.7038462905543113e-06, + "loss": 0.2927, + "step": 22835 + }, + { + "epoch": 0.66, + "grad_norm": 1.3226951409015293, + "learning_rate": 2.7034290513864358e-06, + "loss": 0.2821, + "step": 22836 + }, + { + "epoch": 0.66, + "grad_norm": 1.6384070713923666, + "learning_rate": 2.703011832485824e-06, + "loss": 0.2914, + "step": 22837 + }, + { + "epoch": 0.66, + "grad_norm": 1.4498053144494922, + "learning_rate": 2.7025946338561585e-06, + "loss": 0.2998, + "step": 22838 + }, + { + "epoch": 0.66, + "grad_norm": 1.942606768611967, + "learning_rate": 2.7021774555011214e-06, + "loss": 0.2826, + "step": 22839 + }, + { + "epoch": 0.66, + "grad_norm": 1.2916428516116392, + "learning_rate": 2.7017602974243927e-06, + "loss": 0.2848, + "step": 22840 + }, + { + "epoch": 0.66, + "grad_norm": 1.2898368521364973, + "learning_rate": 2.701343159629658e-06, + "loss": 0.2883, + "step": 22841 + }, + { + "epoch": 0.66, + "grad_norm": 1.3401910406339095, + "learning_rate": 2.700926042120593e-06, + "loss": 0.2761, + "step": 22842 + }, + { + "epoch": 0.66, + "grad_norm": 1.4067587031726914, + "learning_rate": 2.700508944900881e-06, + "loss": 0.2923, + "step": 22843 + }, + { + "epoch": 0.66, + "grad_norm": 1.259090291786622, + "learning_rate": 2.7000918679742027e-06, + "loss": 0.2967, + "step": 22844 + }, + { + "epoch": 0.66, + "grad_norm": 1.6984995827356952, + "learning_rate": 2.6996748113442397e-06, + "loss": 0.274, + "step": 22845 + }, + { + "epoch": 0.66, + "grad_norm": 1.2241606013712716, + "learning_rate": 2.699257775014672e-06, + "loss": 0.2924, + "step": 22846 + }, + { + "epoch": 0.66, + "grad_norm": 1.330196899993359, + "learning_rate": 2.6988407589891796e-06, + "loss": 0.2968, + "step": 22847 + }, + { + "epoch": 0.66, + "grad_norm": 1.3632671772649112, + "learning_rate": 2.6984237632714438e-06, + "loss": 0.2976, + "step": 22848 + }, + { + "epoch": 0.66, + "grad_norm": 1.2767839346395622, + "learning_rate": 2.6980067878651434e-06, + "loss": 0.2851, + "step": 22849 + }, + { + "epoch": 0.66, + "grad_norm": 1.9984175706376066, + "learning_rate": 2.6975898327739605e-06, + "loss": 0.2905, + "step": 22850 + }, + { + "epoch": 0.66, + "grad_norm": 1.345029747939106, + "learning_rate": 2.697172898001571e-06, + "loss": 0.3009, + "step": 22851 + }, + { + "epoch": 0.66, + "grad_norm": 1.530233298530041, + "learning_rate": 2.696755983551656e-06, + "loss": 0.3068, + "step": 22852 + }, + { + "epoch": 0.66, + "grad_norm": 1.5593670150208716, + "learning_rate": 2.696339089427895e-06, + "loss": 0.2954, + "step": 22853 + }, + { + "epoch": 0.66, + "grad_norm": 1.5912758306323769, + "learning_rate": 2.695922215633967e-06, + "loss": 0.3046, + "step": 22854 + }, + { + "epoch": 0.66, + "grad_norm": 0.9282547362378684, + "learning_rate": 2.6955053621735516e-06, + "loss": 0.5727, + "step": 22855 + }, + { + "epoch": 0.66, + "grad_norm": 1.3107123893951738, + "learning_rate": 2.695088529050327e-06, + "loss": 0.3085, + "step": 22856 + }, + { + "epoch": 0.66, + "grad_norm": 1.362927351098929, + "learning_rate": 2.6946717162679725e-06, + "loss": 0.2774, + "step": 22857 + }, + { + "epoch": 0.66, + "grad_norm": 1.497900196015918, + "learning_rate": 2.6942549238301654e-06, + "loss": 0.2864, + "step": 22858 + }, + { + "epoch": 0.66, + "grad_norm": 1.4568033488649577, + "learning_rate": 2.6938381517405825e-06, + "loss": 0.3174, + "step": 22859 + }, + { + "epoch": 0.66, + "grad_norm": 1.2342049943902413, + "learning_rate": 2.693421400002907e-06, + "loss": 0.284, + "step": 22860 + }, + { + "epoch": 0.66, + "grad_norm": 1.2904830430613146, + "learning_rate": 2.69300466862081e-06, + "loss": 0.2967, + "step": 22861 + }, + { + "epoch": 0.66, + "grad_norm": 1.2811425879127807, + "learning_rate": 2.692587957597973e-06, + "loss": 0.2779, + "step": 22862 + }, + { + "epoch": 0.66, + "grad_norm": 1.429509774854815, + "learning_rate": 2.692171266938073e-06, + "loss": 0.2901, + "step": 22863 + }, + { + "epoch": 0.66, + "grad_norm": 1.341529154568373, + "learning_rate": 2.6917545966447867e-06, + "loss": 0.2631, + "step": 22864 + }, + { + "epoch": 0.66, + "grad_norm": 1.3004381215107916, + "learning_rate": 2.6913379467217917e-06, + "loss": 0.2779, + "step": 22865 + }, + { + "epoch": 0.66, + "grad_norm": 1.6660156092363754, + "learning_rate": 2.6909213171727656e-06, + "loss": 0.2774, + "step": 22866 + }, + { + "epoch": 0.66, + "grad_norm": 1.3357088332443587, + "learning_rate": 2.6905047080013837e-06, + "loss": 0.2919, + "step": 22867 + }, + { + "epoch": 0.66, + "grad_norm": 1.1873079953836285, + "learning_rate": 2.690088119211324e-06, + "loss": 0.2794, + "step": 22868 + }, + { + "epoch": 0.66, + "grad_norm": 1.5529666599592518, + "learning_rate": 2.6896715508062633e-06, + "loss": 0.2804, + "step": 22869 + }, + { + "epoch": 0.66, + "grad_norm": 1.3044911758391, + "learning_rate": 2.6892550027898754e-06, + "loss": 0.2833, + "step": 22870 + }, + { + "epoch": 0.66, + "grad_norm": 1.2193846320187982, + "learning_rate": 2.688838475165838e-06, + "loss": 0.2928, + "step": 22871 + }, + { + "epoch": 0.66, + "grad_norm": 1.3771124428316268, + "learning_rate": 2.6884219679378265e-06, + "loss": 0.2701, + "step": 22872 + }, + { + "epoch": 0.66, + "grad_norm": 0.9413993845559816, + "learning_rate": 2.688005481109517e-06, + "loss": 0.53, + "step": 22873 + }, + { + "epoch": 0.66, + "grad_norm": 1.3168408122988196, + "learning_rate": 2.687589014684584e-06, + "loss": 0.2876, + "step": 22874 + }, + { + "epoch": 0.66, + "grad_norm": 1.2796919167858802, + "learning_rate": 2.6871725686667046e-06, + "loss": 0.2954, + "step": 22875 + }, + { + "epoch": 0.66, + "grad_norm": 1.7117169904762595, + "learning_rate": 2.6867561430595524e-06, + "loss": 0.2989, + "step": 22876 + }, + { + "epoch": 0.66, + "grad_norm": 1.3196362221750568, + "learning_rate": 2.686339737866805e-06, + "loss": 0.2777, + "step": 22877 + }, + { + "epoch": 0.66, + "grad_norm": 1.4299709143405521, + "learning_rate": 2.685923353092133e-06, + "loss": 0.2889, + "step": 22878 + }, + { + "epoch": 0.66, + "grad_norm": 1.4065515361896808, + "learning_rate": 2.685506988739213e-06, + "loss": 0.31, + "step": 22879 + }, + { + "epoch": 0.66, + "grad_norm": 1.4944622740166649, + "learning_rate": 2.6850906448117196e-06, + "loss": 0.3316, + "step": 22880 + }, + { + "epoch": 0.66, + "grad_norm": 1.4255444953571001, + "learning_rate": 2.6846743213133275e-06, + "loss": 0.2762, + "step": 22881 + }, + { + "epoch": 0.66, + "grad_norm": 1.6947820954235657, + "learning_rate": 2.6842580182477095e-06, + "loss": 0.3311, + "step": 22882 + }, + { + "epoch": 0.66, + "grad_norm": 1.644580779908979, + "learning_rate": 2.6838417356185402e-06, + "loss": 0.279, + "step": 22883 + }, + { + "epoch": 0.66, + "grad_norm": 1.4043055647800318, + "learning_rate": 2.6834254734294935e-06, + "loss": 0.2982, + "step": 22884 + }, + { + "epoch": 0.66, + "grad_norm": 1.2541747821565674, + "learning_rate": 2.6830092316842448e-06, + "loss": 0.2847, + "step": 22885 + }, + { + "epoch": 0.66, + "grad_norm": 1.4810394200171368, + "learning_rate": 2.682593010386463e-06, + "loss": 0.2837, + "step": 22886 + }, + { + "epoch": 0.66, + "grad_norm": 1.304826130101673, + "learning_rate": 2.682176809539824e-06, + "loss": 0.2786, + "step": 22887 + }, + { + "epoch": 0.66, + "grad_norm": 1.2727199210376239, + "learning_rate": 2.6817606291480018e-06, + "loss": 0.3036, + "step": 22888 + }, + { + "epoch": 0.66, + "grad_norm": 1.3181939332110633, + "learning_rate": 2.681344469214666e-06, + "loss": 0.3087, + "step": 22889 + }, + { + "epoch": 0.66, + "grad_norm": 1.7621951825245457, + "learning_rate": 2.6809283297434914e-06, + "loss": 0.2848, + "step": 22890 + }, + { + "epoch": 0.66, + "grad_norm": 1.3379227370837168, + "learning_rate": 2.68051221073815e-06, + "loss": 0.2972, + "step": 22891 + }, + { + "epoch": 0.66, + "grad_norm": 1.31877434535045, + "learning_rate": 2.6800961122023132e-06, + "loss": 0.2884, + "step": 22892 + }, + { + "epoch": 0.66, + "grad_norm": 1.6463380314383704, + "learning_rate": 2.6796800341396547e-06, + "loss": 0.2784, + "step": 22893 + }, + { + "epoch": 0.66, + "grad_norm": 1.2735809983235304, + "learning_rate": 2.6792639765538453e-06, + "loss": 0.2828, + "step": 22894 + }, + { + "epoch": 0.66, + "grad_norm": 1.2727921357050578, + "learning_rate": 2.6788479394485573e-06, + "loss": 0.3014, + "step": 22895 + }, + { + "epoch": 0.66, + "grad_norm": 1.2853111469017453, + "learning_rate": 2.6784319228274635e-06, + "loss": 0.2845, + "step": 22896 + }, + { + "epoch": 0.66, + "grad_norm": 1.371783832443604, + "learning_rate": 2.6780159266942324e-06, + "loss": 0.2807, + "step": 22897 + }, + { + "epoch": 0.66, + "grad_norm": 1.5638174796882955, + "learning_rate": 2.6775999510525363e-06, + "loss": 0.2832, + "step": 22898 + }, + { + "epoch": 0.66, + "grad_norm": 0.999489988910025, + "learning_rate": 2.677183995906046e-06, + "loss": 0.5723, + "step": 22899 + }, + { + "epoch": 0.66, + "grad_norm": 1.375485413514508, + "learning_rate": 2.6767680612584336e-06, + "loss": 0.2854, + "step": 22900 + }, + { + "epoch": 0.66, + "grad_norm": 1.3760092324163467, + "learning_rate": 2.676352147113368e-06, + "loss": 0.2936, + "step": 22901 + }, + { + "epoch": 0.66, + "grad_norm": 1.3960233563268623, + "learning_rate": 2.6759362534745203e-06, + "loss": 0.2964, + "step": 22902 + }, + { + "epoch": 0.66, + "grad_norm": 1.4392941905919747, + "learning_rate": 2.675520380345562e-06, + "loss": 0.2978, + "step": 22903 + }, + { + "epoch": 0.66, + "grad_norm": 0.9034136315304062, + "learning_rate": 2.675104527730161e-06, + "loss": 0.5788, + "step": 22904 + }, + { + "epoch": 0.66, + "grad_norm": 1.3213713921720536, + "learning_rate": 2.6746886956319914e-06, + "loss": 0.2881, + "step": 22905 + }, + { + "epoch": 0.66, + "grad_norm": 4.003609674435958, + "learning_rate": 2.6742728840547173e-06, + "loss": 0.3014, + "step": 22906 + }, + { + "epoch": 0.66, + "grad_norm": 1.3941968754880647, + "learning_rate": 2.67385709300201e-06, + "loss": 0.3029, + "step": 22907 + }, + { + "epoch": 0.66, + "grad_norm": 1.1375913707491596, + "learning_rate": 2.6734413224775414e-06, + "loss": 0.2661, + "step": 22908 + }, + { + "epoch": 0.66, + "grad_norm": 1.3510219457286816, + "learning_rate": 2.6730255724849773e-06, + "loss": 0.2729, + "step": 22909 + }, + { + "epoch": 0.66, + "grad_norm": 1.3806006268622997, + "learning_rate": 2.672609843027989e-06, + "loss": 0.2959, + "step": 22910 + }, + { + "epoch": 0.66, + "grad_norm": 1.302587658670436, + "learning_rate": 2.6721941341102453e-06, + "loss": 0.2921, + "step": 22911 + }, + { + "epoch": 0.66, + "grad_norm": 1.2917017534647575, + "learning_rate": 2.6717784457354135e-06, + "loss": 0.2702, + "step": 22912 + }, + { + "epoch": 0.66, + "grad_norm": 1.2067201208347094, + "learning_rate": 2.671362777907165e-06, + "loss": 0.3026, + "step": 22913 + }, + { + "epoch": 0.66, + "grad_norm": 1.315493988868874, + "learning_rate": 2.6709471306291636e-06, + "loss": 0.28, + "step": 22914 + }, + { + "epoch": 0.66, + "grad_norm": 1.4650917743739629, + "learning_rate": 2.670531503905081e-06, + "loss": 0.3126, + "step": 22915 + }, + { + "epoch": 0.66, + "grad_norm": 1.3698442593772926, + "learning_rate": 2.6701158977385823e-06, + "loss": 0.3101, + "step": 22916 + }, + { + "epoch": 0.66, + "grad_norm": 1.392748653214094, + "learning_rate": 2.669700312133337e-06, + "loss": 0.287, + "step": 22917 + }, + { + "epoch": 0.66, + "grad_norm": 1.3535943589166277, + "learning_rate": 2.6692847470930117e-06, + "loss": 0.3159, + "step": 22918 + }, + { + "epoch": 0.66, + "grad_norm": 1.2312260469669998, + "learning_rate": 2.668869202621275e-06, + "loss": 0.2668, + "step": 22919 + }, + { + "epoch": 0.66, + "grad_norm": 1.5090742628962484, + "learning_rate": 2.668453678721793e-06, + "loss": 0.2958, + "step": 22920 + }, + { + "epoch": 0.66, + "grad_norm": 1.3943199410333873, + "learning_rate": 2.668038175398233e-06, + "loss": 0.2893, + "step": 22921 + }, + { + "epoch": 0.66, + "grad_norm": 1.2671323651316235, + "learning_rate": 2.6676226926542625e-06, + "loss": 0.2673, + "step": 22922 + }, + { + "epoch": 0.66, + "grad_norm": 1.3179141427500563, + "learning_rate": 2.6672072304935477e-06, + "loss": 0.2932, + "step": 22923 + }, + { + "epoch": 0.66, + "grad_norm": 1.3886976409248915, + "learning_rate": 2.6667917889197565e-06, + "loss": 0.281, + "step": 22924 + }, + { + "epoch": 0.66, + "grad_norm": 1.17802021171851, + "learning_rate": 2.6663763679365516e-06, + "loss": 0.2568, + "step": 22925 + }, + { + "epoch": 0.66, + "grad_norm": 1.400068330721981, + "learning_rate": 2.6659609675476017e-06, + "loss": 0.3203, + "step": 22926 + }, + { + "epoch": 0.66, + "grad_norm": 1.2692212897626853, + "learning_rate": 2.6655455877565715e-06, + "loss": 0.2949, + "step": 22927 + }, + { + "epoch": 0.67, + "grad_norm": 1.3612666723257372, + "learning_rate": 2.6651302285671276e-06, + "loss": 0.3053, + "step": 22928 + }, + { + "epoch": 0.67, + "grad_norm": 1.3755515127373108, + "learning_rate": 2.6647148899829358e-06, + "loss": 0.2903, + "step": 22929 + }, + { + "epoch": 0.67, + "grad_norm": 1.3466124689654437, + "learning_rate": 2.6642995720076603e-06, + "loss": 0.2692, + "step": 22930 + }, + { + "epoch": 0.67, + "grad_norm": 1.3455592790898543, + "learning_rate": 2.6638842746449672e-06, + "loss": 0.2772, + "step": 22931 + }, + { + "epoch": 0.67, + "grad_norm": 1.4130495489176929, + "learning_rate": 2.663468997898523e-06, + "loss": 0.2974, + "step": 22932 + }, + { + "epoch": 0.67, + "grad_norm": 1.2779392566008496, + "learning_rate": 2.6630537417719894e-06, + "loss": 0.2817, + "step": 22933 + }, + { + "epoch": 0.67, + "grad_norm": 1.3148172881552318, + "learning_rate": 2.662638506269032e-06, + "loss": 0.2887, + "step": 22934 + }, + { + "epoch": 0.67, + "grad_norm": 1.456595158015476, + "learning_rate": 2.6622232913933168e-06, + "loss": 0.2848, + "step": 22935 + }, + { + "epoch": 0.67, + "grad_norm": 1.3464106663752708, + "learning_rate": 2.6618080971485053e-06, + "loss": 0.3155, + "step": 22936 + }, + { + "epoch": 0.67, + "grad_norm": 1.5641267358222557, + "learning_rate": 2.661392923538264e-06, + "loss": 0.2843, + "step": 22937 + }, + { + "epoch": 0.67, + "grad_norm": 1.4550001721534243, + "learning_rate": 2.6609777705662564e-06, + "loss": 0.2756, + "step": 22938 + }, + { + "epoch": 0.67, + "grad_norm": 2.043088070889106, + "learning_rate": 2.660562638236146e-06, + "loss": 0.2628, + "step": 22939 + }, + { + "epoch": 0.67, + "grad_norm": 1.6035183989585853, + "learning_rate": 2.660147526551596e-06, + "loss": 0.311, + "step": 22940 + }, + { + "epoch": 0.67, + "grad_norm": 1.748506214279288, + "learning_rate": 2.659732435516272e-06, + "loss": 0.2842, + "step": 22941 + }, + { + "epoch": 0.67, + "grad_norm": 1.543253629604068, + "learning_rate": 2.659317365133833e-06, + "loss": 0.2806, + "step": 22942 + }, + { + "epoch": 0.67, + "grad_norm": 0.9601843306025003, + "learning_rate": 2.6589023154079464e-06, + "loss": 0.5685, + "step": 22943 + }, + { + "epoch": 0.67, + "grad_norm": 1.2072028065268132, + "learning_rate": 2.658487286342271e-06, + "loss": 0.2719, + "step": 22944 + }, + { + "epoch": 0.67, + "grad_norm": 1.2919849826430987, + "learning_rate": 2.658072277940471e-06, + "loss": 0.2926, + "step": 22945 + }, + { + "epoch": 0.67, + "grad_norm": 1.278639448848972, + "learning_rate": 2.657657290206209e-06, + "loss": 0.2796, + "step": 22946 + }, + { + "epoch": 0.67, + "grad_norm": 1.2826280119318056, + "learning_rate": 2.657242323143148e-06, + "loss": 0.3021, + "step": 22947 + }, + { + "epoch": 0.67, + "grad_norm": 1.3269447606311005, + "learning_rate": 2.6568273767549498e-06, + "loss": 0.2827, + "step": 22948 + }, + { + "epoch": 0.67, + "grad_norm": 1.242545214730698, + "learning_rate": 2.656412451045275e-06, + "loss": 0.2813, + "step": 22949 + }, + { + "epoch": 0.67, + "grad_norm": 1.3104958735556251, + "learning_rate": 2.6559975460177867e-06, + "loss": 0.2786, + "step": 22950 + }, + { + "epoch": 0.67, + "grad_norm": 1.2988989795904127, + "learning_rate": 2.655582661676148e-06, + "loss": 0.3014, + "step": 22951 + }, + { + "epoch": 0.67, + "grad_norm": 1.2447152338063345, + "learning_rate": 2.6551677980240166e-06, + "loss": 0.2892, + "step": 22952 + }, + { + "epoch": 0.67, + "grad_norm": 1.3683831541208917, + "learning_rate": 2.654752955065055e-06, + "loss": 0.2735, + "step": 22953 + }, + { + "epoch": 0.67, + "grad_norm": 1.272925873324458, + "learning_rate": 2.654338132802925e-06, + "loss": 0.2783, + "step": 22954 + }, + { + "epoch": 0.67, + "grad_norm": 1.3336961837164496, + "learning_rate": 2.6539233312412872e-06, + "loss": 0.2897, + "step": 22955 + }, + { + "epoch": 0.67, + "grad_norm": 1.3841318389361679, + "learning_rate": 2.6535085503838008e-06, + "loss": 0.2838, + "step": 22956 + }, + { + "epoch": 0.67, + "grad_norm": 1.3298658173782225, + "learning_rate": 2.6530937902341286e-06, + "loss": 0.2717, + "step": 22957 + }, + { + "epoch": 0.67, + "grad_norm": 1.304880441087582, + "learning_rate": 2.6526790507959294e-06, + "loss": 0.2915, + "step": 22958 + }, + { + "epoch": 0.67, + "grad_norm": 1.406217960962824, + "learning_rate": 2.6522643320728637e-06, + "loss": 0.3383, + "step": 22959 + }, + { + "epoch": 0.67, + "grad_norm": 1.4937154194296263, + "learning_rate": 2.6518496340685927e-06, + "loss": 0.2836, + "step": 22960 + }, + { + "epoch": 0.67, + "grad_norm": 1.428593889729719, + "learning_rate": 2.6514349567867736e-06, + "loss": 0.2788, + "step": 22961 + }, + { + "epoch": 0.67, + "grad_norm": 1.278146756408093, + "learning_rate": 2.651020300231067e-06, + "loss": 0.2943, + "step": 22962 + }, + { + "epoch": 0.67, + "grad_norm": 1.480206123708716, + "learning_rate": 2.6506056644051315e-06, + "loss": 0.2953, + "step": 22963 + }, + { + "epoch": 0.67, + "grad_norm": 1.4800134498009125, + "learning_rate": 2.650191049312628e-06, + "loss": 0.2774, + "step": 22964 + }, + { + "epoch": 0.67, + "grad_norm": 1.2822930516558604, + "learning_rate": 2.649776454957214e-06, + "loss": 0.2808, + "step": 22965 + }, + { + "epoch": 0.67, + "grad_norm": 1.5380093051999906, + "learning_rate": 2.649361881342549e-06, + "loss": 0.3352, + "step": 22966 + }, + { + "epoch": 0.67, + "grad_norm": 2.023279163802246, + "learning_rate": 2.6489473284722915e-06, + "loss": 0.3012, + "step": 22967 + }, + { + "epoch": 0.67, + "grad_norm": 1.2250684718533675, + "learning_rate": 2.6485327963501017e-06, + "loss": 0.272, + "step": 22968 + }, + { + "epoch": 0.67, + "grad_norm": 1.8866237187405974, + "learning_rate": 2.648118284979635e-06, + "loss": 0.2834, + "step": 22969 + }, + { + "epoch": 0.67, + "grad_norm": 1.341896283002882, + "learning_rate": 2.647703794364551e-06, + "loss": 0.2779, + "step": 22970 + }, + { + "epoch": 0.67, + "grad_norm": 1.3496357503961232, + "learning_rate": 2.6472893245085063e-06, + "loss": 0.3141, + "step": 22971 + }, + { + "epoch": 0.67, + "grad_norm": 1.4480887196415142, + "learning_rate": 2.6468748754151593e-06, + "loss": 0.2801, + "step": 22972 + }, + { + "epoch": 0.67, + "grad_norm": 1.1867494978985889, + "learning_rate": 2.646460447088167e-06, + "loss": 0.2672, + "step": 22973 + }, + { + "epoch": 0.67, + "grad_norm": 1.212891760847653, + "learning_rate": 2.6460460395311875e-06, + "loss": 0.2657, + "step": 22974 + }, + { + "epoch": 0.67, + "grad_norm": 1.3610358365598711, + "learning_rate": 2.645631652747878e-06, + "loss": 0.2912, + "step": 22975 + }, + { + "epoch": 0.67, + "grad_norm": 1.3448607439364655, + "learning_rate": 2.6452172867418957e-06, + "loss": 0.2798, + "step": 22976 + }, + { + "epoch": 0.67, + "grad_norm": 1.4713664016264771, + "learning_rate": 2.6448029415168964e-06, + "loss": 0.2812, + "step": 22977 + }, + { + "epoch": 0.67, + "grad_norm": 1.3345422641257598, + "learning_rate": 2.6443886170765376e-06, + "loss": 0.2901, + "step": 22978 + }, + { + "epoch": 0.67, + "grad_norm": 2.4535060228102337, + "learning_rate": 2.6439743134244765e-06, + "loss": 0.2737, + "step": 22979 + }, + { + "epoch": 0.67, + "grad_norm": 1.3685251327888508, + "learning_rate": 2.643560030564367e-06, + "loss": 0.291, + "step": 22980 + }, + { + "epoch": 0.67, + "grad_norm": 1.6200598447958887, + "learning_rate": 2.643145768499867e-06, + "loss": 0.2993, + "step": 22981 + }, + { + "epoch": 0.67, + "grad_norm": 1.328821691813882, + "learning_rate": 2.642731527234631e-06, + "loss": 0.2753, + "step": 22982 + }, + { + "epoch": 0.67, + "grad_norm": 1.389857568281531, + "learning_rate": 2.6423173067723153e-06, + "loss": 0.3149, + "step": 22983 + }, + { + "epoch": 0.67, + "grad_norm": 1.8298511421989836, + "learning_rate": 2.641903107116576e-06, + "loss": 0.2803, + "step": 22984 + }, + { + "epoch": 0.67, + "grad_norm": 1.3458811961876138, + "learning_rate": 2.6414889282710676e-06, + "loss": 0.313, + "step": 22985 + }, + { + "epoch": 0.67, + "grad_norm": 1.261273984761064, + "learning_rate": 2.641074770239446e-06, + "loss": 0.2784, + "step": 22986 + }, + { + "epoch": 0.67, + "grad_norm": 1.3759429761250714, + "learning_rate": 2.640660633025367e-06, + "loss": 0.2779, + "step": 22987 + }, + { + "epoch": 0.67, + "grad_norm": 1.3397545495074106, + "learning_rate": 2.640246516632482e-06, + "loss": 0.284, + "step": 22988 + }, + { + "epoch": 0.67, + "grad_norm": 1.2619255873401347, + "learning_rate": 2.6398324210644477e-06, + "loss": 0.2895, + "step": 22989 + }, + { + "epoch": 0.67, + "grad_norm": 1.522311181792006, + "learning_rate": 2.639418346324919e-06, + "loss": 0.2769, + "step": 22990 + }, + { + "epoch": 0.67, + "grad_norm": 1.455788198788173, + "learning_rate": 2.6390042924175495e-06, + "loss": 0.2835, + "step": 22991 + }, + { + "epoch": 0.67, + "grad_norm": 1.2656374538683774, + "learning_rate": 2.6385902593459934e-06, + "loss": 0.2711, + "step": 22992 + }, + { + "epoch": 0.67, + "grad_norm": 1.2004044702141745, + "learning_rate": 2.6381762471139042e-06, + "loss": 0.2597, + "step": 22993 + }, + { + "epoch": 0.67, + "grad_norm": 1.2286429343537983, + "learning_rate": 2.637762255724936e-06, + "loss": 0.2787, + "step": 22994 + }, + { + "epoch": 0.67, + "grad_norm": 1.3380598834974458, + "learning_rate": 2.637348285182742e-06, + "loss": 0.3047, + "step": 22995 + }, + { + "epoch": 0.67, + "grad_norm": 1.3660823946835075, + "learning_rate": 2.636934335490978e-06, + "loss": 0.274, + "step": 22996 + }, + { + "epoch": 0.67, + "grad_norm": 1.3144981236608975, + "learning_rate": 2.636520406653292e-06, + "loss": 0.3083, + "step": 22997 + }, + { + "epoch": 0.67, + "grad_norm": 1.2937093117719853, + "learning_rate": 2.636106498673342e-06, + "loss": 0.2628, + "step": 22998 + }, + { + "epoch": 0.67, + "grad_norm": 1.4557526360463315, + "learning_rate": 2.6356926115547765e-06, + "loss": 0.2822, + "step": 22999 + }, + { + "epoch": 0.67, + "grad_norm": 1.619379356053882, + "learning_rate": 2.6352787453012497e-06, + "loss": 0.2833, + "step": 23000 + }, + { + "epoch": 0.67, + "grad_norm": 1.475571798135753, + "learning_rate": 2.634864899916414e-06, + "loss": 0.2973, + "step": 23001 + }, + { + "epoch": 0.67, + "grad_norm": 1.407311101986332, + "learning_rate": 2.634451075403922e-06, + "loss": 0.2995, + "step": 23002 + }, + { + "epoch": 0.67, + "grad_norm": 1.7229464576388094, + "learning_rate": 2.6340372717674257e-06, + "loss": 0.3044, + "step": 23003 + }, + { + "epoch": 0.67, + "grad_norm": 1.5078136242230495, + "learning_rate": 2.633623489010576e-06, + "loss": 0.2974, + "step": 23004 + }, + { + "epoch": 0.67, + "grad_norm": 1.338263586784125, + "learning_rate": 2.633209727137026e-06, + "loss": 0.2809, + "step": 23005 + }, + { + "epoch": 0.67, + "grad_norm": 1.6558294306051786, + "learning_rate": 2.632795986150428e-06, + "loss": 0.2851, + "step": 23006 + }, + { + "epoch": 0.67, + "grad_norm": 1.5012629398826667, + "learning_rate": 2.6323822660544306e-06, + "loss": 0.278, + "step": 23007 + }, + { + "epoch": 0.67, + "grad_norm": 2.2288767654033026, + "learning_rate": 2.6319685668526853e-06, + "loss": 0.2684, + "step": 23008 + }, + { + "epoch": 0.67, + "grad_norm": 1.0748184699142262, + "learning_rate": 2.6315548885488437e-06, + "loss": 0.6063, + "step": 23009 + }, + { + "epoch": 0.67, + "grad_norm": 1.312688607527591, + "learning_rate": 2.631141231146556e-06, + "loss": 0.2597, + "step": 23010 + }, + { + "epoch": 0.67, + "grad_norm": 1.3699911238100877, + "learning_rate": 2.6307275946494735e-06, + "loss": 0.286, + "step": 23011 + }, + { + "epoch": 0.67, + "grad_norm": 1.296872429425353, + "learning_rate": 2.630313979061247e-06, + "loss": 0.2721, + "step": 23012 + }, + { + "epoch": 0.67, + "grad_norm": 1.9166623579397921, + "learning_rate": 2.6299003843855255e-06, + "loss": 0.2777, + "step": 23013 + }, + { + "epoch": 0.67, + "grad_norm": 1.4608700090256834, + "learning_rate": 2.6294868106259584e-06, + "loss": 0.2772, + "step": 23014 + }, + { + "epoch": 0.67, + "grad_norm": 1.523766654182185, + "learning_rate": 2.6290732577862e-06, + "loss": 0.3078, + "step": 23015 + }, + { + "epoch": 0.67, + "grad_norm": 1.3657229269234616, + "learning_rate": 2.6286597258698938e-06, + "loss": 0.2947, + "step": 23016 + }, + { + "epoch": 0.67, + "grad_norm": 1.4087385843962419, + "learning_rate": 2.628246214880692e-06, + "loss": 0.2811, + "step": 23017 + }, + { + "epoch": 0.67, + "grad_norm": 1.485734516509507, + "learning_rate": 2.627832724822243e-06, + "loss": 0.2947, + "step": 23018 + }, + { + "epoch": 0.67, + "grad_norm": 1.3012290768655739, + "learning_rate": 2.627419255698197e-06, + "loss": 0.2864, + "step": 23019 + }, + { + "epoch": 0.67, + "grad_norm": 1.2772809862862533, + "learning_rate": 2.6270058075122025e-06, + "loss": 0.3103, + "step": 23020 + }, + { + "epoch": 0.67, + "grad_norm": 1.3726110248891976, + "learning_rate": 2.626592380267908e-06, + "loss": 0.2829, + "step": 23021 + }, + { + "epoch": 0.67, + "grad_norm": 1.4990603639033877, + "learning_rate": 2.6261789739689614e-06, + "loss": 0.2986, + "step": 23022 + }, + { + "epoch": 0.67, + "grad_norm": 1.3890225606620272, + "learning_rate": 2.6257655886190147e-06, + "loss": 0.276, + "step": 23023 + }, + { + "epoch": 0.67, + "grad_norm": 1.1982996225809133, + "learning_rate": 2.6253522242217107e-06, + "loss": 0.2594, + "step": 23024 + }, + { + "epoch": 0.67, + "grad_norm": 1.254114610267306, + "learning_rate": 2.6249388807806997e-06, + "loss": 0.2923, + "step": 23025 + }, + { + "epoch": 0.67, + "grad_norm": 1.4391573575407177, + "learning_rate": 2.624525558299631e-06, + "loss": 0.2993, + "step": 23026 + }, + { + "epoch": 0.67, + "grad_norm": 1.2779689574198347, + "learning_rate": 2.6241122567821485e-06, + "loss": 0.282, + "step": 23027 + }, + { + "epoch": 0.67, + "grad_norm": 1.3228101233409701, + "learning_rate": 2.623698976231902e-06, + "loss": 0.2817, + "step": 23028 + }, + { + "epoch": 0.67, + "grad_norm": 1.3195706330562558, + "learning_rate": 2.6232857166525387e-06, + "loss": 0.28, + "step": 23029 + }, + { + "epoch": 0.67, + "grad_norm": 1.2913605073147272, + "learning_rate": 2.6228724780477056e-06, + "loss": 0.2982, + "step": 23030 + }, + { + "epoch": 0.67, + "grad_norm": 1.329870325245115, + "learning_rate": 2.6224592604210485e-06, + "loss": 0.276, + "step": 23031 + }, + { + "epoch": 0.67, + "grad_norm": 1.2647380650123856, + "learning_rate": 2.6220460637762146e-06, + "loss": 0.2835, + "step": 23032 + }, + { + "epoch": 0.67, + "grad_norm": 1.4991705424083022, + "learning_rate": 2.621632888116851e-06, + "loss": 0.3062, + "step": 23033 + }, + { + "epoch": 0.67, + "grad_norm": 1.50672163920674, + "learning_rate": 2.621219733446606e-06, + "loss": 0.2877, + "step": 23034 + }, + { + "epoch": 0.67, + "grad_norm": 1.4027044642138937, + "learning_rate": 2.6208065997691206e-06, + "loss": 0.2889, + "step": 23035 + }, + { + "epoch": 0.67, + "grad_norm": 1.4830196886479177, + "learning_rate": 2.6203934870880433e-06, + "loss": 0.2844, + "step": 23036 + }, + { + "epoch": 0.67, + "grad_norm": 1.3749067357405451, + "learning_rate": 2.61998039540702e-06, + "loss": 0.2839, + "step": 23037 + }, + { + "epoch": 0.67, + "grad_norm": 1.3807916783652914, + "learning_rate": 2.6195673247296964e-06, + "loss": 0.2723, + "step": 23038 + }, + { + "epoch": 0.67, + "grad_norm": 1.21071909529566, + "learning_rate": 2.6191542750597176e-06, + "loss": 0.2556, + "step": 23039 + }, + { + "epoch": 0.67, + "grad_norm": 0.9814638870422867, + "learning_rate": 2.618741246400729e-06, + "loss": 0.6069, + "step": 23040 + }, + { + "epoch": 0.67, + "grad_norm": 1.2313432811307927, + "learning_rate": 2.6183282387563747e-06, + "loss": 0.2733, + "step": 23041 + }, + { + "epoch": 0.67, + "grad_norm": 1.368055864994566, + "learning_rate": 2.617915252130302e-06, + "loss": 0.2882, + "step": 23042 + }, + { + "epoch": 0.67, + "grad_norm": 1.2603348583969385, + "learning_rate": 2.617502286526152e-06, + "loss": 0.2898, + "step": 23043 + }, + { + "epoch": 0.67, + "grad_norm": 1.2853340772723298, + "learning_rate": 2.6170893419475708e-06, + "loss": 0.2797, + "step": 23044 + }, + { + "epoch": 0.67, + "grad_norm": 1.2396517054697074, + "learning_rate": 2.6166764183982017e-06, + "loss": 0.2804, + "step": 23045 + }, + { + "epoch": 0.67, + "grad_norm": 1.6195615694895362, + "learning_rate": 2.6162635158816907e-06, + "loss": 0.2829, + "step": 23046 + }, + { + "epoch": 0.67, + "grad_norm": 1.324319039095869, + "learning_rate": 2.6158506344016805e-06, + "loss": 0.2873, + "step": 23047 + }, + { + "epoch": 0.67, + "grad_norm": 1.541219148209867, + "learning_rate": 2.615437773961814e-06, + "loss": 0.2933, + "step": 23048 + }, + { + "epoch": 0.67, + "grad_norm": 1.2387177222266643, + "learning_rate": 2.615024934565737e-06, + "loss": 0.2703, + "step": 23049 + }, + { + "epoch": 0.67, + "grad_norm": 1.2363455108711454, + "learning_rate": 2.6146121162170906e-06, + "loss": 0.2854, + "step": 23050 + }, + { + "epoch": 0.67, + "grad_norm": 1.3353965534619303, + "learning_rate": 2.614199318919521e-06, + "loss": 0.2914, + "step": 23051 + }, + { + "epoch": 0.67, + "grad_norm": 1.2984980388870138, + "learning_rate": 2.613786542676666e-06, + "loss": 0.2943, + "step": 23052 + }, + { + "epoch": 0.67, + "grad_norm": 1.3423294051457555, + "learning_rate": 2.613373787492173e-06, + "loss": 0.3281, + "step": 23053 + }, + { + "epoch": 0.67, + "grad_norm": 1.3265696545086585, + "learning_rate": 2.612961053369683e-06, + "loss": 0.3094, + "step": 23054 + }, + { + "epoch": 0.67, + "grad_norm": 1.2587697391698496, + "learning_rate": 2.612548340312837e-06, + "loss": 0.3139, + "step": 23055 + }, + { + "epoch": 0.67, + "grad_norm": 1.4080615656502915, + "learning_rate": 2.612135648325278e-06, + "loss": 0.2966, + "step": 23056 + }, + { + "epoch": 0.67, + "grad_norm": 1.3062502726875758, + "learning_rate": 2.611722977410649e-06, + "loss": 0.2588, + "step": 23057 + }, + { + "epoch": 0.67, + "grad_norm": 1.3527210310114697, + "learning_rate": 2.611310327572591e-06, + "loss": 0.2874, + "step": 23058 + }, + { + "epoch": 0.67, + "grad_norm": 0.9844468135676294, + "learning_rate": 2.610897698814746e-06, + "loss": 0.6451, + "step": 23059 + }, + { + "epoch": 0.67, + "grad_norm": 1.3306070350394157, + "learning_rate": 2.610485091140755e-06, + "loss": 0.2925, + "step": 23060 + }, + { + "epoch": 0.67, + "grad_norm": 1.3175902727903814, + "learning_rate": 2.610072504554262e-06, + "loss": 0.3087, + "step": 23061 + }, + { + "epoch": 0.67, + "grad_norm": 1.9326308221227033, + "learning_rate": 2.609659939058903e-06, + "loss": 0.2937, + "step": 23062 + }, + { + "epoch": 0.67, + "grad_norm": 1.2670353471907991, + "learning_rate": 2.6092473946583218e-06, + "loss": 0.2795, + "step": 23063 + }, + { + "epoch": 0.67, + "grad_norm": 1.6474817510636512, + "learning_rate": 2.608834871356159e-06, + "loss": 0.3097, + "step": 23064 + }, + { + "epoch": 0.67, + "grad_norm": 1.2459243426475304, + "learning_rate": 2.6084223691560553e-06, + "loss": 0.2587, + "step": 23065 + }, + { + "epoch": 0.67, + "grad_norm": 1.3319597621197836, + "learning_rate": 2.60800988806165e-06, + "loss": 0.2682, + "step": 23066 + }, + { + "epoch": 0.67, + "grad_norm": 1.2830363373488398, + "learning_rate": 2.6075974280765843e-06, + "loss": 0.2987, + "step": 23067 + }, + { + "epoch": 0.67, + "grad_norm": 1.2908811581942161, + "learning_rate": 2.6071849892044978e-06, + "loss": 0.2835, + "step": 23068 + }, + { + "epoch": 0.67, + "grad_norm": 1.2448871334803022, + "learning_rate": 2.6067725714490307e-06, + "loss": 0.2721, + "step": 23069 + }, + { + "epoch": 0.67, + "grad_norm": 1.3491970328283203, + "learning_rate": 2.6063601748138232e-06, + "loss": 0.3174, + "step": 23070 + }, + { + "epoch": 0.67, + "grad_norm": 1.6798942111455413, + "learning_rate": 2.605947799302513e-06, + "loss": 0.2848, + "step": 23071 + }, + { + "epoch": 0.67, + "grad_norm": 2.1773671039971876, + "learning_rate": 2.605535444918739e-06, + "loss": 0.313, + "step": 23072 + }, + { + "epoch": 0.67, + "grad_norm": 1.3276929949491862, + "learning_rate": 2.6051231116661418e-06, + "loss": 0.307, + "step": 23073 + }, + { + "epoch": 0.67, + "grad_norm": 1.27527056121217, + "learning_rate": 2.60471079954836e-06, + "loss": 0.2943, + "step": 23074 + }, + { + "epoch": 0.67, + "grad_norm": 1.4256279359162445, + "learning_rate": 2.6042985085690315e-06, + "loss": 0.2866, + "step": 23075 + }, + { + "epoch": 0.67, + "grad_norm": 1.4489728449999242, + "learning_rate": 2.6038862387317954e-06, + "loss": 0.2947, + "step": 23076 + }, + { + "epoch": 0.67, + "grad_norm": 1.7413645918861402, + "learning_rate": 2.6034739900402896e-06, + "loss": 0.2908, + "step": 23077 + }, + { + "epoch": 0.67, + "grad_norm": 1.460915756570384, + "learning_rate": 2.6030617624981546e-06, + "loss": 0.2931, + "step": 23078 + }, + { + "epoch": 0.67, + "grad_norm": 1.305512306948831, + "learning_rate": 2.6026495561090238e-06, + "loss": 0.3043, + "step": 23079 + }, + { + "epoch": 0.67, + "grad_norm": 1.2892102560616947, + "learning_rate": 2.6022373708765373e-06, + "loss": 0.2783, + "step": 23080 + }, + { + "epoch": 0.67, + "grad_norm": 2.854453166621817, + "learning_rate": 2.601825206804333e-06, + "loss": 0.2936, + "step": 23081 + }, + { + "epoch": 0.67, + "grad_norm": 1.5853115850635398, + "learning_rate": 2.60141306389605e-06, + "loss": 0.2849, + "step": 23082 + }, + { + "epoch": 0.67, + "grad_norm": 1.704015756270365, + "learning_rate": 2.6010009421553204e-06, + "loss": 0.3202, + "step": 23083 + }, + { + "epoch": 0.67, + "grad_norm": 1.521451115058582, + "learning_rate": 2.600588841585785e-06, + "loss": 0.2759, + "step": 23084 + }, + { + "epoch": 0.67, + "grad_norm": 1.299856636703668, + "learning_rate": 2.6001767621910795e-06, + "loss": 0.2816, + "step": 23085 + }, + { + "epoch": 0.67, + "grad_norm": 1.4021543738534141, + "learning_rate": 2.599764703974841e-06, + "loss": 0.2998, + "step": 23086 + }, + { + "epoch": 0.67, + "grad_norm": 1.3098203007491247, + "learning_rate": 2.5993526669407047e-06, + "loss": 0.3185, + "step": 23087 + }, + { + "epoch": 0.67, + "grad_norm": 1.5416141133063124, + "learning_rate": 2.5989406510923088e-06, + "loss": 0.2961, + "step": 23088 + }, + { + "epoch": 0.67, + "grad_norm": 1.4040637346704488, + "learning_rate": 2.5985286564332895e-06, + "loss": 0.2889, + "step": 23089 + }, + { + "epoch": 0.67, + "grad_norm": 1.2429467023604142, + "learning_rate": 2.5981166829672798e-06, + "loss": 0.2795, + "step": 23090 + }, + { + "epoch": 0.67, + "grad_norm": 1.2906864211336242, + "learning_rate": 2.5977047306979167e-06, + "loss": 0.2604, + "step": 23091 + }, + { + "epoch": 0.67, + "grad_norm": 1.2959523110868079, + "learning_rate": 2.5972927996288368e-06, + "loss": 0.2885, + "step": 23092 + }, + { + "epoch": 0.67, + "grad_norm": 1.4316803677739693, + "learning_rate": 2.596880889763673e-06, + "loss": 0.2838, + "step": 23093 + }, + { + "epoch": 0.67, + "grad_norm": 1.2312005579677905, + "learning_rate": 2.5964690011060635e-06, + "loss": 0.287, + "step": 23094 + }, + { + "epoch": 0.67, + "grad_norm": 1.2132253907652324, + "learning_rate": 2.596057133659641e-06, + "loss": 0.2832, + "step": 23095 + }, + { + "epoch": 0.67, + "grad_norm": 1.24425299562097, + "learning_rate": 2.595645287428041e-06, + "loss": 0.2833, + "step": 23096 + }, + { + "epoch": 0.67, + "grad_norm": 1.5677156670184775, + "learning_rate": 2.595233462414901e-06, + "loss": 0.2829, + "step": 23097 + }, + { + "epoch": 0.67, + "grad_norm": 1.2757274795985976, + "learning_rate": 2.5948216586238496e-06, + "loss": 0.282, + "step": 23098 + }, + { + "epoch": 0.67, + "grad_norm": 1.5815754834512132, + "learning_rate": 2.5944098760585235e-06, + "loss": 0.2689, + "step": 23099 + }, + { + "epoch": 0.67, + "grad_norm": 1.8370149919430077, + "learning_rate": 2.5939981147225574e-06, + "loss": 0.2861, + "step": 23100 + }, + { + "epoch": 0.67, + "grad_norm": 1.3445803962443217, + "learning_rate": 2.5935863746195842e-06, + "loss": 0.2873, + "step": 23101 + }, + { + "epoch": 0.67, + "grad_norm": 0.978447207199072, + "learning_rate": 2.5931746557532385e-06, + "loss": 0.5976, + "step": 23102 + }, + { + "epoch": 0.67, + "grad_norm": 1.562695201816624, + "learning_rate": 2.592762958127153e-06, + "loss": 0.2869, + "step": 23103 + }, + { + "epoch": 0.67, + "grad_norm": 1.3766711493430546, + "learning_rate": 2.59235128174496e-06, + "loss": 0.3063, + "step": 23104 + }, + { + "epoch": 0.67, + "grad_norm": 1.4698102552797723, + "learning_rate": 2.591939626610295e-06, + "loss": 0.282, + "step": 23105 + }, + { + "epoch": 0.67, + "grad_norm": 1.3177817338299864, + "learning_rate": 2.591527992726791e-06, + "loss": 0.3051, + "step": 23106 + }, + { + "epoch": 0.67, + "grad_norm": 1.2697817950222356, + "learning_rate": 2.5911163800980777e-06, + "loss": 0.2741, + "step": 23107 + }, + { + "epoch": 0.67, + "grad_norm": 0.9041587716758064, + "learning_rate": 2.590704788727788e-06, + "loss": 0.6048, + "step": 23108 + }, + { + "epoch": 0.67, + "grad_norm": 1.2842658659455601, + "learning_rate": 2.590293218619556e-06, + "loss": 0.2934, + "step": 23109 + }, + { + "epoch": 0.67, + "grad_norm": 1.478999125253259, + "learning_rate": 2.5898816697770154e-06, + "loss": 0.3301, + "step": 23110 + }, + { + "epoch": 0.67, + "grad_norm": 1.2356152708364736, + "learning_rate": 2.5894701422037926e-06, + "loss": 0.277, + "step": 23111 + }, + { + "epoch": 0.67, + "grad_norm": 1.2824473452709861, + "learning_rate": 2.589058635903524e-06, + "loss": 0.2864, + "step": 23112 + }, + { + "epoch": 0.67, + "grad_norm": 1.3677494752346904, + "learning_rate": 2.5886471508798388e-06, + "loss": 0.3136, + "step": 23113 + }, + { + "epoch": 0.67, + "grad_norm": 1.550555238999136, + "learning_rate": 2.588235687136369e-06, + "loss": 0.3005, + "step": 23114 + }, + { + "epoch": 0.67, + "grad_norm": 1.2391294610015542, + "learning_rate": 2.5878242446767466e-06, + "loss": 0.2843, + "step": 23115 + }, + { + "epoch": 0.67, + "grad_norm": 1.3685620273357764, + "learning_rate": 2.587412823504604e-06, + "loss": 0.3158, + "step": 23116 + }, + { + "epoch": 0.67, + "grad_norm": 1.2156578683525063, + "learning_rate": 2.587001423623568e-06, + "loss": 0.262, + "step": 23117 + }, + { + "epoch": 0.67, + "grad_norm": 1.586650275748371, + "learning_rate": 2.5865900450372704e-06, + "loss": 0.3118, + "step": 23118 + }, + { + "epoch": 0.67, + "grad_norm": 1.2900908564535354, + "learning_rate": 2.586178687749343e-06, + "loss": 0.2722, + "step": 23119 + }, + { + "epoch": 0.67, + "grad_norm": 1.3287864162131906, + "learning_rate": 2.585767351763416e-06, + "loss": 0.3089, + "step": 23120 + }, + { + "epoch": 0.67, + "grad_norm": 1.323516194380623, + "learning_rate": 2.585356037083119e-06, + "loss": 0.2905, + "step": 23121 + }, + { + "epoch": 0.67, + "grad_norm": 1.449092238255338, + "learning_rate": 2.5849447437120813e-06, + "loss": 0.2863, + "step": 23122 + }, + { + "epoch": 0.67, + "grad_norm": 1.4985877884652845, + "learning_rate": 2.5845334716539327e-06, + "loss": 0.2854, + "step": 23123 + }, + { + "epoch": 0.67, + "grad_norm": 1.4446306253101677, + "learning_rate": 2.584122220912303e-06, + "loss": 0.2911, + "step": 23124 + }, + { + "epoch": 0.67, + "grad_norm": 0.9693520346335672, + "learning_rate": 2.583710991490824e-06, + "loss": 0.5629, + "step": 23125 + }, + { + "epoch": 0.67, + "grad_norm": 1.2569621145716219, + "learning_rate": 2.5832997833931195e-06, + "loss": 0.2647, + "step": 23126 + }, + { + "epoch": 0.67, + "grad_norm": 1.6600553900723962, + "learning_rate": 2.5828885966228216e-06, + "loss": 0.2792, + "step": 23127 + }, + { + "epoch": 0.67, + "grad_norm": 1.1959036996363566, + "learning_rate": 2.5824774311835586e-06, + "loss": 0.2739, + "step": 23128 + }, + { + "epoch": 0.67, + "grad_norm": 1.2874893520248551, + "learning_rate": 2.5820662870789596e-06, + "loss": 0.2857, + "step": 23129 + }, + { + "epoch": 0.67, + "grad_norm": 1.7438490725950597, + "learning_rate": 2.581655164312652e-06, + "loss": 0.2735, + "step": 23130 + }, + { + "epoch": 0.67, + "grad_norm": 1.5719197355307473, + "learning_rate": 2.581244062888265e-06, + "loss": 0.2848, + "step": 23131 + }, + { + "epoch": 0.67, + "grad_norm": 1.587285579359058, + "learning_rate": 2.5808329828094255e-06, + "loss": 0.2963, + "step": 23132 + }, + { + "epoch": 0.67, + "grad_norm": 1.5794347878801, + "learning_rate": 2.5804219240797636e-06, + "loss": 0.2962, + "step": 23133 + }, + { + "epoch": 0.67, + "grad_norm": 0.9689365131560205, + "learning_rate": 2.5800108867029033e-06, + "loss": 0.5784, + "step": 23134 + }, + { + "epoch": 0.67, + "grad_norm": 1.3123171189324465, + "learning_rate": 2.579599870682474e-06, + "loss": 0.2902, + "step": 23135 + }, + { + "epoch": 0.67, + "grad_norm": 1.3390493016937692, + "learning_rate": 2.5791888760221017e-06, + "loss": 0.2703, + "step": 23136 + }, + { + "epoch": 0.67, + "grad_norm": 1.3905113631404034, + "learning_rate": 2.578777902725416e-06, + "loss": 0.2988, + "step": 23137 + }, + { + "epoch": 0.67, + "grad_norm": 1.2964974068061859, + "learning_rate": 2.5783669507960428e-06, + "loss": 0.2781, + "step": 23138 + }, + { + "epoch": 0.67, + "grad_norm": 1.33983242673176, + "learning_rate": 2.5779560202376066e-06, + "loss": 0.3165, + "step": 23139 + }, + { + "epoch": 0.67, + "grad_norm": 1.2947527633765112, + "learning_rate": 2.577545111053735e-06, + "loss": 0.3014, + "step": 23140 + }, + { + "epoch": 0.67, + "grad_norm": 1.706718720063326, + "learning_rate": 2.5771342232480557e-06, + "loss": 0.2874, + "step": 23141 + }, + { + "epoch": 0.67, + "grad_norm": 1.2511215791005759, + "learning_rate": 2.576723356824193e-06, + "loss": 0.3045, + "step": 23142 + }, + { + "epoch": 0.67, + "grad_norm": 1.2738123354921287, + "learning_rate": 2.5763125117857755e-06, + "loss": 0.2946, + "step": 23143 + }, + { + "epoch": 0.67, + "grad_norm": 1.3664098533074174, + "learning_rate": 2.575901688136425e-06, + "loss": 0.3117, + "step": 23144 + }, + { + "epoch": 0.67, + "grad_norm": 1.2471034093686308, + "learning_rate": 2.5754908858797695e-06, + "loss": 0.2668, + "step": 23145 + }, + { + "epoch": 0.67, + "grad_norm": 1.2994555999439632, + "learning_rate": 2.575080105019433e-06, + "loss": 0.269, + "step": 23146 + }, + { + "epoch": 0.67, + "grad_norm": 1.6368399711916624, + "learning_rate": 2.574669345559042e-06, + "loss": 0.2704, + "step": 23147 + }, + { + "epoch": 0.67, + "grad_norm": 1.3168931311865673, + "learning_rate": 2.5742586075022215e-06, + "loss": 0.2893, + "step": 23148 + }, + { + "epoch": 0.67, + "grad_norm": 1.3486646256887762, + "learning_rate": 2.573847890852595e-06, + "loss": 0.3248, + "step": 23149 + }, + { + "epoch": 0.67, + "grad_norm": 1.2700864315725082, + "learning_rate": 2.5734371956137877e-06, + "loss": 0.289, + "step": 23150 + }, + { + "epoch": 0.67, + "grad_norm": 1.6658227804450756, + "learning_rate": 2.5730265217894247e-06, + "loss": 0.3008, + "step": 23151 + }, + { + "epoch": 0.67, + "grad_norm": 1.327123342965784, + "learning_rate": 2.5726158693831304e-06, + "loss": 0.2744, + "step": 23152 + }, + { + "epoch": 0.67, + "grad_norm": 2.2269247514518904, + "learning_rate": 2.5722052383985267e-06, + "loss": 0.2766, + "step": 23153 + }, + { + "epoch": 0.67, + "grad_norm": 1.37041531076676, + "learning_rate": 2.571794628839239e-06, + "loss": 0.2621, + "step": 23154 + }, + { + "epoch": 0.67, + "grad_norm": 1.3844529812034923, + "learning_rate": 2.57138404070889e-06, + "loss": 0.2762, + "step": 23155 + }, + { + "epoch": 0.67, + "grad_norm": 1.4372328113630883, + "learning_rate": 2.5709734740111043e-06, + "loss": 0.2803, + "step": 23156 + }, + { + "epoch": 0.67, + "grad_norm": 1.3032369676244762, + "learning_rate": 2.5705629287495045e-06, + "loss": 0.2787, + "step": 23157 + }, + { + "epoch": 0.67, + "grad_norm": 1.389437949185541, + "learning_rate": 2.570152404927714e-06, + "loss": 0.2984, + "step": 23158 + }, + { + "epoch": 0.67, + "grad_norm": 1.4550666871709241, + "learning_rate": 2.5697419025493554e-06, + "loss": 0.3119, + "step": 23159 + }, + { + "epoch": 0.67, + "grad_norm": 1.405517812750174, + "learning_rate": 2.5693314216180508e-06, + "loss": 0.2755, + "step": 23160 + }, + { + "epoch": 0.67, + "grad_norm": 1.3181069238719052, + "learning_rate": 2.5689209621374257e-06, + "loss": 0.2795, + "step": 23161 + }, + { + "epoch": 0.67, + "grad_norm": 1.643383372583801, + "learning_rate": 2.5685105241110987e-06, + "loss": 0.3022, + "step": 23162 + }, + { + "epoch": 0.67, + "grad_norm": 1.5222629785311244, + "learning_rate": 2.5681001075426924e-06, + "loss": 0.3004, + "step": 23163 + }, + { + "epoch": 0.67, + "grad_norm": 1.4718811805159635, + "learning_rate": 2.56768971243583e-06, + "loss": 0.2796, + "step": 23164 + }, + { + "epoch": 0.67, + "grad_norm": 1.247540138192603, + "learning_rate": 2.567279338794133e-06, + "loss": 0.3073, + "step": 23165 + }, + { + "epoch": 0.67, + "grad_norm": 1.483119581046257, + "learning_rate": 2.566868986621225e-06, + "loss": 0.2797, + "step": 23166 + }, + { + "epoch": 0.67, + "grad_norm": 1.4641414905614565, + "learning_rate": 2.5664586559207227e-06, + "loss": 0.2781, + "step": 23167 + }, + { + "epoch": 0.67, + "grad_norm": 1.2747282868500152, + "learning_rate": 2.5660483466962504e-06, + "loss": 0.2809, + "step": 23168 + }, + { + "epoch": 0.67, + "grad_norm": 2.801769481725523, + "learning_rate": 2.565638058951428e-06, + "loss": 0.2857, + "step": 23169 + }, + { + "epoch": 0.67, + "grad_norm": 1.3080308025827032, + "learning_rate": 2.5652277926898766e-06, + "loss": 0.2922, + "step": 23170 + }, + { + "epoch": 0.67, + "grad_norm": 1.2703318687902228, + "learning_rate": 2.5648175479152193e-06, + "loss": 0.2907, + "step": 23171 + }, + { + "epoch": 0.67, + "grad_norm": 1.5277494932409241, + "learning_rate": 2.564407324631072e-06, + "loss": 0.3159, + "step": 23172 + }, + { + "epoch": 0.67, + "grad_norm": 1.3173747538240235, + "learning_rate": 2.5639971228410566e-06, + "loss": 0.2994, + "step": 23173 + }, + { + "epoch": 0.67, + "grad_norm": 1.5857384712763256, + "learning_rate": 2.563586942548794e-06, + "loss": 0.3102, + "step": 23174 + }, + { + "epoch": 0.67, + "grad_norm": 1.2575318554929886, + "learning_rate": 2.563176783757904e-06, + "loss": 0.2997, + "step": 23175 + }, + { + "epoch": 0.67, + "grad_norm": 1.3437930218039873, + "learning_rate": 2.562766646472006e-06, + "loss": 0.2935, + "step": 23176 + }, + { + "epoch": 0.67, + "grad_norm": 1.6994644840326925, + "learning_rate": 2.5623565306947185e-06, + "loss": 0.2818, + "step": 23177 + }, + { + "epoch": 0.67, + "grad_norm": 1.2298877112336317, + "learning_rate": 2.561946436429662e-06, + "loss": 0.286, + "step": 23178 + }, + { + "epoch": 0.67, + "grad_norm": 1.358782871571171, + "learning_rate": 2.561536363680458e-06, + "loss": 0.3229, + "step": 23179 + }, + { + "epoch": 0.67, + "grad_norm": 1.3011443786616494, + "learning_rate": 2.5611263124507203e-06, + "loss": 0.2644, + "step": 23180 + }, + { + "epoch": 0.67, + "grad_norm": 1.39171859552446, + "learning_rate": 2.560716282744069e-06, + "loss": 0.2909, + "step": 23181 + }, + { + "epoch": 0.67, + "grad_norm": 1.2455846219623474, + "learning_rate": 2.5603062745641248e-06, + "loss": 0.2632, + "step": 23182 + }, + { + "epoch": 0.67, + "grad_norm": 1.3487309536654442, + "learning_rate": 2.559896287914504e-06, + "loss": 0.2788, + "step": 23183 + }, + { + "epoch": 0.67, + "grad_norm": 1.313355884172137, + "learning_rate": 2.5594863227988257e-06, + "loss": 0.2829, + "step": 23184 + }, + { + "epoch": 0.67, + "grad_norm": 1.6493990418688336, + "learning_rate": 2.559076379220708e-06, + "loss": 0.3237, + "step": 23185 + }, + { + "epoch": 0.67, + "grad_norm": 1.2449657155441005, + "learning_rate": 2.5586664571837684e-06, + "loss": 0.2879, + "step": 23186 + }, + { + "epoch": 0.67, + "grad_norm": 1.8539712169204723, + "learning_rate": 2.558256556691624e-06, + "loss": 0.3019, + "step": 23187 + }, + { + "epoch": 0.67, + "grad_norm": 1.4007638129200426, + "learning_rate": 2.5578466777478942e-06, + "loss": 0.2913, + "step": 23188 + }, + { + "epoch": 0.67, + "grad_norm": 1.257331603216592, + "learning_rate": 2.5574368203561935e-06, + "loss": 0.2747, + "step": 23189 + }, + { + "epoch": 0.67, + "grad_norm": 1.288731572938496, + "learning_rate": 2.557026984520139e-06, + "loss": 0.2827, + "step": 23190 + }, + { + "epoch": 0.67, + "grad_norm": 1.2939574775350013, + "learning_rate": 2.5566171702433496e-06, + "loss": 0.2854, + "step": 23191 + }, + { + "epoch": 0.67, + "grad_norm": 0.959507114185702, + "learning_rate": 2.55620737752944e-06, + "loss": 0.5897, + "step": 23192 + }, + { + "epoch": 0.67, + "grad_norm": 2.0180368993283873, + "learning_rate": 2.5557976063820277e-06, + "loss": 0.284, + "step": 23193 + }, + { + "epoch": 0.67, + "grad_norm": 1.5381263224497204, + "learning_rate": 2.55538785680473e-06, + "loss": 0.2873, + "step": 23194 + }, + { + "epoch": 0.67, + "grad_norm": 1.3687705403327954, + "learning_rate": 2.5549781288011597e-06, + "loss": 0.3007, + "step": 23195 + }, + { + "epoch": 0.67, + "grad_norm": 1.4066487525658893, + "learning_rate": 2.554568422374936e-06, + "loss": 0.2782, + "step": 23196 + }, + { + "epoch": 0.67, + "grad_norm": 1.6528311507482059, + "learning_rate": 2.554158737529672e-06, + "loss": 0.3295, + "step": 23197 + }, + { + "epoch": 0.67, + "grad_norm": 1.533444740302895, + "learning_rate": 2.5537490742689856e-06, + "loss": 0.2885, + "step": 23198 + }, + { + "epoch": 0.67, + "grad_norm": 1.5011754978318188, + "learning_rate": 2.5533394325964896e-06, + "loss": 0.2994, + "step": 23199 + }, + { + "epoch": 0.67, + "grad_norm": 0.9852840715783624, + "learning_rate": 2.5529298125157997e-06, + "loss": 0.5695, + "step": 23200 + }, + { + "epoch": 0.67, + "grad_norm": 1.2966062115632437, + "learning_rate": 2.552520214030532e-06, + "loss": 0.2889, + "step": 23201 + }, + { + "epoch": 0.67, + "grad_norm": 1.2211441413134194, + "learning_rate": 2.5521106371443006e-06, + "loss": 0.2867, + "step": 23202 + }, + { + "epoch": 0.67, + "grad_norm": 1.30856972429896, + "learning_rate": 2.551701081860719e-06, + "loss": 0.3464, + "step": 23203 + }, + { + "epoch": 0.67, + "grad_norm": 1.4023629651225455, + "learning_rate": 2.5512915481834033e-06, + "loss": 0.3135, + "step": 23204 + }, + { + "epoch": 0.67, + "grad_norm": 0.922045535270646, + "learning_rate": 2.550882036115967e-06, + "loss": 0.5661, + "step": 23205 + }, + { + "epoch": 0.67, + "grad_norm": 1.4163282220824553, + "learning_rate": 2.550472545662024e-06, + "loss": 0.2753, + "step": 23206 + }, + { + "epoch": 0.67, + "grad_norm": 1.5452284442820363, + "learning_rate": 2.5500630768251895e-06, + "loss": 0.2801, + "step": 23207 + }, + { + "epoch": 0.67, + "grad_norm": 1.4367235433355847, + "learning_rate": 2.5496536296090736e-06, + "loss": 0.2673, + "step": 23208 + }, + { + "epoch": 0.67, + "grad_norm": 1.3048910559084135, + "learning_rate": 2.5492442040172917e-06, + "loss": 0.2648, + "step": 23209 + }, + { + "epoch": 0.67, + "grad_norm": 1.491011994174523, + "learning_rate": 2.5488348000534568e-06, + "loss": 0.3134, + "step": 23210 + }, + { + "epoch": 0.67, + "grad_norm": 2.2004917143054232, + "learning_rate": 2.548425417721182e-06, + "loss": 0.2942, + "step": 23211 + }, + { + "epoch": 0.67, + "grad_norm": 1.336677019637653, + "learning_rate": 2.5480160570240796e-06, + "loss": 0.3262, + "step": 23212 + }, + { + "epoch": 0.67, + "grad_norm": 1.2078612793094372, + "learning_rate": 2.5476067179657638e-06, + "loss": 0.3146, + "step": 23213 + }, + { + "epoch": 0.67, + "grad_norm": 1.517367938307301, + "learning_rate": 2.547197400549845e-06, + "loss": 0.2897, + "step": 23214 + }, + { + "epoch": 0.67, + "grad_norm": 1.2815222596980154, + "learning_rate": 2.5467881047799386e-06, + "loss": 0.2917, + "step": 23215 + }, + { + "epoch": 0.67, + "grad_norm": 1.283362669056484, + "learning_rate": 2.5463788306596527e-06, + "loss": 0.2791, + "step": 23216 + }, + { + "epoch": 0.67, + "grad_norm": 1.3512794458772643, + "learning_rate": 2.5459695781926003e-06, + "loss": 0.2881, + "step": 23217 + }, + { + "epoch": 0.67, + "grad_norm": 1.4674949568488216, + "learning_rate": 2.5455603473823944e-06, + "loss": 0.287, + "step": 23218 + }, + { + "epoch": 0.67, + "grad_norm": 1.9901389854846174, + "learning_rate": 2.545151138232644e-06, + "loss": 0.2888, + "step": 23219 + }, + { + "epoch": 0.67, + "grad_norm": 1.43499849731327, + "learning_rate": 2.544741950746964e-06, + "loss": 0.2895, + "step": 23220 + }, + { + "epoch": 0.67, + "grad_norm": 1.2818734891611896, + "learning_rate": 2.5443327849289625e-06, + "loss": 0.301, + "step": 23221 + }, + { + "epoch": 0.67, + "grad_norm": 1.4402219774813292, + "learning_rate": 2.5439236407822537e-06, + "loss": 0.3591, + "step": 23222 + }, + { + "epoch": 0.67, + "grad_norm": 1.347617775687571, + "learning_rate": 2.5435145183104437e-06, + "loss": 0.3184, + "step": 23223 + }, + { + "epoch": 0.67, + "grad_norm": 1.175819202459832, + "learning_rate": 2.5431054175171453e-06, + "loss": 0.2724, + "step": 23224 + }, + { + "epoch": 0.67, + "grad_norm": 1.2603238559699155, + "learning_rate": 2.542696338405969e-06, + "loss": 0.284, + "step": 23225 + }, + { + "epoch": 0.67, + "grad_norm": 1.1927361288612943, + "learning_rate": 2.5422872809805277e-06, + "loss": 0.254, + "step": 23226 + }, + { + "epoch": 0.67, + "grad_norm": 1.332007776289683, + "learning_rate": 2.5418782452444253e-06, + "loss": 0.2827, + "step": 23227 + }, + { + "epoch": 0.67, + "grad_norm": 1.4069856563170053, + "learning_rate": 2.5414692312012745e-06, + "loss": 0.2837, + "step": 23228 + }, + { + "epoch": 0.67, + "grad_norm": 1.0093758254731047, + "learning_rate": 2.5410602388546856e-06, + "loss": 0.5995, + "step": 23229 + }, + { + "epoch": 0.67, + "grad_norm": 1.2748531496474715, + "learning_rate": 2.5406512682082675e-06, + "loss": 0.287, + "step": 23230 + }, + { + "epoch": 0.67, + "grad_norm": 1.3064141871644772, + "learning_rate": 2.540242319265629e-06, + "loss": 0.2977, + "step": 23231 + }, + { + "epoch": 0.67, + "grad_norm": 2.113799477460903, + "learning_rate": 2.539833392030379e-06, + "loss": 0.2972, + "step": 23232 + }, + { + "epoch": 0.67, + "grad_norm": 1.2191033964650533, + "learning_rate": 2.5394244865061266e-06, + "loss": 0.2735, + "step": 23233 + }, + { + "epoch": 0.67, + "grad_norm": 1.2913188766234234, + "learning_rate": 2.539015602696483e-06, + "loss": 0.2815, + "step": 23234 + }, + { + "epoch": 0.67, + "grad_norm": 1.225487434228141, + "learning_rate": 2.5386067406050518e-06, + "loss": 0.2614, + "step": 23235 + }, + { + "epoch": 0.67, + "grad_norm": 1.5024249730336179, + "learning_rate": 2.5381979002354435e-06, + "loss": 0.2846, + "step": 23236 + }, + { + "epoch": 0.67, + "grad_norm": 1.3541923983996873, + "learning_rate": 2.537789081591265e-06, + "loss": 0.2921, + "step": 23237 + }, + { + "epoch": 0.67, + "grad_norm": 1.3984354319210948, + "learning_rate": 2.5373802846761264e-06, + "loss": 0.2857, + "step": 23238 + }, + { + "epoch": 0.67, + "grad_norm": 1.3348320313343829, + "learning_rate": 2.536971509493634e-06, + "loss": 0.3127, + "step": 23239 + }, + { + "epoch": 0.67, + "grad_norm": 1.3308619966965534, + "learning_rate": 2.536562756047395e-06, + "loss": 0.278, + "step": 23240 + }, + { + "epoch": 0.67, + "grad_norm": 1.408750642708058, + "learning_rate": 2.5361540243410176e-06, + "loss": 0.2953, + "step": 23241 + }, + { + "epoch": 0.67, + "grad_norm": 1.2818134920000126, + "learning_rate": 2.5357453143781074e-06, + "loss": 0.296, + "step": 23242 + }, + { + "epoch": 0.67, + "grad_norm": 0.9229727858259873, + "learning_rate": 2.5353366261622748e-06, + "loss": 0.5354, + "step": 23243 + }, + { + "epoch": 0.67, + "grad_norm": 1.354187579622288, + "learning_rate": 2.534927959697121e-06, + "loss": 0.2781, + "step": 23244 + }, + { + "epoch": 0.67, + "grad_norm": 1.2237685695151639, + "learning_rate": 2.5345193149862555e-06, + "loss": 0.262, + "step": 23245 + }, + { + "epoch": 0.67, + "grad_norm": 1.2285753143101243, + "learning_rate": 2.5341106920332853e-06, + "loss": 0.2802, + "step": 23246 + }, + { + "epoch": 0.67, + "grad_norm": 1.4168593105658167, + "learning_rate": 2.5337020908418154e-06, + "loss": 0.2955, + "step": 23247 + }, + { + "epoch": 0.67, + "grad_norm": 1.347353198187004, + "learning_rate": 2.5332935114154513e-06, + "loss": 0.2775, + "step": 23248 + }, + { + "epoch": 0.67, + "grad_norm": 1.2708864031494809, + "learning_rate": 2.532884953757799e-06, + "loss": 0.2849, + "step": 23249 + }, + { + "epoch": 0.67, + "grad_norm": 1.301432901088093, + "learning_rate": 2.5324764178724653e-06, + "loss": 0.2907, + "step": 23250 + }, + { + "epoch": 0.67, + "grad_norm": 2.067648182793983, + "learning_rate": 2.532067903763056e-06, + "loss": 0.2566, + "step": 23251 + }, + { + "epoch": 0.67, + "grad_norm": 1.7903901305506846, + "learning_rate": 2.5316594114331727e-06, + "loss": 0.2789, + "step": 23252 + }, + { + "epoch": 0.67, + "grad_norm": 4.021348622888386, + "learning_rate": 2.5312509408864248e-06, + "loss": 0.2932, + "step": 23253 + }, + { + "epoch": 0.67, + "grad_norm": 1.3186958814416698, + "learning_rate": 2.530842492126413e-06, + "loss": 0.2733, + "step": 23254 + }, + { + "epoch": 0.67, + "grad_norm": 1.3335423811870286, + "learning_rate": 2.530434065156743e-06, + "loss": 0.2914, + "step": 23255 + }, + { + "epoch": 0.67, + "grad_norm": 1.3722849881340982, + "learning_rate": 2.5300256599810204e-06, + "loss": 0.2776, + "step": 23256 + }, + { + "epoch": 0.67, + "grad_norm": 1.5121292468651426, + "learning_rate": 2.529617276602848e-06, + "loss": 0.2731, + "step": 23257 + }, + { + "epoch": 0.67, + "grad_norm": 1.3841598300009503, + "learning_rate": 2.5292089150258312e-06, + "loss": 0.3074, + "step": 23258 + }, + { + "epoch": 0.67, + "grad_norm": 1.1368638113681337, + "learning_rate": 2.528800575253573e-06, + "loss": 0.2545, + "step": 23259 + }, + { + "epoch": 0.67, + "grad_norm": 1.829020372084921, + "learning_rate": 2.528392257289677e-06, + "loss": 0.2847, + "step": 23260 + }, + { + "epoch": 0.67, + "grad_norm": 1.1880979078059197, + "learning_rate": 2.527983961137747e-06, + "loss": 0.2761, + "step": 23261 + }, + { + "epoch": 0.67, + "grad_norm": 1.4646930260494209, + "learning_rate": 2.527575686801388e-06, + "loss": 0.2692, + "step": 23262 + }, + { + "epoch": 0.67, + "grad_norm": 1.3593127712777606, + "learning_rate": 2.5271674342841986e-06, + "loss": 0.2882, + "step": 23263 + }, + { + "epoch": 0.67, + "grad_norm": 1.630493891058044, + "learning_rate": 2.5267592035897844e-06, + "loss": 0.2858, + "step": 23264 + }, + { + "epoch": 0.67, + "grad_norm": 1.349421864779138, + "learning_rate": 2.526350994721747e-06, + "loss": 0.2955, + "step": 23265 + }, + { + "epoch": 0.67, + "grad_norm": 1.671169452358565, + "learning_rate": 2.52594280768369e-06, + "loss": 0.2684, + "step": 23266 + }, + { + "epoch": 0.67, + "grad_norm": 1.216370905692534, + "learning_rate": 2.5255346424792147e-06, + "loss": 0.2772, + "step": 23267 + }, + { + "epoch": 0.67, + "grad_norm": 1.3418597839107895, + "learning_rate": 2.5251264991119247e-06, + "loss": 0.2923, + "step": 23268 + }, + { + "epoch": 0.67, + "grad_norm": 1.407513215182625, + "learning_rate": 2.52471837758542e-06, + "loss": 0.3309, + "step": 23269 + }, + { + "epoch": 0.67, + "grad_norm": 1.3675852382747338, + "learning_rate": 2.524310277903305e-06, + "loss": 0.2746, + "step": 23270 + }, + { + "epoch": 0.67, + "grad_norm": 1.6561859312466163, + "learning_rate": 2.523902200069178e-06, + "loss": 0.2955, + "step": 23271 + }, + { + "epoch": 0.68, + "grad_norm": 1.353980177741998, + "learning_rate": 2.5234941440866413e-06, + "loss": 0.2846, + "step": 23272 + }, + { + "epoch": 0.68, + "grad_norm": 1.1946935926586912, + "learning_rate": 2.5230861099592952e-06, + "loss": 0.2914, + "step": 23273 + }, + { + "epoch": 0.68, + "grad_norm": 1.46525053701793, + "learning_rate": 2.5226780976907427e-06, + "loss": 0.2892, + "step": 23274 + }, + { + "epoch": 0.68, + "grad_norm": 1.3588371754105297, + "learning_rate": 2.5222701072845832e-06, + "loss": 0.2948, + "step": 23275 + }, + { + "epoch": 0.68, + "grad_norm": 1.1808005337489997, + "learning_rate": 2.5218621387444175e-06, + "loss": 0.2803, + "step": 23276 + }, + { + "epoch": 0.68, + "grad_norm": 1.4247739704547322, + "learning_rate": 2.5214541920738462e-06, + "loss": 0.2758, + "step": 23277 + }, + { + "epoch": 0.68, + "grad_norm": 0.9714424988468202, + "learning_rate": 2.521046267276469e-06, + "loss": 0.5808, + "step": 23278 + }, + { + "epoch": 0.68, + "grad_norm": 1.3036463814722767, + "learning_rate": 2.5206383643558876e-06, + "loss": 0.2784, + "step": 23279 + }, + { + "epoch": 0.68, + "grad_norm": 1.2791285998191866, + "learning_rate": 2.5202304833156987e-06, + "loss": 0.281, + "step": 23280 + }, + { + "epoch": 0.68, + "grad_norm": 1.2915355998335845, + "learning_rate": 2.5198226241595047e-06, + "loss": 0.3059, + "step": 23281 + }, + { + "epoch": 0.68, + "grad_norm": 2.0061902797304643, + "learning_rate": 2.519414786890902e-06, + "loss": 0.2944, + "step": 23282 + }, + { + "epoch": 0.68, + "grad_norm": 1.3778342802517787, + "learning_rate": 2.519006971513491e-06, + "loss": 0.2832, + "step": 23283 + }, + { + "epoch": 0.68, + "grad_norm": 1.4748505808493688, + "learning_rate": 2.5185991780308715e-06, + "loss": 0.28, + "step": 23284 + }, + { + "epoch": 0.68, + "grad_norm": 1.8970258130579831, + "learning_rate": 2.518191406446641e-06, + "loss": 0.2821, + "step": 23285 + }, + { + "epoch": 0.68, + "grad_norm": 1.4291754527005553, + "learning_rate": 2.5177836567643997e-06, + "loss": 0.2802, + "step": 23286 + }, + { + "epoch": 0.68, + "grad_norm": 1.5151264936719158, + "learning_rate": 2.5173759289877448e-06, + "loss": 0.2724, + "step": 23287 + }, + { + "epoch": 0.68, + "grad_norm": 1.261521244056309, + "learning_rate": 2.5169682231202747e-06, + "loss": 0.2689, + "step": 23288 + }, + { + "epoch": 0.68, + "grad_norm": 1.4372252679639304, + "learning_rate": 2.51656053916559e-06, + "loss": 0.3057, + "step": 23289 + }, + { + "epoch": 0.68, + "grad_norm": 1.3293112372857574, + "learning_rate": 2.5161528771272834e-06, + "loss": 0.277, + "step": 23290 + }, + { + "epoch": 0.68, + "grad_norm": 0.9291654830269426, + "learning_rate": 2.5157452370089557e-06, + "loss": 0.5838, + "step": 23291 + }, + { + "epoch": 0.68, + "grad_norm": 1.4705257501641664, + "learning_rate": 2.5153376188142042e-06, + "loss": 0.2969, + "step": 23292 + }, + { + "epoch": 0.68, + "grad_norm": 1.5722887657832947, + "learning_rate": 2.514930022546625e-06, + "loss": 0.2655, + "step": 23293 + }, + { + "epoch": 0.68, + "grad_norm": 0.8917000114374933, + "learning_rate": 2.5145224482098163e-06, + "loss": 0.5611, + "step": 23294 + }, + { + "epoch": 0.68, + "grad_norm": 1.2190489392011408, + "learning_rate": 2.5141148958073746e-06, + "loss": 0.2683, + "step": 23295 + }, + { + "epoch": 0.68, + "grad_norm": 1.3138882420341638, + "learning_rate": 2.513707365342897e-06, + "loss": 0.2701, + "step": 23296 + }, + { + "epoch": 0.68, + "grad_norm": 1.313980572211309, + "learning_rate": 2.5132998568199783e-06, + "loss": 0.2833, + "step": 23297 + }, + { + "epoch": 0.68, + "grad_norm": 1.2517540570752734, + "learning_rate": 2.5128923702422186e-06, + "loss": 0.2716, + "step": 23298 + }, + { + "epoch": 0.68, + "grad_norm": 1.3368710544602103, + "learning_rate": 2.5124849056132094e-06, + "loss": 0.2927, + "step": 23299 + }, + { + "epoch": 0.68, + "grad_norm": 1.4334962062578516, + "learning_rate": 2.5120774629365485e-06, + "loss": 0.2826, + "step": 23300 + }, + { + "epoch": 0.68, + "grad_norm": 1.900094814670018, + "learning_rate": 2.511670042215831e-06, + "loss": 0.3115, + "step": 23301 + }, + { + "epoch": 0.68, + "grad_norm": 1.330662538480863, + "learning_rate": 2.511262643454654e-06, + "loss": 0.2811, + "step": 23302 + }, + { + "epoch": 0.68, + "grad_norm": 1.452553699266929, + "learning_rate": 2.510855266656611e-06, + "loss": 0.3022, + "step": 23303 + }, + { + "epoch": 0.68, + "grad_norm": 1.4579266505151252, + "learning_rate": 2.510447911825299e-06, + "loss": 0.2817, + "step": 23304 + }, + { + "epoch": 0.68, + "grad_norm": 1.357728932864607, + "learning_rate": 2.5100405789643107e-06, + "loss": 0.2624, + "step": 23305 + }, + { + "epoch": 0.68, + "grad_norm": 1.3746895952605893, + "learning_rate": 2.509633268077244e-06, + "loss": 0.3067, + "step": 23306 + }, + { + "epoch": 0.68, + "grad_norm": 1.3171360605271318, + "learning_rate": 2.50922597916769e-06, + "loss": 0.2606, + "step": 23307 + }, + { + "epoch": 0.68, + "grad_norm": 1.514798317667346, + "learning_rate": 2.508818712239245e-06, + "loss": 0.3057, + "step": 23308 + }, + { + "epoch": 0.68, + "grad_norm": 1.4430548690308984, + "learning_rate": 2.5084114672955016e-06, + "loss": 0.269, + "step": 23309 + }, + { + "epoch": 0.68, + "grad_norm": 1.3179775801174711, + "learning_rate": 2.508004244340054e-06, + "loss": 0.2844, + "step": 23310 + }, + { + "epoch": 0.68, + "grad_norm": 1.2932793753305123, + "learning_rate": 2.507597043376497e-06, + "loss": 0.2734, + "step": 23311 + }, + { + "epoch": 0.68, + "grad_norm": 1.281551521492239, + "learning_rate": 2.507189864408424e-06, + "loss": 0.2805, + "step": 23312 + }, + { + "epoch": 0.68, + "grad_norm": 1.4831153449770562, + "learning_rate": 2.506782707439428e-06, + "loss": 0.265, + "step": 23313 + }, + { + "epoch": 0.68, + "grad_norm": 1.1941039365909, + "learning_rate": 2.506375572473102e-06, + "loss": 0.2871, + "step": 23314 + }, + { + "epoch": 0.68, + "grad_norm": 1.4233870232354229, + "learning_rate": 2.5059684595130397e-06, + "loss": 0.272, + "step": 23315 + }, + { + "epoch": 0.68, + "grad_norm": 1.2645858558839373, + "learning_rate": 2.5055613685628333e-06, + "loss": 0.2972, + "step": 23316 + }, + { + "epoch": 0.68, + "grad_norm": 1.2680371375156867, + "learning_rate": 2.505154299626077e-06, + "loss": 0.2798, + "step": 23317 + }, + { + "epoch": 0.68, + "grad_norm": 1.3434038407613085, + "learning_rate": 2.50474725270636e-06, + "loss": 0.2845, + "step": 23318 + }, + { + "epoch": 0.68, + "grad_norm": 1.3406648118411664, + "learning_rate": 2.504340227807276e-06, + "loss": 0.3012, + "step": 23319 + }, + { + "epoch": 0.68, + "grad_norm": 1.257870872836263, + "learning_rate": 2.5039332249324174e-06, + "loss": 0.2747, + "step": 23320 + }, + { + "epoch": 0.68, + "grad_norm": 1.2789766293564884, + "learning_rate": 2.503526244085376e-06, + "loss": 0.2709, + "step": 23321 + }, + { + "epoch": 0.68, + "grad_norm": 1.3747555910669076, + "learning_rate": 2.503119285269743e-06, + "loss": 0.2591, + "step": 23322 + }, + { + "epoch": 0.68, + "grad_norm": 1.2603257944085273, + "learning_rate": 2.502712348489109e-06, + "loss": 0.282, + "step": 23323 + }, + { + "epoch": 0.68, + "grad_norm": 1.2581436283249379, + "learning_rate": 2.5023054337470677e-06, + "loss": 0.2963, + "step": 23324 + }, + { + "epoch": 0.68, + "grad_norm": 1.268359193830645, + "learning_rate": 2.5018985410472096e-06, + "loss": 0.2754, + "step": 23325 + }, + { + "epoch": 0.68, + "grad_norm": 1.3393669831359858, + "learning_rate": 2.501491670393123e-06, + "loss": 0.2847, + "step": 23326 + }, + { + "epoch": 0.68, + "grad_norm": 1.186768141282499, + "learning_rate": 2.5010848217884004e-06, + "loss": 0.2915, + "step": 23327 + }, + { + "epoch": 0.68, + "grad_norm": 24.732473359441627, + "learning_rate": 2.5006779952366317e-06, + "loss": 0.2812, + "step": 23328 + }, + { + "epoch": 0.68, + "grad_norm": 2.1019623050606646, + "learning_rate": 2.5002711907414074e-06, + "loss": 0.3204, + "step": 23329 + }, + { + "epoch": 0.68, + "grad_norm": 1.2772733976306545, + "learning_rate": 2.4998644083063177e-06, + "loss": 0.2772, + "step": 23330 + }, + { + "epoch": 0.68, + "grad_norm": 1.455123220516368, + "learning_rate": 2.4994576479349526e-06, + "loss": 0.2843, + "step": 23331 + }, + { + "epoch": 0.68, + "grad_norm": 1.6034534352251695, + "learning_rate": 2.4990509096309007e-06, + "loss": 0.2699, + "step": 23332 + }, + { + "epoch": 0.68, + "grad_norm": 1.2915690629324599, + "learning_rate": 2.498644193397753e-06, + "loss": 0.2728, + "step": 23333 + }, + { + "epoch": 0.68, + "grad_norm": 1.2953249030852358, + "learning_rate": 2.4982374992390994e-06, + "loss": 0.2874, + "step": 23334 + }, + { + "epoch": 0.68, + "grad_norm": 1.4212535886324809, + "learning_rate": 2.497830827158526e-06, + "loss": 0.2843, + "step": 23335 + }, + { + "epoch": 0.68, + "grad_norm": 1.4870669696708818, + "learning_rate": 2.4974241771596254e-06, + "loss": 0.2904, + "step": 23336 + }, + { + "epoch": 0.68, + "grad_norm": 0.9859630983688247, + "learning_rate": 2.4970175492459818e-06, + "loss": 0.5439, + "step": 23337 + }, + { + "epoch": 0.68, + "grad_norm": 1.2303251691885886, + "learning_rate": 2.4966109434211866e-06, + "loss": 0.3077, + "step": 23338 + }, + { + "epoch": 0.68, + "grad_norm": 1.2034510755916252, + "learning_rate": 2.4962043596888273e-06, + "loss": 0.2717, + "step": 23339 + }, + { + "epoch": 0.68, + "grad_norm": 1.272106577027215, + "learning_rate": 2.495797798052493e-06, + "loss": 0.2774, + "step": 23340 + }, + { + "epoch": 0.68, + "grad_norm": 1.4630015422366345, + "learning_rate": 2.4953912585157695e-06, + "loss": 0.3186, + "step": 23341 + }, + { + "epoch": 0.68, + "grad_norm": 1.3018355591193618, + "learning_rate": 2.494984741082247e-06, + "loss": 0.2803, + "step": 23342 + }, + { + "epoch": 0.68, + "grad_norm": 1.5086490193557012, + "learning_rate": 2.494578245755512e-06, + "loss": 0.3391, + "step": 23343 + }, + { + "epoch": 0.68, + "grad_norm": 1.8437873532016336, + "learning_rate": 2.494171772539153e-06, + "loss": 0.3082, + "step": 23344 + }, + { + "epoch": 0.68, + "grad_norm": 1.7568400299969087, + "learning_rate": 2.493765321436755e-06, + "loss": 0.2837, + "step": 23345 + }, + { + "epoch": 0.68, + "grad_norm": 1.3961395942647712, + "learning_rate": 2.493358892451905e-06, + "loss": 0.2863, + "step": 23346 + }, + { + "epoch": 0.68, + "grad_norm": 1.255245777687311, + "learning_rate": 2.4929524855881903e-06, + "loss": 0.2826, + "step": 23347 + }, + { + "epoch": 0.68, + "grad_norm": 1.3112691020441452, + "learning_rate": 2.4925461008491986e-06, + "loss": 0.3263, + "step": 23348 + }, + { + "epoch": 0.68, + "grad_norm": 1.3970076694438247, + "learning_rate": 2.492139738238515e-06, + "loss": 0.3181, + "step": 23349 + }, + { + "epoch": 0.68, + "grad_norm": 1.5563079300447795, + "learning_rate": 2.4917333977597256e-06, + "loss": 0.2839, + "step": 23350 + }, + { + "epoch": 0.68, + "grad_norm": 1.3517527092990593, + "learning_rate": 2.4913270794164173e-06, + "loss": 0.2855, + "step": 23351 + }, + { + "epoch": 0.68, + "grad_norm": 1.2466007368237744, + "learning_rate": 2.4909207832121752e-06, + "loss": 0.2967, + "step": 23352 + }, + { + "epoch": 0.68, + "grad_norm": 1.3649495345683413, + "learning_rate": 2.4905145091505873e-06, + "loss": 0.2867, + "step": 23353 + }, + { + "epoch": 0.68, + "grad_norm": 1.3803201572907011, + "learning_rate": 2.490108257235234e-06, + "loss": 0.2884, + "step": 23354 + }, + { + "epoch": 0.68, + "grad_norm": 1.2531961711958457, + "learning_rate": 2.4897020274697042e-06, + "loss": 0.2745, + "step": 23355 + }, + { + "epoch": 0.68, + "grad_norm": 1.2652541235850037, + "learning_rate": 2.4892958198575813e-06, + "loss": 0.2815, + "step": 23356 + }, + { + "epoch": 0.68, + "grad_norm": 1.4109362353860395, + "learning_rate": 2.4888896344024506e-06, + "loss": 0.2929, + "step": 23357 + }, + { + "epoch": 0.68, + "grad_norm": 1.4684819836365026, + "learning_rate": 2.4884834711078966e-06, + "loss": 0.3123, + "step": 23358 + }, + { + "epoch": 0.68, + "grad_norm": 1.5690440213588843, + "learning_rate": 2.488077329977504e-06, + "loss": 0.2662, + "step": 23359 + }, + { + "epoch": 0.68, + "grad_norm": 1.3941659368921364, + "learning_rate": 2.487671211014857e-06, + "loss": 0.3102, + "step": 23360 + }, + { + "epoch": 0.68, + "grad_norm": 1.251427709218214, + "learning_rate": 2.487265114223541e-06, + "loss": 0.2862, + "step": 23361 + }, + { + "epoch": 0.68, + "grad_norm": 0.9420355181613828, + "learning_rate": 2.4868590396071367e-06, + "loss": 0.5843, + "step": 23362 + }, + { + "epoch": 0.68, + "grad_norm": 1.241379257817879, + "learning_rate": 2.4864529871692313e-06, + "loss": 0.2636, + "step": 23363 + }, + { + "epoch": 0.68, + "grad_norm": 1.2168592300171996, + "learning_rate": 2.486046956913404e-06, + "loss": 0.2764, + "step": 23364 + }, + { + "epoch": 0.68, + "grad_norm": 1.249444785515977, + "learning_rate": 2.4856409488432398e-06, + "loss": 0.2912, + "step": 23365 + }, + { + "epoch": 0.68, + "grad_norm": 1.2283981860534434, + "learning_rate": 2.4852349629623225e-06, + "loss": 0.2644, + "step": 23366 + }, + { + "epoch": 0.68, + "grad_norm": 1.3942304289630107, + "learning_rate": 2.4848289992742343e-06, + "loss": 0.2933, + "step": 23367 + }, + { + "epoch": 0.68, + "grad_norm": 1.182095702688065, + "learning_rate": 2.4844230577825585e-06, + "loss": 0.2648, + "step": 23368 + }, + { + "epoch": 0.68, + "grad_norm": 0.9086099106027863, + "learning_rate": 2.4840171384908767e-06, + "loss": 0.5721, + "step": 23369 + }, + { + "epoch": 0.68, + "grad_norm": 1.2574786723718028, + "learning_rate": 2.483611241402772e-06, + "loss": 0.2769, + "step": 23370 + }, + { + "epoch": 0.68, + "grad_norm": 1.4804405936129519, + "learning_rate": 2.483205366521826e-06, + "loss": 0.2803, + "step": 23371 + }, + { + "epoch": 0.68, + "grad_norm": 1.2664206431896283, + "learning_rate": 2.482799513851622e-06, + "loss": 0.3027, + "step": 23372 + }, + { + "epoch": 0.68, + "grad_norm": 1.2430670044757013, + "learning_rate": 2.4823936833957387e-06, + "loss": 0.2866, + "step": 23373 + }, + { + "epoch": 0.68, + "grad_norm": 1.6539102445089584, + "learning_rate": 2.4819878751577585e-06, + "loss": 0.2997, + "step": 23374 + }, + { + "epoch": 0.68, + "grad_norm": 1.2806052567047388, + "learning_rate": 2.481582089141264e-06, + "loss": 0.2728, + "step": 23375 + }, + { + "epoch": 0.68, + "grad_norm": 1.2883911508223072, + "learning_rate": 2.481176325349835e-06, + "loss": 0.2945, + "step": 23376 + }, + { + "epoch": 0.68, + "grad_norm": 1.3358520321946616, + "learning_rate": 2.4807705837870534e-06, + "loss": 0.279, + "step": 23377 + }, + { + "epoch": 0.68, + "grad_norm": 1.2945469021349385, + "learning_rate": 2.4803648644564987e-06, + "loss": 0.2705, + "step": 23378 + }, + { + "epoch": 0.68, + "grad_norm": 1.1867110882307907, + "learning_rate": 2.4799591673617523e-06, + "loss": 0.2729, + "step": 23379 + }, + { + "epoch": 0.68, + "grad_norm": 1.5009425977171664, + "learning_rate": 2.4795534925063963e-06, + "loss": 0.3041, + "step": 23380 + }, + { + "epoch": 0.68, + "grad_norm": 1.2291497968956593, + "learning_rate": 2.4791478398940065e-06, + "loss": 0.2676, + "step": 23381 + }, + { + "epoch": 0.68, + "grad_norm": 1.5721006641159108, + "learning_rate": 2.478742209528165e-06, + "loss": 0.2979, + "step": 23382 + }, + { + "epoch": 0.68, + "grad_norm": 1.2701437018148622, + "learning_rate": 2.4783366014124514e-06, + "loss": 0.2711, + "step": 23383 + }, + { + "epoch": 0.68, + "grad_norm": 1.3963322920755172, + "learning_rate": 2.4779310155504456e-06, + "loss": 0.2843, + "step": 23384 + }, + { + "epoch": 0.68, + "grad_norm": 1.4768319623097883, + "learning_rate": 2.4775254519457264e-06, + "loss": 0.2941, + "step": 23385 + }, + { + "epoch": 0.68, + "grad_norm": 1.3532843097762146, + "learning_rate": 2.477119910601873e-06, + "loss": 0.2728, + "step": 23386 + }, + { + "epoch": 0.68, + "grad_norm": 1.5486442473170163, + "learning_rate": 2.4767143915224645e-06, + "loss": 0.2878, + "step": 23387 + }, + { + "epoch": 0.68, + "grad_norm": 1.6828926283458818, + "learning_rate": 2.4763088947110793e-06, + "loss": 0.2999, + "step": 23388 + }, + { + "epoch": 0.68, + "grad_norm": 1.757496910127018, + "learning_rate": 2.4759034201712984e-06, + "loss": 0.3109, + "step": 23389 + }, + { + "epoch": 0.68, + "grad_norm": 3.004598412334181, + "learning_rate": 2.475497967906695e-06, + "loss": 0.2715, + "step": 23390 + }, + { + "epoch": 0.68, + "grad_norm": 1.535157734007945, + "learning_rate": 2.475092537920853e-06, + "loss": 0.2869, + "step": 23391 + }, + { + "epoch": 0.68, + "grad_norm": 1.4642796626822163, + "learning_rate": 2.4746871302173447e-06, + "loss": 0.2972, + "step": 23392 + }, + { + "epoch": 0.68, + "grad_norm": 1.3049146862192378, + "learning_rate": 2.474281744799751e-06, + "loss": 0.2674, + "step": 23393 + }, + { + "epoch": 0.68, + "grad_norm": 1.339399930979118, + "learning_rate": 2.4738763816716488e-06, + "loss": 0.3032, + "step": 23394 + }, + { + "epoch": 0.68, + "grad_norm": 1.3756905834320396, + "learning_rate": 2.473471040836616e-06, + "loss": 0.2904, + "step": 23395 + }, + { + "epoch": 0.68, + "grad_norm": 1.3803026281601747, + "learning_rate": 2.4730657222982285e-06, + "loss": 0.2608, + "step": 23396 + }, + { + "epoch": 0.68, + "grad_norm": 1.3957607390535904, + "learning_rate": 2.472660426060064e-06, + "loss": 0.2742, + "step": 23397 + }, + { + "epoch": 0.68, + "grad_norm": 1.6654598429349736, + "learning_rate": 2.4722551521257e-06, + "loss": 0.2914, + "step": 23398 + }, + { + "epoch": 0.68, + "grad_norm": 1.2196472605957354, + "learning_rate": 2.4718499004987135e-06, + "loss": 0.2921, + "step": 23399 + }, + { + "epoch": 0.68, + "grad_norm": 1.3825860483737789, + "learning_rate": 2.4714446711826784e-06, + "loss": 0.2945, + "step": 23400 + }, + { + "epoch": 0.68, + "grad_norm": 1.3156119557884671, + "learning_rate": 2.471039464181172e-06, + "loss": 0.2883, + "step": 23401 + }, + { + "epoch": 0.68, + "grad_norm": 1.6190341415361256, + "learning_rate": 2.47063427949777e-06, + "loss": 0.2715, + "step": 23402 + }, + { + "epoch": 0.68, + "grad_norm": 1.5845901459072658, + "learning_rate": 2.470229117136049e-06, + "loss": 0.291, + "step": 23403 + }, + { + "epoch": 0.68, + "grad_norm": 1.27123722923741, + "learning_rate": 2.4698239770995836e-06, + "loss": 0.2874, + "step": 23404 + }, + { + "epoch": 0.68, + "grad_norm": 1.142531130614679, + "learning_rate": 2.46941885939195e-06, + "loss": 0.2692, + "step": 23405 + }, + { + "epoch": 0.68, + "grad_norm": 1.3311491083012221, + "learning_rate": 2.469013764016723e-06, + "loss": 0.2821, + "step": 23406 + }, + { + "epoch": 0.68, + "grad_norm": 1.7087193511396426, + "learning_rate": 2.4686086909774776e-06, + "loss": 0.2816, + "step": 23407 + }, + { + "epoch": 0.68, + "grad_norm": 1.3916461964825786, + "learning_rate": 2.4682036402777904e-06, + "loss": 0.2585, + "step": 23408 + }, + { + "epoch": 0.68, + "grad_norm": 1.2916208556162965, + "learning_rate": 2.4677986119212322e-06, + "loss": 0.278, + "step": 23409 + }, + { + "epoch": 0.68, + "grad_norm": 1.5056318831088926, + "learning_rate": 2.4673936059113794e-06, + "loss": 0.2766, + "step": 23410 + }, + { + "epoch": 0.68, + "grad_norm": 4.616665174645842, + "learning_rate": 2.4669886222518062e-06, + "loss": 0.2814, + "step": 23411 + }, + { + "epoch": 0.68, + "grad_norm": 2.1421654005036754, + "learning_rate": 2.4665836609460865e-06, + "loss": 0.2556, + "step": 23412 + }, + { + "epoch": 0.68, + "grad_norm": 1.3923893174869748, + "learning_rate": 2.466178721997794e-06, + "loss": 0.2854, + "step": 23413 + }, + { + "epoch": 0.68, + "grad_norm": 1.3697597786107885, + "learning_rate": 2.4657738054105026e-06, + "loss": 0.2936, + "step": 23414 + }, + { + "epoch": 0.68, + "grad_norm": 1.275519071768427, + "learning_rate": 2.4653689111877852e-06, + "loss": 0.272, + "step": 23415 + }, + { + "epoch": 0.68, + "grad_norm": 1.777157863912985, + "learning_rate": 2.4649640393332175e-06, + "loss": 0.3101, + "step": 23416 + }, + { + "epoch": 0.68, + "grad_norm": 1.3796330077081946, + "learning_rate": 2.4645591898503683e-06, + "loss": 0.2764, + "step": 23417 + }, + { + "epoch": 0.68, + "grad_norm": 1.239296119920184, + "learning_rate": 2.464154362742812e-06, + "loss": 0.2687, + "step": 23418 + }, + { + "epoch": 0.68, + "grad_norm": 1.4390399558568534, + "learning_rate": 2.463749558014124e-06, + "loss": 0.2926, + "step": 23419 + }, + { + "epoch": 0.68, + "grad_norm": 1.4397416113548451, + "learning_rate": 2.463344775667872e-06, + "loss": 0.2981, + "step": 23420 + }, + { + "epoch": 0.68, + "grad_norm": 2.196173079452277, + "learning_rate": 2.4629400157076305e-06, + "loss": 0.2985, + "step": 23421 + }, + { + "epoch": 0.68, + "grad_norm": 1.3308916459556797, + "learning_rate": 2.462535278136972e-06, + "loss": 0.2968, + "step": 23422 + }, + { + "epoch": 0.68, + "grad_norm": 1.3804784377751835, + "learning_rate": 2.462130562959467e-06, + "loss": 0.2742, + "step": 23423 + }, + { + "epoch": 0.68, + "grad_norm": 1.5594305775146469, + "learning_rate": 2.461725870178689e-06, + "loss": 0.2945, + "step": 23424 + }, + { + "epoch": 0.68, + "grad_norm": 1.2778804494196476, + "learning_rate": 2.461321199798208e-06, + "loss": 0.2663, + "step": 23425 + }, + { + "epoch": 0.68, + "grad_norm": 1.2906862307189917, + "learning_rate": 2.460916551821595e-06, + "loss": 0.2952, + "step": 23426 + }, + { + "epoch": 0.68, + "grad_norm": 1.7867134797041533, + "learning_rate": 2.4605119262524237e-06, + "loss": 0.2938, + "step": 23427 + }, + { + "epoch": 0.68, + "grad_norm": 2.2054534274255984, + "learning_rate": 2.460107323094261e-06, + "loss": 0.3045, + "step": 23428 + }, + { + "epoch": 0.68, + "grad_norm": 1.1472242161781054, + "learning_rate": 2.4597027423506796e-06, + "loss": 0.2643, + "step": 23429 + }, + { + "epoch": 0.68, + "grad_norm": 1.3630072415895662, + "learning_rate": 2.4592981840252493e-06, + "loss": 0.3176, + "step": 23430 + }, + { + "epoch": 0.68, + "grad_norm": 1.3084856860906744, + "learning_rate": 2.458893648121541e-06, + "loss": 0.2875, + "step": 23431 + }, + { + "epoch": 0.68, + "grad_norm": 1.2422525385010847, + "learning_rate": 2.4584891346431246e-06, + "loss": 0.2859, + "step": 23432 + }, + { + "epoch": 0.68, + "grad_norm": 1.286447003241901, + "learning_rate": 2.4580846435935695e-06, + "loss": 0.2757, + "step": 23433 + }, + { + "epoch": 0.68, + "grad_norm": 1.4284003325970747, + "learning_rate": 2.4576801749764457e-06, + "loss": 0.289, + "step": 23434 + }, + { + "epoch": 0.68, + "grad_norm": 1.4184445914215211, + "learning_rate": 2.4572757287953243e-06, + "loss": 0.2964, + "step": 23435 + }, + { + "epoch": 0.68, + "grad_norm": 1.2896286120005236, + "learning_rate": 2.4568713050537706e-06, + "loss": 0.2786, + "step": 23436 + }, + { + "epoch": 0.68, + "grad_norm": 1.2747678119486392, + "learning_rate": 2.456466903755357e-06, + "loss": 0.3067, + "step": 23437 + }, + { + "epoch": 0.68, + "grad_norm": 1.2974055750596456, + "learning_rate": 2.45606252490365e-06, + "loss": 0.2938, + "step": 23438 + }, + { + "epoch": 0.68, + "grad_norm": 0.9590710820511745, + "learning_rate": 2.4556581685022197e-06, + "loss": 0.5715, + "step": 23439 + }, + { + "epoch": 0.68, + "grad_norm": 1.4776607397317212, + "learning_rate": 2.4552538345546347e-06, + "loss": 0.2803, + "step": 23440 + }, + { + "epoch": 0.68, + "grad_norm": 1.4013003614858084, + "learning_rate": 2.4548495230644625e-06, + "loss": 0.2766, + "step": 23441 + }, + { + "epoch": 0.68, + "grad_norm": 1.366870181569987, + "learning_rate": 2.4544452340352714e-06, + "loss": 0.2791, + "step": 23442 + }, + { + "epoch": 0.68, + "grad_norm": 1.3988149261373235, + "learning_rate": 2.45404096747063e-06, + "loss": 0.273, + "step": 23443 + }, + { + "epoch": 0.68, + "grad_norm": 1.2567500469839041, + "learning_rate": 2.4536367233741065e-06, + "loss": 0.2836, + "step": 23444 + }, + { + "epoch": 0.68, + "grad_norm": 1.3821651045187924, + "learning_rate": 2.453232501749265e-06, + "loss": 0.2668, + "step": 23445 + }, + { + "epoch": 0.68, + "grad_norm": 1.5787176417222333, + "learning_rate": 2.4528283025996756e-06, + "loss": 0.2723, + "step": 23446 + }, + { + "epoch": 0.68, + "grad_norm": 1.368149139215216, + "learning_rate": 2.4524241259289065e-06, + "loss": 0.2995, + "step": 23447 + }, + { + "epoch": 0.68, + "grad_norm": 1.2913813507900194, + "learning_rate": 2.452019971740521e-06, + "loss": 0.2728, + "step": 23448 + }, + { + "epoch": 0.68, + "grad_norm": 1.728154428275929, + "learning_rate": 2.4516158400380875e-06, + "loss": 0.2688, + "step": 23449 + }, + { + "epoch": 0.68, + "grad_norm": 1.2561133002114981, + "learning_rate": 2.451211730825172e-06, + "loss": 0.268, + "step": 23450 + }, + { + "epoch": 0.68, + "grad_norm": 1.2932085290502324, + "learning_rate": 2.4508076441053418e-06, + "loss": 0.2787, + "step": 23451 + }, + { + "epoch": 0.68, + "grad_norm": 1.8439929905098098, + "learning_rate": 2.4504035798821618e-06, + "loss": 0.3022, + "step": 23452 + }, + { + "epoch": 0.68, + "grad_norm": 1.329248310233332, + "learning_rate": 2.449999538159199e-06, + "loss": 0.2915, + "step": 23453 + }, + { + "epoch": 0.68, + "grad_norm": 1.4820311994445117, + "learning_rate": 2.4495955189400205e-06, + "loss": 0.2853, + "step": 23454 + }, + { + "epoch": 0.68, + "grad_norm": 1.2976037783174459, + "learning_rate": 2.4491915222281874e-06, + "loss": 0.2834, + "step": 23455 + }, + { + "epoch": 0.68, + "grad_norm": 1.2629882877611889, + "learning_rate": 2.4487875480272684e-06, + "loss": 0.2862, + "step": 23456 + }, + { + "epoch": 0.68, + "grad_norm": 1.2069426576231068, + "learning_rate": 2.4483835963408265e-06, + "loss": 0.2703, + "step": 23457 + }, + { + "epoch": 0.68, + "grad_norm": 1.2246609217581674, + "learning_rate": 2.447979667172428e-06, + "loss": 0.2796, + "step": 23458 + }, + { + "epoch": 0.68, + "grad_norm": 1.2674844506438154, + "learning_rate": 2.447575760525637e-06, + "loss": 0.2737, + "step": 23459 + }, + { + "epoch": 0.68, + "grad_norm": 0.9649090142534243, + "learning_rate": 2.447171876404019e-06, + "loss": 0.568, + "step": 23460 + }, + { + "epoch": 0.68, + "grad_norm": 1.2721275922066626, + "learning_rate": 2.446768014811137e-06, + "loss": 0.2907, + "step": 23461 + }, + { + "epoch": 0.68, + "grad_norm": 1.465583326934761, + "learning_rate": 2.4463641757505556e-06, + "loss": 0.2824, + "step": 23462 + }, + { + "epoch": 0.68, + "grad_norm": 1.3841755627873025, + "learning_rate": 2.445960359225841e-06, + "loss": 0.2854, + "step": 23463 + }, + { + "epoch": 0.68, + "grad_norm": 1.3169826755040048, + "learning_rate": 2.445556565240552e-06, + "loss": 0.2859, + "step": 23464 + }, + { + "epoch": 0.68, + "grad_norm": 1.3452286630798973, + "learning_rate": 2.4451527937982554e-06, + "loss": 0.2885, + "step": 23465 + }, + { + "epoch": 0.68, + "grad_norm": 1.4667371821195616, + "learning_rate": 2.444749044902513e-06, + "loss": 0.3164, + "step": 23466 + }, + { + "epoch": 0.68, + "grad_norm": 1.5003891428162661, + "learning_rate": 2.4443453185568893e-06, + "loss": 0.289, + "step": 23467 + }, + { + "epoch": 0.68, + "grad_norm": 1.2777766375567685, + "learning_rate": 2.4439416147649457e-06, + "loss": 0.2958, + "step": 23468 + }, + { + "epoch": 0.68, + "grad_norm": 1.4861061340584747, + "learning_rate": 2.4435379335302467e-06, + "loss": 0.3049, + "step": 23469 + }, + { + "epoch": 0.68, + "grad_norm": 1.3982629723306335, + "learning_rate": 2.443134274856353e-06, + "loss": 0.2806, + "step": 23470 + }, + { + "epoch": 0.68, + "grad_norm": 1.2493167063307238, + "learning_rate": 2.4427306387468297e-06, + "loss": 0.2762, + "step": 23471 + }, + { + "epoch": 0.68, + "grad_norm": 1.4051626819791734, + "learning_rate": 2.4423270252052356e-06, + "loss": 0.2952, + "step": 23472 + }, + { + "epoch": 0.68, + "grad_norm": 1.390183105583464, + "learning_rate": 2.4419234342351333e-06, + "loss": 0.3065, + "step": 23473 + }, + { + "epoch": 0.68, + "grad_norm": 1.16758626465215, + "learning_rate": 2.441519865840085e-06, + "loss": 0.2829, + "step": 23474 + }, + { + "epoch": 0.68, + "grad_norm": 1.2950270088917646, + "learning_rate": 2.4411163200236544e-06, + "loss": 0.2617, + "step": 23475 + }, + { + "epoch": 0.68, + "grad_norm": 1.4234207085112829, + "learning_rate": 2.4407127967893983e-06, + "loss": 0.292, + "step": 23476 + }, + { + "epoch": 0.68, + "grad_norm": 1.3913472565330602, + "learning_rate": 2.440309296140881e-06, + "loss": 0.2729, + "step": 23477 + }, + { + "epoch": 0.68, + "grad_norm": 1.3042251669575353, + "learning_rate": 2.439905818081662e-06, + "loss": 0.3111, + "step": 23478 + }, + { + "epoch": 0.68, + "grad_norm": 1.686621172762999, + "learning_rate": 2.4395023626153025e-06, + "loss": 0.2705, + "step": 23479 + }, + { + "epoch": 0.68, + "grad_norm": 3.0446407029750384, + "learning_rate": 2.439098929745363e-06, + "loss": 0.2718, + "step": 23480 + }, + { + "epoch": 0.68, + "grad_norm": 1.1964803928604821, + "learning_rate": 2.4386955194754065e-06, + "loss": 0.265, + "step": 23481 + }, + { + "epoch": 0.68, + "grad_norm": 1.4051352867098748, + "learning_rate": 2.438292131808988e-06, + "loss": 0.3041, + "step": 23482 + }, + { + "epoch": 0.68, + "grad_norm": 1.2890605305718177, + "learning_rate": 2.4378887667496696e-06, + "loss": 0.2858, + "step": 23483 + }, + { + "epoch": 0.68, + "grad_norm": 1.3922433728398718, + "learning_rate": 2.4374854243010122e-06, + "loss": 0.3141, + "step": 23484 + }, + { + "epoch": 0.68, + "grad_norm": 1.2158984938366808, + "learning_rate": 2.437082104466573e-06, + "loss": 0.2792, + "step": 23485 + }, + { + "epoch": 0.68, + "grad_norm": 1.4988086630337945, + "learning_rate": 2.436678807249914e-06, + "loss": 0.2759, + "step": 23486 + }, + { + "epoch": 0.68, + "grad_norm": 1.2102701831156113, + "learning_rate": 2.436275532654592e-06, + "loss": 0.2851, + "step": 23487 + }, + { + "epoch": 0.68, + "grad_norm": 1.3197630309866164, + "learning_rate": 2.4358722806841672e-06, + "loss": 0.2921, + "step": 23488 + }, + { + "epoch": 0.68, + "grad_norm": 1.219273015154915, + "learning_rate": 2.4354690513421982e-06, + "loss": 0.2876, + "step": 23489 + }, + { + "epoch": 0.68, + "grad_norm": 1.2617685776442622, + "learning_rate": 2.435065844632245e-06, + "loss": 0.2805, + "step": 23490 + }, + { + "epoch": 0.68, + "grad_norm": 1.184251133357628, + "learning_rate": 2.434662660557862e-06, + "loss": 0.2658, + "step": 23491 + }, + { + "epoch": 0.68, + "grad_norm": 1.2100466370283305, + "learning_rate": 2.43425949912261e-06, + "loss": 0.2821, + "step": 23492 + }, + { + "epoch": 0.68, + "grad_norm": 1.3926661403837202, + "learning_rate": 2.4338563603300457e-06, + "loss": 0.2969, + "step": 23493 + }, + { + "epoch": 0.68, + "grad_norm": 1.3536414444450489, + "learning_rate": 2.4334532441837277e-06, + "loss": 0.2831, + "step": 23494 + }, + { + "epoch": 0.68, + "grad_norm": 1.273761639798247, + "learning_rate": 2.4330501506872138e-06, + "loss": 0.3133, + "step": 23495 + }, + { + "epoch": 0.68, + "grad_norm": 1.2344814613897515, + "learning_rate": 2.4326470798440603e-06, + "loss": 0.2761, + "step": 23496 + }, + { + "epoch": 0.68, + "grad_norm": 1.3550265963869264, + "learning_rate": 2.4322440316578248e-06, + "loss": 0.2865, + "step": 23497 + }, + { + "epoch": 0.68, + "grad_norm": 1.1831514156031413, + "learning_rate": 2.4318410061320648e-06, + "loss": 0.2757, + "step": 23498 + }, + { + "epoch": 0.68, + "grad_norm": 0.9081255924728154, + "learning_rate": 2.431438003270338e-06, + "loss": 0.5367, + "step": 23499 + }, + { + "epoch": 0.68, + "grad_norm": 1.516929357070544, + "learning_rate": 2.431035023076197e-06, + "loss": 0.3151, + "step": 23500 + }, + { + "epoch": 0.68, + "grad_norm": 1.3447489675897921, + "learning_rate": 2.430632065553201e-06, + "loss": 0.2929, + "step": 23501 + }, + { + "epoch": 0.68, + "grad_norm": 1.604498921115291, + "learning_rate": 2.4302291307049047e-06, + "loss": 0.2955, + "step": 23502 + }, + { + "epoch": 0.68, + "grad_norm": 1.2941102763234849, + "learning_rate": 2.4298262185348675e-06, + "loss": 0.3019, + "step": 23503 + }, + { + "epoch": 0.68, + "grad_norm": 1.2750008670253459, + "learning_rate": 2.42942332904664e-06, + "loss": 0.2957, + "step": 23504 + }, + { + "epoch": 0.68, + "grad_norm": 1.2625290588193092, + "learning_rate": 2.4290204622437806e-06, + "loss": 0.2842, + "step": 23505 + }, + { + "epoch": 0.68, + "grad_norm": 1.3188149941428462, + "learning_rate": 2.428617618129844e-06, + "loss": 0.3289, + "step": 23506 + }, + { + "epoch": 0.68, + "grad_norm": 1.6209489745290142, + "learning_rate": 2.4282147967083853e-06, + "loss": 0.3554, + "step": 23507 + }, + { + "epoch": 0.68, + "grad_norm": 1.1904931974007205, + "learning_rate": 2.4278119979829594e-06, + "loss": 0.2674, + "step": 23508 + }, + { + "epoch": 0.68, + "grad_norm": 1.5868595149737492, + "learning_rate": 2.4274092219571237e-06, + "loss": 0.2888, + "step": 23509 + }, + { + "epoch": 0.68, + "grad_norm": 1.2649061125842063, + "learning_rate": 2.427006468634428e-06, + "loss": 0.2924, + "step": 23510 + }, + { + "epoch": 0.68, + "grad_norm": 1.3117485121945034, + "learning_rate": 2.426603738018429e-06, + "loss": 0.2583, + "step": 23511 + }, + { + "epoch": 0.68, + "grad_norm": 1.158645323765807, + "learning_rate": 2.42620103011268e-06, + "loss": 0.2794, + "step": 23512 + }, + { + "epoch": 0.68, + "grad_norm": 1.200924253882369, + "learning_rate": 2.4257983449207355e-06, + "loss": 0.2795, + "step": 23513 + }, + { + "epoch": 0.68, + "grad_norm": 1.4988364653169581, + "learning_rate": 2.4253956824461495e-06, + "loss": 0.3096, + "step": 23514 + }, + { + "epoch": 0.68, + "grad_norm": 1.2771861436784346, + "learning_rate": 2.4249930426924745e-06, + "loss": 0.2728, + "step": 23515 + }, + { + "epoch": 0.68, + "grad_norm": 1.173128462194375, + "learning_rate": 2.4245904256632653e-06, + "loss": 0.2551, + "step": 23516 + }, + { + "epoch": 0.68, + "grad_norm": 1.63720318744538, + "learning_rate": 2.4241878313620753e-06, + "loss": 0.2732, + "step": 23517 + }, + { + "epoch": 0.68, + "grad_norm": 1.3502284050753055, + "learning_rate": 2.4237852597924548e-06, + "loss": 0.2671, + "step": 23518 + }, + { + "epoch": 0.68, + "grad_norm": 1.2575221086667503, + "learning_rate": 2.4233827109579584e-06, + "loss": 0.2803, + "step": 23519 + }, + { + "epoch": 0.68, + "grad_norm": 1.2543221878002977, + "learning_rate": 2.4229801848621377e-06, + "loss": 0.3071, + "step": 23520 + }, + { + "epoch": 0.68, + "grad_norm": 1.4690706405281135, + "learning_rate": 2.422577681508545e-06, + "loss": 0.2951, + "step": 23521 + }, + { + "epoch": 0.68, + "grad_norm": 1.5084326510191153, + "learning_rate": 2.4221752009007333e-06, + "loss": 0.3236, + "step": 23522 + }, + { + "epoch": 0.68, + "grad_norm": 1.3284029824517327, + "learning_rate": 2.4217727430422544e-06, + "loss": 0.3094, + "step": 23523 + }, + { + "epoch": 0.68, + "grad_norm": 1.6042193471119193, + "learning_rate": 2.421370307936659e-06, + "loss": 0.291, + "step": 23524 + }, + { + "epoch": 0.68, + "grad_norm": 1.4434751655045002, + "learning_rate": 2.4209678955874994e-06, + "loss": 0.3015, + "step": 23525 + }, + { + "epoch": 0.68, + "grad_norm": 1.5119817679349297, + "learning_rate": 2.420565505998329e-06, + "loss": 0.3143, + "step": 23526 + }, + { + "epoch": 0.68, + "grad_norm": 1.513342279321378, + "learning_rate": 2.4201631391726943e-06, + "loss": 0.2863, + "step": 23527 + }, + { + "epoch": 0.68, + "grad_norm": 2.6011162139818667, + "learning_rate": 2.419760795114149e-06, + "loss": 0.2824, + "step": 23528 + }, + { + "epoch": 0.68, + "grad_norm": 5.706402386012303, + "learning_rate": 2.4193584738262426e-06, + "loss": 0.3059, + "step": 23529 + }, + { + "epoch": 0.68, + "grad_norm": 1.272117344959533, + "learning_rate": 2.4189561753125268e-06, + "loss": 0.2677, + "step": 23530 + }, + { + "epoch": 0.68, + "grad_norm": 1.3187417363793197, + "learning_rate": 2.418553899576552e-06, + "loss": 0.2839, + "step": 23531 + }, + { + "epoch": 0.68, + "grad_norm": 1.6252946226484892, + "learning_rate": 2.4181516466218684e-06, + "loss": 0.2758, + "step": 23532 + }, + { + "epoch": 0.68, + "grad_norm": 1.2699544903197175, + "learning_rate": 2.4177494164520236e-06, + "loss": 0.2854, + "step": 23533 + }, + { + "epoch": 0.68, + "grad_norm": 1.447352435590144, + "learning_rate": 2.417347209070569e-06, + "loss": 0.3024, + "step": 23534 + }, + { + "epoch": 0.68, + "grad_norm": 1.4191253992307957, + "learning_rate": 2.4169450244810537e-06, + "loss": 0.3173, + "step": 23535 + }, + { + "epoch": 0.68, + "grad_norm": 1.3033196782647334, + "learning_rate": 2.4165428626870296e-06, + "loss": 0.2874, + "step": 23536 + }, + { + "epoch": 0.68, + "grad_norm": 1.5255808807527083, + "learning_rate": 2.416140723692041e-06, + "loss": 0.3204, + "step": 23537 + }, + { + "epoch": 0.68, + "grad_norm": 1.2873119651182179, + "learning_rate": 2.4157386074996397e-06, + "loss": 0.2906, + "step": 23538 + }, + { + "epoch": 0.68, + "grad_norm": 1.213425842957924, + "learning_rate": 2.4153365141133733e-06, + "loss": 0.2718, + "step": 23539 + }, + { + "epoch": 0.68, + "grad_norm": 1.2247836586705043, + "learning_rate": 2.414934443536791e-06, + "loss": 0.3233, + "step": 23540 + }, + { + "epoch": 0.68, + "grad_norm": 1.3183831489573086, + "learning_rate": 2.4145323957734413e-06, + "loss": 0.2726, + "step": 23541 + }, + { + "epoch": 0.68, + "grad_norm": 1.3436763629831563, + "learning_rate": 2.4141303708268716e-06, + "loss": 0.2857, + "step": 23542 + }, + { + "epoch": 0.68, + "grad_norm": 1.3392951072823056, + "learning_rate": 2.4137283687006295e-06, + "loss": 0.2956, + "step": 23543 + }, + { + "epoch": 0.68, + "grad_norm": 1.394307324396702, + "learning_rate": 2.413326389398264e-06, + "loss": 0.2752, + "step": 23544 + }, + { + "epoch": 0.68, + "grad_norm": 4.582496750945687, + "learning_rate": 2.412924432923324e-06, + "loss": 0.2777, + "step": 23545 + }, + { + "epoch": 0.68, + "grad_norm": 1.2543551302580003, + "learning_rate": 2.4125224992793516e-06, + "loss": 0.3057, + "step": 23546 + }, + { + "epoch": 0.68, + "grad_norm": 1.3257169047141188, + "learning_rate": 2.4121205884698978e-06, + "loss": 0.2962, + "step": 23547 + }, + { + "epoch": 0.68, + "grad_norm": 1.32046013793298, + "learning_rate": 2.4117187004985083e-06, + "loss": 0.2755, + "step": 23548 + }, + { + "epoch": 0.68, + "grad_norm": 1.686984448602857, + "learning_rate": 2.4113168353687294e-06, + "loss": 0.2877, + "step": 23549 + }, + { + "epoch": 0.68, + "grad_norm": 1.4475138659430837, + "learning_rate": 2.4109149930841084e-06, + "loss": 0.3048, + "step": 23550 + }, + { + "epoch": 0.68, + "grad_norm": 1.3280921561182202, + "learning_rate": 2.410513173648192e-06, + "loss": 0.2765, + "step": 23551 + }, + { + "epoch": 0.68, + "grad_norm": 1.2678055743861343, + "learning_rate": 2.4101113770645245e-06, + "loss": 0.3149, + "step": 23552 + }, + { + "epoch": 0.68, + "grad_norm": 0.9793438042456315, + "learning_rate": 2.4097096033366556e-06, + "loss": 0.5944, + "step": 23553 + }, + { + "epoch": 0.68, + "grad_norm": 1.7535790704082683, + "learning_rate": 2.4093078524681255e-06, + "loss": 0.2739, + "step": 23554 + }, + { + "epoch": 0.68, + "grad_norm": 1.412500956128126, + "learning_rate": 2.4089061244624835e-06, + "loss": 0.2745, + "step": 23555 + }, + { + "epoch": 0.68, + "grad_norm": 1.5006395323940882, + "learning_rate": 2.4085044193232726e-06, + "loss": 0.2894, + "step": 23556 + }, + { + "epoch": 0.68, + "grad_norm": 1.2529247680006472, + "learning_rate": 2.4081027370540394e-06, + "loss": 0.2797, + "step": 23557 + }, + { + "epoch": 0.68, + "grad_norm": 1.3442895171222122, + "learning_rate": 2.4077010776583282e-06, + "loss": 0.2809, + "step": 23558 + }, + { + "epoch": 0.68, + "grad_norm": 1.2437235452340782, + "learning_rate": 2.407299441139684e-06, + "loss": 0.2897, + "step": 23559 + }, + { + "epoch": 0.68, + "grad_norm": 1.2625391871138267, + "learning_rate": 2.4068978275016523e-06, + "loss": 0.2638, + "step": 23560 + }, + { + "epoch": 0.68, + "grad_norm": 1.4547032347892377, + "learning_rate": 2.4064962367477747e-06, + "loss": 0.2879, + "step": 23561 + }, + { + "epoch": 0.68, + "grad_norm": 1.5062100955385855, + "learning_rate": 2.406094668881596e-06, + "loss": 0.2892, + "step": 23562 + }, + { + "epoch": 0.68, + "grad_norm": 0.8933128180334563, + "learning_rate": 2.4056931239066618e-06, + "loss": 0.5862, + "step": 23563 + }, + { + "epoch": 0.68, + "grad_norm": 1.0247115252024328, + "learning_rate": 2.4052916018265153e-06, + "loss": 0.5647, + "step": 23564 + }, + { + "epoch": 0.68, + "grad_norm": 1.6861019817069836, + "learning_rate": 2.4048901026446975e-06, + "loss": 0.2807, + "step": 23565 + }, + { + "epoch": 0.68, + "grad_norm": 1.717954477177659, + "learning_rate": 2.4044886263647536e-06, + "loss": 0.2646, + "step": 23566 + }, + { + "epoch": 0.68, + "grad_norm": 1.3390862209361305, + "learning_rate": 2.4040871729902264e-06, + "loss": 0.2766, + "step": 23567 + }, + { + "epoch": 0.68, + "grad_norm": 1.412246952176927, + "learning_rate": 2.4036857425246587e-06, + "loss": 0.2788, + "step": 23568 + }, + { + "epoch": 0.68, + "grad_norm": 1.217357466375855, + "learning_rate": 2.403284334971593e-06, + "loss": 0.2752, + "step": 23569 + }, + { + "epoch": 0.68, + "grad_norm": 1.3193793290039042, + "learning_rate": 2.4028829503345718e-06, + "loss": 0.2957, + "step": 23570 + }, + { + "epoch": 0.68, + "grad_norm": 1.4237560986472126, + "learning_rate": 2.402481588617137e-06, + "loss": 0.265, + "step": 23571 + }, + { + "epoch": 0.68, + "grad_norm": 1.362412941543329, + "learning_rate": 2.4020802498228333e-06, + "loss": 0.2765, + "step": 23572 + }, + { + "epoch": 0.68, + "grad_norm": 1.3597152280173266, + "learning_rate": 2.401678933955199e-06, + "loss": 0.3036, + "step": 23573 + }, + { + "epoch": 0.68, + "grad_norm": 1.0108654230771303, + "learning_rate": 2.4012776410177757e-06, + "loss": 0.5721, + "step": 23574 + }, + { + "epoch": 0.68, + "grad_norm": 1.4102284959923588, + "learning_rate": 2.400876371014107e-06, + "loss": 0.2976, + "step": 23575 + }, + { + "epoch": 0.68, + "grad_norm": 1.3038178731605095, + "learning_rate": 2.4004751239477327e-06, + "loss": 0.2845, + "step": 23576 + }, + { + "epoch": 0.68, + "grad_norm": 1.1789316826092269, + "learning_rate": 2.4000738998221945e-06, + "loss": 0.2751, + "step": 23577 + }, + { + "epoch": 0.68, + "grad_norm": 1.4581863747867692, + "learning_rate": 2.3996726986410324e-06, + "loss": 0.3033, + "step": 23578 + }, + { + "epoch": 0.68, + "grad_norm": 1.2960396754985204, + "learning_rate": 2.3992715204077884e-06, + "loss": 0.3005, + "step": 23579 + }, + { + "epoch": 0.68, + "grad_norm": 1.5902904480584121, + "learning_rate": 2.3988703651260016e-06, + "loss": 0.3063, + "step": 23580 + }, + { + "epoch": 0.68, + "grad_norm": 0.9265074911852431, + "learning_rate": 2.3984692327992143e-06, + "loss": 0.58, + "step": 23581 + }, + { + "epoch": 0.68, + "grad_norm": 1.2279655941851086, + "learning_rate": 2.3980681234309636e-06, + "loss": 0.2629, + "step": 23582 + }, + { + "epoch": 0.68, + "grad_norm": 1.3703266374372567, + "learning_rate": 2.3976670370247902e-06, + "loss": 0.2947, + "step": 23583 + }, + { + "epoch": 0.68, + "grad_norm": 1.347930701537185, + "learning_rate": 2.3972659735842342e-06, + "loss": 0.3179, + "step": 23584 + }, + { + "epoch": 0.68, + "grad_norm": 1.4323486644788728, + "learning_rate": 2.396864933112835e-06, + "loss": 0.2729, + "step": 23585 + }, + { + "epoch": 0.68, + "grad_norm": 1.2619628726225363, + "learning_rate": 2.396463915614131e-06, + "loss": 0.2521, + "step": 23586 + }, + { + "epoch": 0.68, + "grad_norm": 1.3711972697639059, + "learning_rate": 2.3960629210916624e-06, + "loss": 0.2867, + "step": 23587 + }, + { + "epoch": 0.68, + "grad_norm": 1.3117832638037465, + "learning_rate": 2.39566194954897e-06, + "loss": 0.2656, + "step": 23588 + }, + { + "epoch": 0.68, + "grad_norm": 2.5280989996922654, + "learning_rate": 2.3952610009895866e-06, + "loss": 0.2655, + "step": 23589 + }, + { + "epoch": 0.68, + "grad_norm": 1.9443309291301096, + "learning_rate": 2.3948600754170543e-06, + "loss": 0.3092, + "step": 23590 + }, + { + "epoch": 0.68, + "grad_norm": 0.9819227159794639, + "learning_rate": 2.3944591728349126e-06, + "loss": 0.544, + "step": 23591 + }, + { + "epoch": 0.68, + "grad_norm": 1.373913888704919, + "learning_rate": 2.3940582932466957e-06, + "loss": 0.2879, + "step": 23592 + }, + { + "epoch": 0.68, + "grad_norm": 0.8953237776002437, + "learning_rate": 2.3936574366559434e-06, + "loss": 0.6064, + "step": 23593 + }, + { + "epoch": 0.68, + "grad_norm": 1.364728222473628, + "learning_rate": 2.393256603066193e-06, + "loss": 0.2746, + "step": 23594 + }, + { + "epoch": 0.68, + "grad_norm": 1.3876164494697398, + "learning_rate": 2.392855792480982e-06, + "loss": 0.285, + "step": 23595 + }, + { + "epoch": 0.68, + "grad_norm": 1.4381701322333293, + "learning_rate": 2.3924550049038475e-06, + "loss": 0.2731, + "step": 23596 + }, + { + "epoch": 0.68, + "grad_norm": 2.8093645118458155, + "learning_rate": 2.392054240338327e-06, + "loss": 0.2712, + "step": 23597 + }, + { + "epoch": 0.68, + "grad_norm": 1.5547078207672114, + "learning_rate": 2.3916534987879558e-06, + "loss": 0.2798, + "step": 23598 + }, + { + "epoch": 0.68, + "grad_norm": 1.4217934207380531, + "learning_rate": 2.391252780256272e-06, + "loss": 0.3053, + "step": 23599 + }, + { + "epoch": 0.68, + "grad_norm": 1.4123124499993671, + "learning_rate": 2.390852084746813e-06, + "loss": 0.3001, + "step": 23600 + }, + { + "epoch": 0.68, + "grad_norm": 1.3239211392814807, + "learning_rate": 2.390451412263112e-06, + "loss": 0.3002, + "step": 23601 + }, + { + "epoch": 0.68, + "grad_norm": 1.2864244596798988, + "learning_rate": 2.3900507628087057e-06, + "loss": 0.3025, + "step": 23602 + }, + { + "epoch": 0.68, + "grad_norm": 1.2875480902544323, + "learning_rate": 2.3896501363871306e-06, + "loss": 0.2657, + "step": 23603 + }, + { + "epoch": 0.68, + "grad_norm": 1.333365501474769, + "learning_rate": 2.389249533001922e-06, + "loss": 0.2836, + "step": 23604 + }, + { + "epoch": 0.68, + "grad_norm": 1.3664156808462298, + "learning_rate": 2.388848952656615e-06, + "loss": 0.2832, + "step": 23605 + }, + { + "epoch": 0.68, + "grad_norm": 1.3946845645733825, + "learning_rate": 2.3884483953547448e-06, + "loss": 0.2788, + "step": 23606 + }, + { + "epoch": 0.68, + "grad_norm": 0.945449744533708, + "learning_rate": 2.388047861099847e-06, + "loss": 0.5753, + "step": 23607 + }, + { + "epoch": 0.68, + "grad_norm": 0.9331151811355018, + "learning_rate": 2.3876473498954573e-06, + "loss": 0.5568, + "step": 23608 + }, + { + "epoch": 0.68, + "grad_norm": 1.7046759607133075, + "learning_rate": 2.387246861745107e-06, + "loss": 0.2825, + "step": 23609 + }, + { + "epoch": 0.68, + "grad_norm": 1.350342807846268, + "learning_rate": 2.386846396652332e-06, + "loss": 0.2828, + "step": 23610 + }, + { + "epoch": 0.68, + "grad_norm": 1.1407532127550142, + "learning_rate": 2.386445954620667e-06, + "loss": 0.2765, + "step": 23611 + }, + { + "epoch": 0.68, + "grad_norm": 1.3778606367814492, + "learning_rate": 2.386045535653645e-06, + "loss": 0.2838, + "step": 23612 + }, + { + "epoch": 0.68, + "grad_norm": 1.4426314437858698, + "learning_rate": 2.3856451397548004e-06, + "loss": 0.28, + "step": 23613 + }, + { + "epoch": 0.68, + "grad_norm": 1.2295696533971725, + "learning_rate": 2.3852447669276663e-06, + "loss": 0.2748, + "step": 23614 + }, + { + "epoch": 0.68, + "grad_norm": 1.296389313981684, + "learning_rate": 2.3848444171757764e-06, + "loss": 0.279, + "step": 23615 + }, + { + "epoch": 0.68, + "grad_norm": 1.2653739201979173, + "learning_rate": 2.3844440905026646e-06, + "loss": 0.2852, + "step": 23616 + }, + { + "epoch": 0.69, + "grad_norm": 1.675086536796731, + "learning_rate": 2.3840437869118616e-06, + "loss": 0.2618, + "step": 23617 + }, + { + "epoch": 0.69, + "grad_norm": 1.219708359191806, + "learning_rate": 2.3836435064069006e-06, + "loss": 0.2805, + "step": 23618 + }, + { + "epoch": 0.69, + "grad_norm": 1.2785285859932896, + "learning_rate": 2.383243248991317e-06, + "loss": 0.279, + "step": 23619 + }, + { + "epoch": 0.69, + "grad_norm": 0.98139300764966, + "learning_rate": 2.382843014668639e-06, + "loss": 0.5626, + "step": 23620 + }, + { + "epoch": 0.69, + "grad_norm": 1.665271360374928, + "learning_rate": 2.3824428034424e-06, + "loss": 0.2728, + "step": 23621 + }, + { + "epoch": 0.69, + "grad_norm": 15.297725481664548, + "learning_rate": 2.382042615316133e-06, + "loss": 0.3058, + "step": 23622 + }, + { + "epoch": 0.69, + "grad_norm": 1.4084585507103056, + "learning_rate": 2.3816424502933684e-06, + "loss": 0.2873, + "step": 23623 + }, + { + "epoch": 0.69, + "grad_norm": 1.2265457295721542, + "learning_rate": 2.381242308377638e-06, + "loss": 0.2821, + "step": 23624 + }, + { + "epoch": 0.69, + "grad_norm": 1.5418591702206863, + "learning_rate": 2.3808421895724738e-06, + "loss": 0.2665, + "step": 23625 + }, + { + "epoch": 0.69, + "grad_norm": 1.4713168953892548, + "learning_rate": 2.380442093881406e-06, + "loss": 0.3026, + "step": 23626 + }, + { + "epoch": 0.69, + "grad_norm": 1.3199241176856744, + "learning_rate": 2.380042021307968e-06, + "loss": 0.2844, + "step": 23627 + }, + { + "epoch": 0.69, + "grad_norm": 1.4180397490992689, + "learning_rate": 2.379641971855686e-06, + "loss": 0.2811, + "step": 23628 + }, + { + "epoch": 0.69, + "grad_norm": 1.2941970601309196, + "learning_rate": 2.379241945528093e-06, + "loss": 0.2773, + "step": 23629 + }, + { + "epoch": 0.69, + "grad_norm": 1.3223453380631045, + "learning_rate": 2.378841942328719e-06, + "loss": 0.286, + "step": 23630 + }, + { + "epoch": 0.69, + "grad_norm": 1.5118465159010146, + "learning_rate": 2.378441962261093e-06, + "loss": 0.2847, + "step": 23631 + }, + { + "epoch": 0.69, + "grad_norm": 1.4818237162440242, + "learning_rate": 2.3780420053287468e-06, + "loss": 0.2978, + "step": 23632 + }, + { + "epoch": 0.69, + "grad_norm": 1.6695324993140608, + "learning_rate": 2.377642071535209e-06, + "loss": 0.276, + "step": 23633 + }, + { + "epoch": 0.69, + "grad_norm": 1.487242456468403, + "learning_rate": 2.3772421608840084e-06, + "loss": 0.3097, + "step": 23634 + }, + { + "epoch": 0.69, + "grad_norm": 1.605433046901651, + "learning_rate": 2.376842273378675e-06, + "loss": 0.2859, + "step": 23635 + }, + { + "epoch": 0.69, + "grad_norm": 1.2955736603919479, + "learning_rate": 2.3764424090227397e-06, + "loss": 0.303, + "step": 23636 + }, + { + "epoch": 0.69, + "grad_norm": 1.7410695932211735, + "learning_rate": 2.376042567819727e-06, + "loss": 0.3312, + "step": 23637 + }, + { + "epoch": 0.69, + "grad_norm": 1.3059803947511246, + "learning_rate": 2.3756427497731684e-06, + "loss": 0.2587, + "step": 23638 + }, + { + "epoch": 0.69, + "grad_norm": 1.3811695498845267, + "learning_rate": 2.3752429548865906e-06, + "loss": 0.2639, + "step": 23639 + }, + { + "epoch": 0.69, + "grad_norm": 1.3488220902346244, + "learning_rate": 2.3748431831635237e-06, + "loss": 0.289, + "step": 23640 + }, + { + "epoch": 0.69, + "grad_norm": 1.2762268003690995, + "learning_rate": 2.3744434346074945e-06, + "loss": 0.2777, + "step": 23641 + }, + { + "epoch": 0.69, + "grad_norm": 1.268458676647654, + "learning_rate": 2.3740437092220316e-06, + "loss": 0.2839, + "step": 23642 + }, + { + "epoch": 0.69, + "grad_norm": 1.2014292542248775, + "learning_rate": 2.3736440070106613e-06, + "loss": 0.2856, + "step": 23643 + }, + { + "epoch": 0.69, + "grad_norm": 1.680519118854205, + "learning_rate": 2.373244327976914e-06, + "loss": 0.2942, + "step": 23644 + }, + { + "epoch": 0.69, + "grad_norm": 1.5194750225631826, + "learning_rate": 2.3728446721243133e-06, + "loss": 0.2963, + "step": 23645 + }, + { + "epoch": 0.69, + "grad_norm": 1.3453589932414742, + "learning_rate": 2.3724450394563885e-06, + "loss": 0.2897, + "step": 23646 + }, + { + "epoch": 0.69, + "grad_norm": 1.258761704010476, + "learning_rate": 2.3720454299766636e-06, + "loss": 0.2639, + "step": 23647 + }, + { + "epoch": 0.69, + "grad_norm": 1.3306417905774404, + "learning_rate": 2.3716458436886673e-06, + "loss": 0.2884, + "step": 23648 + }, + { + "epoch": 0.69, + "grad_norm": 1.457034450671983, + "learning_rate": 2.371246280595926e-06, + "loss": 0.303, + "step": 23649 + }, + { + "epoch": 0.69, + "grad_norm": 1.3843013335467325, + "learning_rate": 2.3708467407019645e-06, + "loss": 0.2836, + "step": 23650 + }, + { + "epoch": 0.69, + "grad_norm": 1.3499628479154528, + "learning_rate": 2.3704472240103105e-06, + "loss": 0.282, + "step": 23651 + }, + { + "epoch": 0.69, + "grad_norm": 1.537955972524241, + "learning_rate": 2.3700477305244886e-06, + "loss": 0.2711, + "step": 23652 + }, + { + "epoch": 0.69, + "grad_norm": 1.2463251217036344, + "learning_rate": 2.3696482602480247e-06, + "loss": 0.2929, + "step": 23653 + }, + { + "epoch": 0.69, + "grad_norm": 1.5126845827591984, + "learning_rate": 2.3692488131844436e-06, + "loss": 0.3102, + "step": 23654 + }, + { + "epoch": 0.69, + "grad_norm": 1.4696379817699698, + "learning_rate": 2.3688493893372733e-06, + "loss": 0.2892, + "step": 23655 + }, + { + "epoch": 0.69, + "grad_norm": 1.3562344230433139, + "learning_rate": 2.3684499887100344e-06, + "loss": 0.2908, + "step": 23656 + }, + { + "epoch": 0.69, + "grad_norm": 0.9385735447302921, + "learning_rate": 2.3680506113062536e-06, + "loss": 0.5705, + "step": 23657 + }, + { + "epoch": 0.69, + "grad_norm": 1.3911864409833008, + "learning_rate": 2.3676512571294553e-06, + "loss": 0.2668, + "step": 23658 + }, + { + "epoch": 0.69, + "grad_norm": 1.4618900656648666, + "learning_rate": 2.3672519261831638e-06, + "loss": 0.2915, + "step": 23659 + }, + { + "epoch": 0.69, + "grad_norm": 1.5102624288184399, + "learning_rate": 2.3668526184709033e-06, + "loss": 0.29, + "step": 23660 + }, + { + "epoch": 0.69, + "grad_norm": 1.3021486470536874, + "learning_rate": 2.3664533339961977e-06, + "loss": 0.2788, + "step": 23661 + }, + { + "epoch": 0.69, + "grad_norm": 1.6852809730298761, + "learning_rate": 2.36605407276257e-06, + "loss": 0.2782, + "step": 23662 + }, + { + "epoch": 0.69, + "grad_norm": 1.7896828416146782, + "learning_rate": 2.365654834773547e-06, + "loss": 0.2885, + "step": 23663 + }, + { + "epoch": 0.69, + "grad_norm": 0.896321544289418, + "learning_rate": 2.365255620032647e-06, + "loss": 0.5682, + "step": 23664 + }, + { + "epoch": 0.69, + "grad_norm": 1.3283144133179203, + "learning_rate": 2.3648564285433957e-06, + "loss": 0.2924, + "step": 23665 + }, + { + "epoch": 0.69, + "grad_norm": 1.318811496021561, + "learning_rate": 2.3644572603093156e-06, + "loss": 0.3079, + "step": 23666 + }, + { + "epoch": 0.69, + "grad_norm": 1.4540144679468585, + "learning_rate": 2.3640581153339293e-06, + "loss": 0.3045, + "step": 23667 + }, + { + "epoch": 0.69, + "grad_norm": 1.3577305737437468, + "learning_rate": 2.3636589936207592e-06, + "loss": 0.2794, + "step": 23668 + }, + { + "epoch": 0.69, + "grad_norm": 1.4687869359805943, + "learning_rate": 2.363259895173328e-06, + "loss": 0.2764, + "step": 23669 + }, + { + "epoch": 0.69, + "grad_norm": 1.1788369576305764, + "learning_rate": 2.3628608199951575e-06, + "loss": 0.277, + "step": 23670 + }, + { + "epoch": 0.69, + "grad_norm": 1.2674791758988067, + "learning_rate": 2.3624617680897695e-06, + "loss": 0.2709, + "step": 23671 + }, + { + "epoch": 0.69, + "grad_norm": 1.2493084753569517, + "learning_rate": 2.3620627394606875e-06, + "loss": 0.2895, + "step": 23672 + }, + { + "epoch": 0.69, + "grad_norm": 1.3765199097177552, + "learning_rate": 2.361663734111429e-06, + "loss": 0.2917, + "step": 23673 + }, + { + "epoch": 0.69, + "grad_norm": 1.5687485859533692, + "learning_rate": 2.3612647520455193e-06, + "loss": 0.2939, + "step": 23674 + }, + { + "epoch": 0.69, + "grad_norm": 1.9515064578383587, + "learning_rate": 2.3608657932664757e-06, + "loss": 0.2982, + "step": 23675 + }, + { + "epoch": 0.69, + "grad_norm": 1.2366819968542275, + "learning_rate": 2.3604668577778207e-06, + "loss": 0.2726, + "step": 23676 + }, + { + "epoch": 0.69, + "grad_norm": 1.3677472486442133, + "learning_rate": 2.3600679455830748e-06, + "loss": 0.3463, + "step": 23677 + }, + { + "epoch": 0.69, + "grad_norm": 1.346819261399186, + "learning_rate": 2.3596690566857587e-06, + "loss": 0.2674, + "step": 23678 + }, + { + "epoch": 0.69, + "grad_norm": 1.3593787173217802, + "learning_rate": 2.359270191089393e-06, + "loss": 0.2999, + "step": 23679 + }, + { + "epoch": 0.69, + "grad_norm": 9.083516006197422, + "learning_rate": 2.358871348797496e-06, + "loss": 0.2655, + "step": 23680 + }, + { + "epoch": 0.69, + "grad_norm": 1.3028623325621327, + "learning_rate": 2.3584725298135897e-06, + "loss": 0.2817, + "step": 23681 + }, + { + "epoch": 0.69, + "grad_norm": 1.2796967314605114, + "learning_rate": 2.358073734141194e-06, + "loss": 0.3096, + "step": 23682 + }, + { + "epoch": 0.69, + "grad_norm": 1.3856163267029362, + "learning_rate": 2.3576749617838252e-06, + "loss": 0.2744, + "step": 23683 + }, + { + "epoch": 0.69, + "grad_norm": 1.5149418059826472, + "learning_rate": 2.357276212745004e-06, + "loss": 0.2577, + "step": 23684 + }, + { + "epoch": 0.69, + "grad_norm": 1.187108539334928, + "learning_rate": 2.3568774870282494e-06, + "loss": 0.2782, + "step": 23685 + }, + { + "epoch": 0.69, + "grad_norm": 1.2757716716972176, + "learning_rate": 2.3564787846370805e-06, + "loss": 0.2743, + "step": 23686 + }, + { + "epoch": 0.69, + "grad_norm": 1.3753637722182697, + "learning_rate": 2.356080105575016e-06, + "loss": 0.2825, + "step": 23687 + }, + { + "epoch": 0.69, + "grad_norm": 1.4459675802570882, + "learning_rate": 2.355681449845573e-06, + "loss": 0.2788, + "step": 23688 + }, + { + "epoch": 0.69, + "grad_norm": 1.335258071969475, + "learning_rate": 2.355282817452271e-06, + "loss": 0.3001, + "step": 23689 + }, + { + "epoch": 0.69, + "grad_norm": 1.3996766824638327, + "learning_rate": 2.3548842083986277e-06, + "loss": 0.272, + "step": 23690 + }, + { + "epoch": 0.69, + "grad_norm": 1.3130087535083739, + "learning_rate": 2.354485622688163e-06, + "loss": 0.2738, + "step": 23691 + }, + { + "epoch": 0.69, + "grad_norm": 0.9695233568960164, + "learning_rate": 2.354087060324389e-06, + "loss": 0.6127, + "step": 23692 + }, + { + "epoch": 0.69, + "grad_norm": 1.805753681623754, + "learning_rate": 2.353688521310827e-06, + "loss": 0.2732, + "step": 23693 + }, + { + "epoch": 0.69, + "grad_norm": 2.2134311003346077, + "learning_rate": 2.3532900056509927e-06, + "loss": 0.2862, + "step": 23694 + }, + { + "epoch": 0.69, + "grad_norm": 1.2302775703414075, + "learning_rate": 2.3528915133484033e-06, + "loss": 0.2611, + "step": 23695 + }, + { + "epoch": 0.69, + "grad_norm": 1.3691284502992056, + "learning_rate": 2.352493044406576e-06, + "loss": 0.2849, + "step": 23696 + }, + { + "epoch": 0.69, + "grad_norm": 1.4712739411375964, + "learning_rate": 2.3520945988290267e-06, + "loss": 0.2683, + "step": 23697 + }, + { + "epoch": 0.69, + "grad_norm": 1.3784143267810218, + "learning_rate": 2.351696176619272e-06, + "loss": 0.2865, + "step": 23698 + }, + { + "epoch": 0.69, + "grad_norm": 1.6239885553546827, + "learning_rate": 2.35129777778083e-06, + "loss": 0.2728, + "step": 23699 + }, + { + "epoch": 0.69, + "grad_norm": 1.2522467716937165, + "learning_rate": 2.350899402317212e-06, + "loss": 0.2696, + "step": 23700 + }, + { + "epoch": 0.69, + "grad_norm": 1.5149598899283185, + "learning_rate": 2.3505010502319388e-06, + "loss": 0.2873, + "step": 23701 + }, + { + "epoch": 0.69, + "grad_norm": 1.341672016873618, + "learning_rate": 2.3501027215285212e-06, + "loss": 0.2858, + "step": 23702 + }, + { + "epoch": 0.69, + "grad_norm": 1.390247912090421, + "learning_rate": 2.349704416210476e-06, + "loss": 0.2984, + "step": 23703 + }, + { + "epoch": 0.69, + "grad_norm": 1.1947560999511793, + "learning_rate": 2.349306134281319e-06, + "loss": 0.2745, + "step": 23704 + }, + { + "epoch": 0.69, + "grad_norm": 1.381995569025968, + "learning_rate": 2.3489078757445648e-06, + "loss": 0.2741, + "step": 23705 + }, + { + "epoch": 0.69, + "grad_norm": 1.4697782240466057, + "learning_rate": 2.3485096406037277e-06, + "loss": 0.263, + "step": 23706 + }, + { + "epoch": 0.69, + "grad_norm": 1.4061221175484115, + "learning_rate": 2.348111428862323e-06, + "loss": 0.2932, + "step": 23707 + }, + { + "epoch": 0.69, + "grad_norm": 1.2042037100678933, + "learning_rate": 2.3477132405238638e-06, + "loss": 0.2571, + "step": 23708 + }, + { + "epoch": 0.69, + "grad_norm": 1.562134921431058, + "learning_rate": 2.3473150755918657e-06, + "loss": 0.3101, + "step": 23709 + }, + { + "epoch": 0.69, + "grad_norm": 1.715450669112574, + "learning_rate": 2.3469169340698423e-06, + "loss": 0.2919, + "step": 23710 + }, + { + "epoch": 0.69, + "grad_norm": 1.3166217299288343, + "learning_rate": 2.346518815961305e-06, + "loss": 0.289, + "step": 23711 + }, + { + "epoch": 0.69, + "grad_norm": 1.353285050046501, + "learning_rate": 2.3461207212697685e-06, + "loss": 0.2969, + "step": 23712 + }, + { + "epoch": 0.69, + "grad_norm": 1.3719353964066143, + "learning_rate": 2.3457226499987456e-06, + "loss": 0.2906, + "step": 23713 + }, + { + "epoch": 0.69, + "grad_norm": 3.005213953526837, + "learning_rate": 2.3453246021517505e-06, + "loss": 0.2688, + "step": 23714 + }, + { + "epoch": 0.69, + "grad_norm": 1.7431082946228154, + "learning_rate": 2.344926577732295e-06, + "loss": 0.2592, + "step": 23715 + }, + { + "epoch": 0.69, + "grad_norm": 1.3566133341069775, + "learning_rate": 2.3445285767438915e-06, + "loss": 0.2739, + "step": 23716 + }, + { + "epoch": 0.69, + "grad_norm": 1.2687667156196951, + "learning_rate": 2.3441305991900533e-06, + "loss": 0.301, + "step": 23717 + }, + { + "epoch": 0.69, + "grad_norm": 1.4315208560975436, + "learning_rate": 2.3437326450742935e-06, + "loss": 0.2891, + "step": 23718 + }, + { + "epoch": 0.69, + "grad_norm": 1.5173044383709722, + "learning_rate": 2.343334714400121e-06, + "loss": 0.2799, + "step": 23719 + }, + { + "epoch": 0.69, + "grad_norm": 1.3884879220296333, + "learning_rate": 2.342936807171049e-06, + "loss": 0.3042, + "step": 23720 + }, + { + "epoch": 0.69, + "grad_norm": 1.6337640150990633, + "learning_rate": 2.342538923390589e-06, + "loss": 0.2795, + "step": 23721 + }, + { + "epoch": 0.69, + "grad_norm": 1.2268533406937674, + "learning_rate": 2.3421410630622527e-06, + "loss": 0.2797, + "step": 23722 + }, + { + "epoch": 0.69, + "grad_norm": 3.2573194960982166, + "learning_rate": 2.3417432261895506e-06, + "loss": 0.2855, + "step": 23723 + }, + { + "epoch": 0.69, + "grad_norm": 1.329106968150347, + "learning_rate": 2.3413454127759947e-06, + "loss": 0.2827, + "step": 23724 + }, + { + "epoch": 0.69, + "grad_norm": 1.2607851152936498, + "learning_rate": 2.3409476228250944e-06, + "loss": 0.2743, + "step": 23725 + }, + { + "epoch": 0.69, + "grad_norm": 1.389523568740818, + "learning_rate": 2.340549856340361e-06, + "loss": 0.3079, + "step": 23726 + }, + { + "epoch": 0.69, + "grad_norm": 1.5084647994506697, + "learning_rate": 2.3401521133253067e-06, + "loss": 0.2907, + "step": 23727 + }, + { + "epoch": 0.69, + "grad_norm": 1.6001816953882466, + "learning_rate": 2.339754393783438e-06, + "loss": 0.2775, + "step": 23728 + }, + { + "epoch": 0.69, + "grad_norm": 1.2659758410345505, + "learning_rate": 2.3393566977182674e-06, + "loss": 0.2646, + "step": 23729 + }, + { + "epoch": 0.69, + "grad_norm": 1.2554117732961954, + "learning_rate": 2.338959025133302e-06, + "loss": 0.285, + "step": 23730 + }, + { + "epoch": 0.69, + "grad_norm": 1.2911104220635718, + "learning_rate": 2.3385613760320526e-06, + "loss": 0.278, + "step": 23731 + }, + { + "epoch": 0.69, + "grad_norm": 1.2904323276929166, + "learning_rate": 2.338163750418029e-06, + "loss": 0.2655, + "step": 23732 + }, + { + "epoch": 0.69, + "grad_norm": 0.9451149204887603, + "learning_rate": 2.337766148294739e-06, + "loss": 0.5378, + "step": 23733 + }, + { + "epoch": 0.69, + "grad_norm": 1.3661485984873276, + "learning_rate": 2.337368569665693e-06, + "loss": 0.2852, + "step": 23734 + }, + { + "epoch": 0.69, + "grad_norm": 1.371104020054278, + "learning_rate": 2.336971014534398e-06, + "loss": 0.2849, + "step": 23735 + }, + { + "epoch": 0.69, + "grad_norm": 1.2827550641407197, + "learning_rate": 2.336573482904364e-06, + "loss": 0.3086, + "step": 23736 + }, + { + "epoch": 0.69, + "grad_norm": 1.5927013842149123, + "learning_rate": 2.3361759747791005e-06, + "loss": 0.3204, + "step": 23737 + }, + { + "epoch": 0.69, + "grad_norm": 1.5710644405505068, + "learning_rate": 2.3357784901621118e-06, + "loss": 0.2678, + "step": 23738 + }, + { + "epoch": 0.69, + "grad_norm": 1.2722490984024135, + "learning_rate": 2.335381029056907e-06, + "loss": 0.3134, + "step": 23739 + }, + { + "epoch": 0.69, + "grad_norm": 1.7650201637377783, + "learning_rate": 2.3349835914669942e-06, + "loss": 0.2868, + "step": 23740 + }, + { + "epoch": 0.69, + "grad_norm": 1.5233707886842978, + "learning_rate": 2.334586177395881e-06, + "loss": 0.2873, + "step": 23741 + }, + { + "epoch": 0.69, + "grad_norm": 1.3155841300457518, + "learning_rate": 2.3341887868470747e-06, + "loss": 0.2848, + "step": 23742 + }, + { + "epoch": 0.69, + "grad_norm": 1.4478656323004342, + "learning_rate": 2.3337914198240812e-06, + "loss": 0.291, + "step": 23743 + }, + { + "epoch": 0.69, + "grad_norm": 1.881976105141193, + "learning_rate": 2.3333940763304086e-06, + "loss": 0.2815, + "step": 23744 + }, + { + "epoch": 0.69, + "grad_norm": 1.5779954444518733, + "learning_rate": 2.3329967563695626e-06, + "loss": 0.2541, + "step": 23745 + }, + { + "epoch": 0.69, + "grad_norm": 1.2891783954752825, + "learning_rate": 2.3325994599450518e-06, + "loss": 0.2854, + "step": 23746 + }, + { + "epoch": 0.69, + "grad_norm": 1.4899649999855646, + "learning_rate": 2.3322021870603785e-06, + "loss": 0.3072, + "step": 23747 + }, + { + "epoch": 0.69, + "grad_norm": 0.9282099299816733, + "learning_rate": 2.3318049377190504e-06, + "loss": 0.5986, + "step": 23748 + }, + { + "epoch": 0.69, + "grad_norm": 1.2130239967517082, + "learning_rate": 2.3314077119245733e-06, + "loss": 0.278, + "step": 23749 + }, + { + "epoch": 0.69, + "grad_norm": 1.2953180531400015, + "learning_rate": 2.3310105096804526e-06, + "loss": 0.2677, + "step": 23750 + }, + { + "epoch": 0.69, + "grad_norm": 2.0756062453909823, + "learning_rate": 2.330613330990194e-06, + "loss": 0.2648, + "step": 23751 + }, + { + "epoch": 0.69, + "grad_norm": 1.525940180833704, + "learning_rate": 2.330216175857302e-06, + "loss": 0.2746, + "step": 23752 + }, + { + "epoch": 0.69, + "grad_norm": 1.2328540052385761, + "learning_rate": 2.3298190442852826e-06, + "loss": 0.2544, + "step": 23753 + }, + { + "epoch": 0.69, + "grad_norm": 1.7630047555049282, + "learning_rate": 2.329421936277641e-06, + "loss": 0.324, + "step": 23754 + }, + { + "epoch": 0.69, + "grad_norm": 1.2801007881451714, + "learning_rate": 2.3290248518378787e-06, + "loss": 0.283, + "step": 23755 + }, + { + "epoch": 0.69, + "grad_norm": 1.3069086938200085, + "learning_rate": 2.3286277909695036e-06, + "loss": 0.28, + "step": 23756 + }, + { + "epoch": 0.69, + "grad_norm": 1.3405156998553918, + "learning_rate": 2.3282307536760162e-06, + "loss": 0.2938, + "step": 23757 + }, + { + "epoch": 0.69, + "grad_norm": 1.2877083316094458, + "learning_rate": 2.3278337399609217e-06, + "loss": 0.2698, + "step": 23758 + }, + { + "epoch": 0.69, + "grad_norm": 1.3553564965200378, + "learning_rate": 2.3274367498277246e-06, + "loss": 0.2643, + "step": 23759 + }, + { + "epoch": 0.69, + "grad_norm": 1.320276008430493, + "learning_rate": 2.3270397832799274e-06, + "loss": 0.2868, + "step": 23760 + }, + { + "epoch": 0.69, + "grad_norm": 1.3145870491839076, + "learning_rate": 2.326642840321034e-06, + "loss": 0.2773, + "step": 23761 + }, + { + "epoch": 0.69, + "grad_norm": 1.4270709778012196, + "learning_rate": 2.326245920954547e-06, + "loss": 0.2829, + "step": 23762 + }, + { + "epoch": 0.69, + "grad_norm": 1.2194095071160502, + "learning_rate": 2.3258490251839693e-06, + "loss": 0.2758, + "step": 23763 + }, + { + "epoch": 0.69, + "grad_norm": 1.405361904736191, + "learning_rate": 2.3254521530128037e-06, + "loss": 0.2952, + "step": 23764 + }, + { + "epoch": 0.69, + "grad_norm": 1.289124079000003, + "learning_rate": 2.3250553044445547e-06, + "loss": 0.2654, + "step": 23765 + }, + { + "epoch": 0.69, + "grad_norm": 1.6729902816005016, + "learning_rate": 2.3246584794827205e-06, + "loss": 0.2811, + "step": 23766 + }, + { + "epoch": 0.69, + "grad_norm": 1.390915177832962, + "learning_rate": 2.3242616781308044e-06, + "loss": 0.2589, + "step": 23767 + }, + { + "epoch": 0.69, + "grad_norm": 0.9679156671845749, + "learning_rate": 2.3238649003923084e-06, + "loss": 0.6228, + "step": 23768 + }, + { + "epoch": 0.69, + "grad_norm": 1.2795878768730446, + "learning_rate": 2.323468146270735e-06, + "loss": 0.2632, + "step": 23769 + }, + { + "epoch": 0.69, + "grad_norm": 1.2473358622436521, + "learning_rate": 2.323071415769585e-06, + "loss": 0.2565, + "step": 23770 + }, + { + "epoch": 0.69, + "grad_norm": 1.2981829594007779, + "learning_rate": 2.322674708892359e-06, + "loss": 0.2706, + "step": 23771 + }, + { + "epoch": 0.69, + "grad_norm": 1.2228282722867148, + "learning_rate": 2.3222780256425588e-06, + "loss": 0.2806, + "step": 23772 + }, + { + "epoch": 0.69, + "grad_norm": 1.3277556613072161, + "learning_rate": 2.321881366023686e-06, + "loss": 0.2919, + "step": 23773 + }, + { + "epoch": 0.69, + "grad_norm": 1.3344061926598694, + "learning_rate": 2.3214847300392384e-06, + "loss": 0.2894, + "step": 23774 + }, + { + "epoch": 0.69, + "grad_norm": 1.8143804373767745, + "learning_rate": 2.321088117692718e-06, + "loss": 0.2894, + "step": 23775 + }, + { + "epoch": 0.69, + "grad_norm": 1.3501080179796165, + "learning_rate": 2.320691528987624e-06, + "loss": 0.2807, + "step": 23776 + }, + { + "epoch": 0.69, + "grad_norm": 1.350946839755914, + "learning_rate": 2.3202949639274575e-06, + "loss": 0.2818, + "step": 23777 + }, + { + "epoch": 0.69, + "grad_norm": 1.3034751180729782, + "learning_rate": 2.3198984225157177e-06, + "loss": 0.2673, + "step": 23778 + }, + { + "epoch": 0.69, + "grad_norm": 1.224221583695434, + "learning_rate": 2.319501904755904e-06, + "loss": 0.2791, + "step": 23779 + }, + { + "epoch": 0.69, + "grad_norm": 1.5251459726050043, + "learning_rate": 2.3191054106515156e-06, + "loss": 0.2827, + "step": 23780 + }, + { + "epoch": 0.69, + "grad_norm": 1.3349836181670045, + "learning_rate": 2.318708940206052e-06, + "loss": 0.2995, + "step": 23781 + }, + { + "epoch": 0.69, + "grad_norm": 1.303586835744742, + "learning_rate": 2.3183124934230136e-06, + "loss": 0.2819, + "step": 23782 + }, + { + "epoch": 0.69, + "grad_norm": 1.29391739011039, + "learning_rate": 2.3179160703058944e-06, + "loss": 0.2688, + "step": 23783 + }, + { + "epoch": 0.69, + "grad_norm": 1.2789602891903893, + "learning_rate": 2.3175196708581984e-06, + "loss": 0.2794, + "step": 23784 + }, + { + "epoch": 0.69, + "grad_norm": 1.9109688362193353, + "learning_rate": 2.317123295083419e-06, + "loss": 0.276, + "step": 23785 + }, + { + "epoch": 0.69, + "grad_norm": 0.9868972832464667, + "learning_rate": 2.3167269429850564e-06, + "loss": 0.5934, + "step": 23786 + }, + { + "epoch": 0.69, + "grad_norm": 1.2650718649388668, + "learning_rate": 2.3163306145666083e-06, + "loss": 0.276, + "step": 23787 + }, + { + "epoch": 0.69, + "grad_norm": 1.4542161904577249, + "learning_rate": 2.3159343098315716e-06, + "loss": 0.267, + "step": 23788 + }, + { + "epoch": 0.69, + "grad_norm": 1.5477958734885804, + "learning_rate": 2.315538028783445e-06, + "loss": 0.3302, + "step": 23789 + }, + { + "epoch": 0.69, + "grad_norm": 1.2697360508736193, + "learning_rate": 2.315141771425725e-06, + "loss": 0.309, + "step": 23790 + }, + { + "epoch": 0.69, + "grad_norm": 1.3201277806632838, + "learning_rate": 2.3147455377619086e-06, + "loss": 0.2797, + "step": 23791 + }, + { + "epoch": 0.69, + "grad_norm": 1.5053468997433999, + "learning_rate": 2.3143493277954947e-06, + "loss": 0.2847, + "step": 23792 + }, + { + "epoch": 0.69, + "grad_norm": 1.3369728454038137, + "learning_rate": 2.3139531415299758e-06, + "loss": 0.2879, + "step": 23793 + }, + { + "epoch": 0.69, + "grad_norm": 1.280408374053184, + "learning_rate": 2.31355697896885e-06, + "loss": 0.2914, + "step": 23794 + }, + { + "epoch": 0.69, + "grad_norm": 1.395986045144903, + "learning_rate": 2.313160840115614e-06, + "loss": 0.2784, + "step": 23795 + }, + { + "epoch": 0.69, + "grad_norm": 1.2620332637859382, + "learning_rate": 2.3127647249737635e-06, + "loss": 0.2756, + "step": 23796 + }, + { + "epoch": 0.69, + "grad_norm": 1.2922753762759605, + "learning_rate": 2.3123686335467938e-06, + "loss": 0.2676, + "step": 23797 + }, + { + "epoch": 0.69, + "grad_norm": 0.991127187824917, + "learning_rate": 2.3119725658382005e-06, + "loss": 0.5574, + "step": 23798 + }, + { + "epoch": 0.69, + "grad_norm": 1.920675347482165, + "learning_rate": 2.3115765218514797e-06, + "loss": 0.2674, + "step": 23799 + }, + { + "epoch": 0.69, + "grad_norm": 1.2553488166591782, + "learning_rate": 2.3111805015901256e-06, + "loss": 0.2671, + "step": 23800 + }, + { + "epoch": 0.69, + "grad_norm": 2.0679817176114126, + "learning_rate": 2.310784505057635e-06, + "loss": 0.3042, + "step": 23801 + }, + { + "epoch": 0.69, + "grad_norm": 1.3729091713251727, + "learning_rate": 2.310388532257499e-06, + "loss": 0.2876, + "step": 23802 + }, + { + "epoch": 0.69, + "grad_norm": 1.2177406523204521, + "learning_rate": 2.3099925831932148e-06, + "loss": 0.2625, + "step": 23803 + }, + { + "epoch": 0.69, + "grad_norm": 1.322683160746149, + "learning_rate": 2.3095966578682756e-06, + "loss": 0.3132, + "step": 23804 + }, + { + "epoch": 0.69, + "grad_norm": 1.345594885581546, + "learning_rate": 2.3092007562861756e-06, + "loss": 0.2695, + "step": 23805 + }, + { + "epoch": 0.69, + "grad_norm": 1.5872817013729483, + "learning_rate": 2.3088048784504093e-06, + "loss": 0.2887, + "step": 23806 + }, + { + "epoch": 0.69, + "grad_norm": 1.2019062636828488, + "learning_rate": 2.3084090243644693e-06, + "loss": 0.2756, + "step": 23807 + }, + { + "epoch": 0.69, + "grad_norm": 1.4550992493287835, + "learning_rate": 2.308013194031849e-06, + "loss": 0.2943, + "step": 23808 + }, + { + "epoch": 0.69, + "grad_norm": 1.316836437703343, + "learning_rate": 2.307617387456045e-06, + "loss": 0.2757, + "step": 23809 + }, + { + "epoch": 0.69, + "grad_norm": 1.2744574840151734, + "learning_rate": 2.307221604640545e-06, + "loss": 0.2943, + "step": 23810 + }, + { + "epoch": 0.69, + "grad_norm": 1.6557528933848895, + "learning_rate": 2.3068258455888443e-06, + "loss": 0.3282, + "step": 23811 + }, + { + "epoch": 0.69, + "grad_norm": 1.3099727544582396, + "learning_rate": 2.306430110304435e-06, + "loss": 0.2851, + "step": 23812 + }, + { + "epoch": 0.69, + "grad_norm": 1.2801425123424812, + "learning_rate": 2.3060343987908125e-06, + "loss": 0.2898, + "step": 23813 + }, + { + "epoch": 0.69, + "grad_norm": 1.329212232153532, + "learning_rate": 2.305638711051464e-06, + "loss": 0.2831, + "step": 23814 + }, + { + "epoch": 0.69, + "grad_norm": 1.3394981500777547, + "learning_rate": 2.305243047089884e-06, + "loss": 0.2745, + "step": 23815 + }, + { + "epoch": 0.69, + "grad_norm": 1.387794866709839, + "learning_rate": 2.304847406909564e-06, + "loss": 0.3048, + "step": 23816 + }, + { + "epoch": 0.69, + "grad_norm": 1.2755788582682634, + "learning_rate": 2.304451790513995e-06, + "loss": 0.2788, + "step": 23817 + }, + { + "epoch": 0.69, + "grad_norm": 1.5984045311605812, + "learning_rate": 2.3040561979066695e-06, + "loss": 0.2899, + "step": 23818 + }, + { + "epoch": 0.69, + "grad_norm": 1.3481197657794113, + "learning_rate": 2.303660629091078e-06, + "loss": 0.2902, + "step": 23819 + }, + { + "epoch": 0.69, + "grad_norm": 2.50778650462225, + "learning_rate": 2.303265084070713e-06, + "loss": 0.2696, + "step": 23820 + }, + { + "epoch": 0.69, + "grad_norm": 0.9847543188682492, + "learning_rate": 2.302869562849061e-06, + "loss": 0.5911, + "step": 23821 + }, + { + "epoch": 0.69, + "grad_norm": 2.13175494497467, + "learning_rate": 2.3024740654296158e-06, + "loss": 0.3035, + "step": 23822 + }, + { + "epoch": 0.69, + "grad_norm": 1.3472645435366022, + "learning_rate": 2.3020785918158667e-06, + "loss": 0.2741, + "step": 23823 + }, + { + "epoch": 0.69, + "grad_norm": 1.1969472592046435, + "learning_rate": 2.301683142011304e-06, + "loss": 0.2768, + "step": 23824 + }, + { + "epoch": 0.69, + "grad_norm": 1.4110331131479135, + "learning_rate": 2.3012877160194176e-06, + "loss": 0.2919, + "step": 23825 + }, + { + "epoch": 0.69, + "grad_norm": 1.4167906488065785, + "learning_rate": 2.3008923138436967e-06, + "loss": 0.2954, + "step": 23826 + }, + { + "epoch": 0.69, + "grad_norm": 1.2711252029276086, + "learning_rate": 2.3004969354876316e-06, + "loss": 0.2843, + "step": 23827 + }, + { + "epoch": 0.69, + "grad_norm": 1.2389041758691532, + "learning_rate": 2.3001015809547125e-06, + "loss": 0.2809, + "step": 23828 + }, + { + "epoch": 0.69, + "grad_norm": 1.4837091308086434, + "learning_rate": 2.2997062502484252e-06, + "loss": 0.29, + "step": 23829 + }, + { + "epoch": 0.69, + "grad_norm": 1.2597751616739186, + "learning_rate": 2.2993109433722597e-06, + "loss": 0.2731, + "step": 23830 + }, + { + "epoch": 0.69, + "grad_norm": 0.9163566022461507, + "learning_rate": 2.2989156603297056e-06, + "loss": 0.5704, + "step": 23831 + }, + { + "epoch": 0.69, + "grad_norm": 1.3413341875186238, + "learning_rate": 2.2985204011242507e-06, + "loss": 0.2867, + "step": 23832 + }, + { + "epoch": 0.69, + "grad_norm": 1.3099678003906423, + "learning_rate": 2.2981251657593834e-06, + "loss": 0.2561, + "step": 23833 + }, + { + "epoch": 0.69, + "grad_norm": 1.3135681155696688, + "learning_rate": 2.297729954238591e-06, + "loss": 0.2788, + "step": 23834 + }, + { + "epoch": 0.69, + "grad_norm": 1.3435664958289046, + "learning_rate": 2.2973347665653624e-06, + "loss": 0.2766, + "step": 23835 + }, + { + "epoch": 0.69, + "grad_norm": 1.3601742427720445, + "learning_rate": 2.296939602743184e-06, + "loss": 0.2842, + "step": 23836 + }, + { + "epoch": 0.69, + "grad_norm": 1.3830645045395276, + "learning_rate": 2.296544462775545e-06, + "loss": 0.2863, + "step": 23837 + }, + { + "epoch": 0.69, + "grad_norm": 2.011241451855522, + "learning_rate": 2.29614934666593e-06, + "loss": 0.2692, + "step": 23838 + }, + { + "epoch": 0.69, + "grad_norm": 1.6259451909956792, + "learning_rate": 2.295754254417827e-06, + "loss": 0.2843, + "step": 23839 + }, + { + "epoch": 0.69, + "grad_norm": 1.4264781771606176, + "learning_rate": 2.2953591860347225e-06, + "loss": 0.2973, + "step": 23840 + }, + { + "epoch": 0.69, + "grad_norm": 1.202354778928345, + "learning_rate": 2.294964141520105e-06, + "loss": 0.2687, + "step": 23841 + }, + { + "epoch": 0.69, + "grad_norm": 1.5848070329520667, + "learning_rate": 2.294569120877457e-06, + "loss": 0.2869, + "step": 23842 + }, + { + "epoch": 0.69, + "grad_norm": 1.4230192990508008, + "learning_rate": 2.2941741241102665e-06, + "loss": 0.293, + "step": 23843 + }, + { + "epoch": 0.69, + "grad_norm": 1.4850769606038103, + "learning_rate": 2.29377915122202e-06, + "loss": 0.2973, + "step": 23844 + }, + { + "epoch": 0.69, + "grad_norm": 1.7478756843992245, + "learning_rate": 2.2933842022162018e-06, + "loss": 0.2956, + "step": 23845 + }, + { + "epoch": 0.69, + "grad_norm": 1.3613609245801555, + "learning_rate": 2.292989277096298e-06, + "loss": 0.2964, + "step": 23846 + }, + { + "epoch": 0.69, + "grad_norm": 1.1979625033126984, + "learning_rate": 2.292594375865796e-06, + "loss": 0.2746, + "step": 23847 + }, + { + "epoch": 0.69, + "grad_norm": 1.3944721315171515, + "learning_rate": 2.292199498528177e-06, + "loss": 0.281, + "step": 23848 + }, + { + "epoch": 0.69, + "grad_norm": 1.5323503387419415, + "learning_rate": 2.2918046450869276e-06, + "loss": 0.287, + "step": 23849 + }, + { + "epoch": 0.69, + "grad_norm": 1.3312793963160492, + "learning_rate": 2.2914098155455315e-06, + "loss": 0.2938, + "step": 23850 + }, + { + "epoch": 0.69, + "grad_norm": 1.4575734406230905, + "learning_rate": 2.291015009907474e-06, + "loss": 0.2885, + "step": 23851 + }, + { + "epoch": 0.69, + "grad_norm": 1.578996974145131, + "learning_rate": 2.2906202281762394e-06, + "loss": 0.2866, + "step": 23852 + }, + { + "epoch": 0.69, + "grad_norm": 1.1989747448718937, + "learning_rate": 2.2902254703553112e-06, + "loss": 0.2689, + "step": 23853 + }, + { + "epoch": 0.69, + "grad_norm": 1.2412111115403914, + "learning_rate": 2.289830736448174e-06, + "loss": 0.2626, + "step": 23854 + }, + { + "epoch": 0.69, + "grad_norm": 0.952086464471871, + "learning_rate": 2.2894360264583093e-06, + "loss": 0.525, + "step": 23855 + }, + { + "epoch": 0.69, + "grad_norm": 1.5639423835676376, + "learning_rate": 2.289041340389204e-06, + "loss": 0.2804, + "step": 23856 + }, + { + "epoch": 0.69, + "grad_norm": 1.2269868860034148, + "learning_rate": 2.2886466782443378e-06, + "loss": 0.2788, + "step": 23857 + }, + { + "epoch": 0.69, + "grad_norm": 1.3291749759360756, + "learning_rate": 2.2882520400271947e-06, + "loss": 0.3069, + "step": 23858 + }, + { + "epoch": 0.69, + "grad_norm": 1.2782399755962432, + "learning_rate": 2.2878574257412565e-06, + "loss": 0.2919, + "step": 23859 + }, + { + "epoch": 0.69, + "grad_norm": 1.1455543941021777, + "learning_rate": 2.2874628353900075e-06, + "loss": 0.2614, + "step": 23860 + }, + { + "epoch": 0.69, + "grad_norm": 1.535876334412794, + "learning_rate": 2.2870682689769286e-06, + "loss": 0.3001, + "step": 23861 + }, + { + "epoch": 0.69, + "grad_norm": 1.3323952548591165, + "learning_rate": 2.2866737265055023e-06, + "loss": 0.3014, + "step": 23862 + }, + { + "epoch": 0.69, + "grad_norm": 1.4952695628646464, + "learning_rate": 2.286279207979211e-06, + "loss": 0.2634, + "step": 23863 + }, + { + "epoch": 0.69, + "grad_norm": 1.3231453388427865, + "learning_rate": 2.2858847134015367e-06, + "loss": 0.2939, + "step": 23864 + }, + { + "epoch": 0.69, + "grad_norm": 1.3684339755519266, + "learning_rate": 2.2854902427759584e-06, + "loss": 0.2879, + "step": 23865 + }, + { + "epoch": 0.69, + "grad_norm": 1.2891644493886185, + "learning_rate": 2.2850957961059595e-06, + "loss": 0.264, + "step": 23866 + }, + { + "epoch": 0.69, + "grad_norm": 1.2412670041223164, + "learning_rate": 2.2847013733950194e-06, + "loss": 0.2843, + "step": 23867 + }, + { + "epoch": 0.69, + "grad_norm": 1.3050192668840992, + "learning_rate": 2.2843069746466207e-06, + "loss": 0.2802, + "step": 23868 + }, + { + "epoch": 0.69, + "grad_norm": 1.2340635343049338, + "learning_rate": 2.283912599864244e-06, + "loss": 0.2777, + "step": 23869 + }, + { + "epoch": 0.69, + "grad_norm": 1.1612021663452055, + "learning_rate": 2.2835182490513673e-06, + "loss": 0.2611, + "step": 23870 + }, + { + "epoch": 0.69, + "grad_norm": 1.302870670133385, + "learning_rate": 2.2831239222114727e-06, + "loss": 0.2819, + "step": 23871 + }, + { + "epoch": 0.69, + "grad_norm": 1.3487879626388928, + "learning_rate": 2.2827296193480392e-06, + "loss": 0.2867, + "step": 23872 + }, + { + "epoch": 0.69, + "grad_norm": 1.8204046042671784, + "learning_rate": 2.2823353404645464e-06, + "loss": 0.2688, + "step": 23873 + }, + { + "epoch": 0.69, + "grad_norm": 1.2714122121468254, + "learning_rate": 2.2819410855644767e-06, + "loss": 0.2675, + "step": 23874 + }, + { + "epoch": 0.69, + "grad_norm": 1.6440778338071893, + "learning_rate": 2.2815468546513052e-06, + "loss": 0.2854, + "step": 23875 + }, + { + "epoch": 0.69, + "grad_norm": 1.405340613975758, + "learning_rate": 2.2811526477285133e-06, + "loss": 0.3043, + "step": 23876 + }, + { + "epoch": 0.69, + "grad_norm": 1.3943417853677578, + "learning_rate": 2.280758464799579e-06, + "loss": 0.2748, + "step": 23877 + }, + { + "epoch": 0.69, + "grad_norm": 3.3950771307877505, + "learning_rate": 2.2803643058679816e-06, + "loss": 0.2616, + "step": 23878 + }, + { + "epoch": 0.69, + "grad_norm": 1.4299818105479052, + "learning_rate": 2.279970170937199e-06, + "loss": 0.275, + "step": 23879 + }, + { + "epoch": 0.69, + "grad_norm": 1.2573781512553008, + "learning_rate": 2.2795760600107105e-06, + "loss": 0.2803, + "step": 23880 + }, + { + "epoch": 0.69, + "grad_norm": 1.3347255212861802, + "learning_rate": 2.2791819730919927e-06, + "loss": 0.2829, + "step": 23881 + }, + { + "epoch": 0.69, + "grad_norm": 1.1881861308683832, + "learning_rate": 2.2787879101845245e-06, + "loss": 0.2555, + "step": 23882 + }, + { + "epoch": 0.69, + "grad_norm": 1.2494321917171272, + "learning_rate": 2.2783938712917846e-06, + "loss": 0.2806, + "step": 23883 + }, + { + "epoch": 0.69, + "grad_norm": 1.5674822492094809, + "learning_rate": 2.277999856417248e-06, + "loss": 0.2764, + "step": 23884 + }, + { + "epoch": 0.69, + "grad_norm": 0.9731565325944455, + "learning_rate": 2.2776058655643925e-06, + "loss": 0.5728, + "step": 23885 + }, + { + "epoch": 0.69, + "grad_norm": 1.6116080119158467, + "learning_rate": 2.277211898736695e-06, + "loss": 0.2886, + "step": 23886 + }, + { + "epoch": 0.69, + "grad_norm": 1.3665860057300414, + "learning_rate": 2.276817955937633e-06, + "loss": 0.2815, + "step": 23887 + }, + { + "epoch": 0.69, + "grad_norm": 1.2523077729106868, + "learning_rate": 2.2764240371706833e-06, + "loss": 0.2902, + "step": 23888 + }, + { + "epoch": 0.69, + "grad_norm": 1.4758546772875734, + "learning_rate": 2.2760301424393206e-06, + "loss": 0.2744, + "step": 23889 + }, + { + "epoch": 0.69, + "grad_norm": 1.3214364517921406, + "learning_rate": 2.2756362717470227e-06, + "loss": 0.2942, + "step": 23890 + }, + { + "epoch": 0.69, + "grad_norm": 1.426329078546846, + "learning_rate": 2.2752424250972672e-06, + "loss": 0.2752, + "step": 23891 + }, + { + "epoch": 0.69, + "grad_norm": 2.9989234060429864, + "learning_rate": 2.274848602493525e-06, + "loss": 0.2801, + "step": 23892 + }, + { + "epoch": 0.69, + "grad_norm": 1.5803587102236232, + "learning_rate": 2.2744548039392745e-06, + "loss": 0.2923, + "step": 23893 + }, + { + "epoch": 0.69, + "grad_norm": 1.1685486098286473, + "learning_rate": 2.27406102943799e-06, + "loss": 0.2731, + "step": 23894 + }, + { + "epoch": 0.69, + "grad_norm": 1.3475002904194175, + "learning_rate": 2.273667278993148e-06, + "loss": 0.2894, + "step": 23895 + }, + { + "epoch": 0.69, + "grad_norm": 1.248689132706461, + "learning_rate": 2.273273552608222e-06, + "loss": 0.2881, + "step": 23896 + }, + { + "epoch": 0.69, + "grad_norm": 1.200131423247263, + "learning_rate": 2.2728798502866887e-06, + "loss": 0.2985, + "step": 23897 + }, + { + "epoch": 0.69, + "grad_norm": 1.32026124914708, + "learning_rate": 2.2724861720320196e-06, + "loss": 0.2773, + "step": 23898 + }, + { + "epoch": 0.69, + "grad_norm": 1.3117054682499958, + "learning_rate": 2.2720925178476903e-06, + "loss": 0.2647, + "step": 23899 + }, + { + "epoch": 0.69, + "grad_norm": 1.4539790169767193, + "learning_rate": 2.271698887737175e-06, + "loss": 0.2562, + "step": 23900 + }, + { + "epoch": 0.69, + "grad_norm": 1.4241708770501595, + "learning_rate": 2.271305281703947e-06, + "loss": 0.2986, + "step": 23901 + }, + { + "epoch": 0.69, + "grad_norm": 1.4084849038638894, + "learning_rate": 2.270911699751482e-06, + "loss": 0.2965, + "step": 23902 + }, + { + "epoch": 0.69, + "grad_norm": 1.2767215724148848, + "learning_rate": 2.2705181418832493e-06, + "loss": 0.2861, + "step": 23903 + }, + { + "epoch": 0.69, + "grad_norm": 1.208834859360874, + "learning_rate": 2.270124608102725e-06, + "loss": 0.2803, + "step": 23904 + }, + { + "epoch": 0.69, + "grad_norm": 1.4317232422701347, + "learning_rate": 2.269731098413381e-06, + "loss": 0.2668, + "step": 23905 + }, + { + "epoch": 0.69, + "grad_norm": 0.9732901491099889, + "learning_rate": 2.26933761281869e-06, + "loss": 0.5904, + "step": 23906 + }, + { + "epoch": 0.69, + "grad_norm": 1.9525307196118773, + "learning_rate": 2.268944151322125e-06, + "loss": 0.285, + "step": 23907 + }, + { + "epoch": 0.69, + "grad_norm": 1.912229779138504, + "learning_rate": 2.2685507139271585e-06, + "loss": 0.2738, + "step": 23908 + }, + { + "epoch": 0.69, + "grad_norm": 1.2934244925131169, + "learning_rate": 2.268157300637262e-06, + "loss": 0.2716, + "step": 23909 + }, + { + "epoch": 0.69, + "grad_norm": 1.7134202742105358, + "learning_rate": 2.2677639114559097e-06, + "loss": 0.3044, + "step": 23910 + }, + { + "epoch": 0.69, + "grad_norm": 1.2189792569152116, + "learning_rate": 2.267370546386569e-06, + "loss": 0.2729, + "step": 23911 + }, + { + "epoch": 0.69, + "grad_norm": 1.4103566525849458, + "learning_rate": 2.266977205432714e-06, + "loss": 0.3, + "step": 23912 + }, + { + "epoch": 0.69, + "grad_norm": 1.3281855325242673, + "learning_rate": 2.266583888597815e-06, + "loss": 0.2876, + "step": 23913 + }, + { + "epoch": 0.69, + "grad_norm": 1.6367614444570449, + "learning_rate": 2.2661905958853437e-06, + "loss": 0.2916, + "step": 23914 + }, + { + "epoch": 0.69, + "grad_norm": 1.5893948818530321, + "learning_rate": 2.265797327298771e-06, + "loss": 0.3077, + "step": 23915 + }, + { + "epoch": 0.69, + "grad_norm": 1.4245495476409855, + "learning_rate": 2.2654040828415668e-06, + "loss": 0.2825, + "step": 23916 + }, + { + "epoch": 0.69, + "grad_norm": 3.556328498227469, + "learning_rate": 2.2650108625172023e-06, + "loss": 0.271, + "step": 23917 + }, + { + "epoch": 0.69, + "grad_norm": 1.253351875870539, + "learning_rate": 2.264617666329147e-06, + "loss": 0.302, + "step": 23918 + }, + { + "epoch": 0.69, + "grad_norm": 1.3033667466315875, + "learning_rate": 2.2642244942808727e-06, + "loss": 0.2754, + "step": 23919 + }, + { + "epoch": 0.69, + "grad_norm": 1.3721004489395356, + "learning_rate": 2.2638313463758465e-06, + "loss": 0.2628, + "step": 23920 + }, + { + "epoch": 0.69, + "grad_norm": 1.4981504912423937, + "learning_rate": 2.2634382226175384e-06, + "loss": 0.2914, + "step": 23921 + }, + { + "epoch": 0.69, + "grad_norm": 1.5426953778685562, + "learning_rate": 2.2630451230094187e-06, + "loss": 0.279, + "step": 23922 + }, + { + "epoch": 0.69, + "grad_norm": 1.192862054070626, + "learning_rate": 2.262652047554956e-06, + "loss": 0.2756, + "step": 23923 + }, + { + "epoch": 0.69, + "grad_norm": 1.3108804434467172, + "learning_rate": 2.262258996257619e-06, + "loss": 0.2794, + "step": 23924 + }, + { + "epoch": 0.69, + "grad_norm": 1.274047252644305, + "learning_rate": 2.261865969120879e-06, + "loss": 0.2893, + "step": 23925 + }, + { + "epoch": 0.69, + "grad_norm": 1.6190162849321634, + "learning_rate": 2.2614729661482005e-06, + "loss": 0.2989, + "step": 23926 + }, + { + "epoch": 0.69, + "grad_norm": 1.4368244925657927, + "learning_rate": 2.2610799873430528e-06, + "loss": 0.3225, + "step": 23927 + }, + { + "epoch": 0.69, + "grad_norm": 1.5466769688421924, + "learning_rate": 2.2606870327089054e-06, + "loss": 0.2985, + "step": 23928 + }, + { + "epoch": 0.69, + "grad_norm": 1.722140189668214, + "learning_rate": 2.2602941022492265e-06, + "loss": 0.2668, + "step": 23929 + }, + { + "epoch": 0.69, + "grad_norm": 1.4647931382297228, + "learning_rate": 2.259901195967481e-06, + "loss": 0.2835, + "step": 23930 + }, + { + "epoch": 0.69, + "grad_norm": 1.311465302834257, + "learning_rate": 2.2595083138671382e-06, + "loss": 0.3013, + "step": 23931 + }, + { + "epoch": 0.69, + "grad_norm": 1.4105117243915202, + "learning_rate": 2.2591154559516643e-06, + "loss": 0.2886, + "step": 23932 + }, + { + "epoch": 0.69, + "grad_norm": 2.266327439929794, + "learning_rate": 2.2587226222245274e-06, + "loss": 0.3126, + "step": 23933 + }, + { + "epoch": 0.69, + "grad_norm": 1.381104052087309, + "learning_rate": 2.258329812689193e-06, + "loss": 0.2866, + "step": 23934 + }, + { + "epoch": 0.69, + "grad_norm": 2.0329472601256477, + "learning_rate": 2.257937027349129e-06, + "loss": 0.3401, + "step": 23935 + }, + { + "epoch": 0.69, + "grad_norm": 1.2340489931756395, + "learning_rate": 2.257544266207801e-06, + "loss": 0.2947, + "step": 23936 + }, + { + "epoch": 0.69, + "grad_norm": 1.2804208761523892, + "learning_rate": 2.257151529268676e-06, + "loss": 0.2698, + "step": 23937 + }, + { + "epoch": 0.69, + "grad_norm": 0.9505776808259896, + "learning_rate": 2.25675881653522e-06, + "loss": 0.6621, + "step": 23938 + }, + { + "epoch": 0.69, + "grad_norm": 1.2601829101698465, + "learning_rate": 2.2563661280108967e-06, + "loss": 0.3153, + "step": 23939 + }, + { + "epoch": 0.69, + "grad_norm": 2.0191941806494715, + "learning_rate": 2.255973463699172e-06, + "loss": 0.2942, + "step": 23940 + }, + { + "epoch": 0.69, + "grad_norm": 1.3322775989360793, + "learning_rate": 2.2555808236035127e-06, + "loss": 0.2976, + "step": 23941 + }, + { + "epoch": 0.69, + "grad_norm": 1.316715325514289, + "learning_rate": 2.255188207727383e-06, + "loss": 0.2673, + "step": 23942 + }, + { + "epoch": 0.69, + "grad_norm": 1.8423844364196817, + "learning_rate": 2.2547956160742473e-06, + "loss": 0.28, + "step": 23943 + }, + { + "epoch": 0.69, + "grad_norm": 1.4804515422516618, + "learning_rate": 2.254403048647571e-06, + "loss": 0.2924, + "step": 23944 + }, + { + "epoch": 0.69, + "grad_norm": 1.4064659504623973, + "learning_rate": 2.2540105054508183e-06, + "loss": 0.262, + "step": 23945 + }, + { + "epoch": 0.69, + "grad_norm": 1.3243854196182998, + "learning_rate": 2.2536179864874553e-06, + "loss": 0.269, + "step": 23946 + }, + { + "epoch": 0.69, + "grad_norm": 1.3030332545115497, + "learning_rate": 2.253225491760942e-06, + "loss": 0.2833, + "step": 23947 + }, + { + "epoch": 0.69, + "grad_norm": 1.45940992691385, + "learning_rate": 2.2528330212747445e-06, + "loss": 0.2932, + "step": 23948 + }, + { + "epoch": 0.69, + "grad_norm": 1.614182656607098, + "learning_rate": 2.252440575032326e-06, + "loss": 0.2765, + "step": 23949 + }, + { + "epoch": 0.69, + "grad_norm": 1.3570727116101482, + "learning_rate": 2.2520481530371495e-06, + "loss": 0.2874, + "step": 23950 + }, + { + "epoch": 0.69, + "grad_norm": 1.2226929971931741, + "learning_rate": 2.2516557552926788e-06, + "loss": 0.2809, + "step": 23951 + }, + { + "epoch": 0.69, + "grad_norm": 1.2549451909538587, + "learning_rate": 2.2512633818023765e-06, + "loss": 0.2769, + "step": 23952 + }, + { + "epoch": 0.69, + "grad_norm": 1.4913681357901318, + "learning_rate": 2.2508710325697074e-06, + "loss": 0.3145, + "step": 23953 + }, + { + "epoch": 0.69, + "grad_norm": 1.2140086550842202, + "learning_rate": 2.25047870759813e-06, + "loss": 0.278, + "step": 23954 + }, + { + "epoch": 0.69, + "grad_norm": 1.3901094947761596, + "learning_rate": 2.250086406891108e-06, + "loss": 0.3005, + "step": 23955 + }, + { + "epoch": 0.69, + "grad_norm": 1.0092181666945708, + "learning_rate": 2.2496941304521046e-06, + "loss": 0.607, + "step": 23956 + }, + { + "epoch": 0.69, + "grad_norm": 1.2823258868850864, + "learning_rate": 2.2493018782845828e-06, + "loss": 0.2877, + "step": 23957 + }, + { + "epoch": 0.69, + "grad_norm": 1.4710194419979628, + "learning_rate": 2.2489096503920003e-06, + "loss": 0.2912, + "step": 23958 + }, + { + "epoch": 0.69, + "grad_norm": 5.838600358887637, + "learning_rate": 2.2485174467778207e-06, + "loss": 0.2879, + "step": 23959 + }, + { + "epoch": 0.69, + "grad_norm": 1.4529611046686803, + "learning_rate": 2.2481252674455054e-06, + "loss": 0.2896, + "step": 23960 + }, + { + "epoch": 0.69, + "grad_norm": 1.5499603418441332, + "learning_rate": 2.2477331123985154e-06, + "loss": 0.2895, + "step": 23961 + }, + { + "epoch": 0.7, + "grad_norm": 1.3986014909602038, + "learning_rate": 2.2473409816403103e-06, + "loss": 0.2757, + "step": 23962 + }, + { + "epoch": 0.7, + "grad_norm": 1.2458000084369845, + "learning_rate": 2.2469488751743524e-06, + "loss": 0.2789, + "step": 23963 + }, + { + "epoch": 0.7, + "grad_norm": 1.3274350091551537, + "learning_rate": 2.2465567930041006e-06, + "loss": 0.2727, + "step": 23964 + }, + { + "epoch": 0.7, + "grad_norm": 1.261636361069858, + "learning_rate": 2.246164735133018e-06, + "loss": 0.2931, + "step": 23965 + }, + { + "epoch": 0.7, + "grad_norm": 1.6314342069419454, + "learning_rate": 2.24577270156456e-06, + "loss": 0.3594, + "step": 23966 + }, + { + "epoch": 0.7, + "grad_norm": 1.357654744173747, + "learning_rate": 2.2453806923021887e-06, + "loss": 0.3038, + "step": 23967 + }, + { + "epoch": 0.7, + "grad_norm": 1.2396740882072927, + "learning_rate": 2.2449887073493633e-06, + "loss": 0.2962, + "step": 23968 + }, + { + "epoch": 0.7, + "grad_norm": 1.7180707720781179, + "learning_rate": 2.244596746709543e-06, + "loss": 0.291, + "step": 23969 + }, + { + "epoch": 0.7, + "grad_norm": 1.3978415945955431, + "learning_rate": 2.244204810386187e-06, + "loss": 0.2678, + "step": 23970 + }, + { + "epoch": 0.7, + "grad_norm": 1.3493229825152424, + "learning_rate": 2.243812898382754e-06, + "loss": 0.2897, + "step": 23971 + }, + { + "epoch": 0.7, + "grad_norm": 1.2932121822626346, + "learning_rate": 2.2434210107027025e-06, + "loss": 0.284, + "step": 23972 + }, + { + "epoch": 0.7, + "grad_norm": 1.4828604236543093, + "learning_rate": 2.243029147349492e-06, + "loss": 0.2668, + "step": 23973 + }, + { + "epoch": 0.7, + "grad_norm": 1.2195314618337332, + "learning_rate": 2.2426373083265813e-06, + "loss": 0.2841, + "step": 23974 + }, + { + "epoch": 0.7, + "grad_norm": 1.4697642564778248, + "learning_rate": 2.242245493637425e-06, + "loss": 0.2713, + "step": 23975 + }, + { + "epoch": 0.7, + "grad_norm": 1.4708782288249191, + "learning_rate": 2.2418537032854827e-06, + "loss": 0.3098, + "step": 23976 + }, + { + "epoch": 0.7, + "grad_norm": 1.3140901453837057, + "learning_rate": 2.2414619372742125e-06, + "loss": 0.2855, + "step": 23977 + }, + { + "epoch": 0.7, + "grad_norm": 2.6257516967002204, + "learning_rate": 2.2410701956070706e-06, + "loss": 0.278, + "step": 23978 + }, + { + "epoch": 0.7, + "grad_norm": 1.3177773566358189, + "learning_rate": 2.2406784782875157e-06, + "loss": 0.2765, + "step": 23979 + }, + { + "epoch": 0.7, + "grad_norm": 1.4611429018559565, + "learning_rate": 2.240286785319003e-06, + "loss": 0.2833, + "step": 23980 + }, + { + "epoch": 0.7, + "grad_norm": 1.4199851368235588, + "learning_rate": 2.2398951167049927e-06, + "loss": 0.2728, + "step": 23981 + }, + { + "epoch": 0.7, + "grad_norm": 1.3293355127929165, + "learning_rate": 2.239503472448936e-06, + "loss": 0.296, + "step": 23982 + }, + { + "epoch": 0.7, + "grad_norm": 1.6915800125193652, + "learning_rate": 2.239111852554292e-06, + "loss": 0.2801, + "step": 23983 + }, + { + "epoch": 0.7, + "grad_norm": 1.7005722816556252, + "learning_rate": 2.2387202570245186e-06, + "loss": 0.306, + "step": 23984 + }, + { + "epoch": 0.7, + "grad_norm": 1.6993702837105766, + "learning_rate": 2.2383286858630677e-06, + "loss": 0.3064, + "step": 23985 + }, + { + "epoch": 0.7, + "grad_norm": 1.389145633231868, + "learning_rate": 2.2379371390733966e-06, + "loss": 0.299, + "step": 23986 + }, + { + "epoch": 0.7, + "grad_norm": 1.5856757052729928, + "learning_rate": 2.2375456166589608e-06, + "loss": 0.2887, + "step": 23987 + }, + { + "epoch": 0.7, + "grad_norm": 1.3229810492881855, + "learning_rate": 2.237154118623216e-06, + "loss": 0.3236, + "step": 23988 + }, + { + "epoch": 0.7, + "grad_norm": 1.2698174386751777, + "learning_rate": 2.2367626449696168e-06, + "loss": 0.2769, + "step": 23989 + }, + { + "epoch": 0.7, + "grad_norm": 1.3676978869364729, + "learning_rate": 2.236371195701618e-06, + "loss": 0.2896, + "step": 23990 + }, + { + "epoch": 0.7, + "grad_norm": 1.4717926127128338, + "learning_rate": 2.235979770822673e-06, + "loss": 0.3041, + "step": 23991 + }, + { + "epoch": 0.7, + "grad_norm": 1.623348754154734, + "learning_rate": 2.235588370336238e-06, + "loss": 0.2777, + "step": 23992 + }, + { + "epoch": 0.7, + "grad_norm": 1.2322925237115117, + "learning_rate": 2.235196994245768e-06, + "loss": 0.2745, + "step": 23993 + }, + { + "epoch": 0.7, + "grad_norm": 1.3198919770364104, + "learning_rate": 2.234805642554714e-06, + "loss": 0.2891, + "step": 23994 + }, + { + "epoch": 0.7, + "grad_norm": 1.5332405785951189, + "learning_rate": 2.23441431526653e-06, + "loss": 0.2814, + "step": 23995 + }, + { + "epoch": 0.7, + "grad_norm": 1.497670399674921, + "learning_rate": 2.23402301238467e-06, + "loss": 0.2736, + "step": 23996 + }, + { + "epoch": 0.7, + "grad_norm": 1.3473826855420128, + "learning_rate": 2.2336317339125885e-06, + "loss": 0.3207, + "step": 23997 + }, + { + "epoch": 0.7, + "grad_norm": 1.3175898019823364, + "learning_rate": 2.2332404798537373e-06, + "loss": 0.2835, + "step": 23998 + }, + { + "epoch": 0.7, + "grad_norm": 1.2478849715300602, + "learning_rate": 2.2328492502115695e-06, + "loss": 0.2748, + "step": 23999 + }, + { + "epoch": 0.7, + "grad_norm": 1.2888379804968193, + "learning_rate": 2.232458044989538e-06, + "loss": 0.2971, + "step": 24000 + }, + { + "epoch": 0.7, + "grad_norm": 0.974866310298317, + "learning_rate": 2.232066864191097e-06, + "loss": 0.5779, + "step": 24001 + }, + { + "epoch": 0.7, + "grad_norm": 1.345300785409114, + "learning_rate": 2.2316757078196944e-06, + "loss": 0.29, + "step": 24002 + }, + { + "epoch": 0.7, + "grad_norm": 1.6213471531349957, + "learning_rate": 2.231284575878784e-06, + "loss": 0.2694, + "step": 24003 + }, + { + "epoch": 0.7, + "grad_norm": 1.587852930606406, + "learning_rate": 2.230893468371819e-06, + "loss": 0.2785, + "step": 24004 + }, + { + "epoch": 0.7, + "grad_norm": 1.31681019032499, + "learning_rate": 2.230502385302249e-06, + "loss": 0.2638, + "step": 24005 + }, + { + "epoch": 0.7, + "grad_norm": 1.3627258915377454, + "learning_rate": 2.2301113266735266e-06, + "loss": 0.282, + "step": 24006 + }, + { + "epoch": 0.7, + "grad_norm": 1.2820190970267824, + "learning_rate": 2.229720292489102e-06, + "loss": 0.28, + "step": 24007 + }, + { + "epoch": 0.7, + "grad_norm": 1.4066177560872388, + "learning_rate": 2.2293292827524267e-06, + "loss": 0.291, + "step": 24008 + }, + { + "epoch": 0.7, + "grad_norm": 1.7116137515720187, + "learning_rate": 2.228938297466953e-06, + "loss": 0.2969, + "step": 24009 + }, + { + "epoch": 0.7, + "grad_norm": 0.9460619484828909, + "learning_rate": 2.228547336636128e-06, + "loss": 0.5826, + "step": 24010 + }, + { + "epoch": 0.7, + "grad_norm": 1.2303923654906466, + "learning_rate": 2.228156400263403e-06, + "loss": 0.2937, + "step": 24011 + }, + { + "epoch": 0.7, + "grad_norm": 1.4006092363227474, + "learning_rate": 2.227765488352231e-06, + "loss": 0.2969, + "step": 24012 + }, + { + "epoch": 0.7, + "grad_norm": 1.4309676778177525, + "learning_rate": 2.2273746009060574e-06, + "loss": 0.2943, + "step": 24013 + }, + { + "epoch": 0.7, + "grad_norm": 1.187888211854668, + "learning_rate": 2.226983737928334e-06, + "loss": 0.2618, + "step": 24014 + }, + { + "epoch": 0.7, + "grad_norm": 1.8307023861442286, + "learning_rate": 2.226592899422509e-06, + "loss": 0.2797, + "step": 24015 + }, + { + "epoch": 0.7, + "grad_norm": 4.004085432527446, + "learning_rate": 2.226202085392033e-06, + "loss": 0.2662, + "step": 24016 + }, + { + "epoch": 0.7, + "grad_norm": 1.4392488701668473, + "learning_rate": 2.225811295840354e-06, + "loss": 0.2755, + "step": 24017 + }, + { + "epoch": 0.7, + "grad_norm": 2.5952014794299316, + "learning_rate": 2.2254205307709216e-06, + "loss": 0.3073, + "step": 24018 + }, + { + "epoch": 0.7, + "grad_norm": 1.310517654565145, + "learning_rate": 2.225029790187183e-06, + "loss": 0.2865, + "step": 24019 + }, + { + "epoch": 0.7, + "grad_norm": 1.5177576080496535, + "learning_rate": 2.22463907409259e-06, + "loss": 0.2649, + "step": 24020 + }, + { + "epoch": 0.7, + "grad_norm": 1.5650639281951808, + "learning_rate": 2.224248382490586e-06, + "loss": 0.2676, + "step": 24021 + }, + { + "epoch": 0.7, + "grad_norm": 1.5424832999917493, + "learning_rate": 2.2238577153846203e-06, + "loss": 0.2778, + "step": 24022 + }, + { + "epoch": 0.7, + "grad_norm": 1.3864841176529874, + "learning_rate": 2.223467072778141e-06, + "loss": 0.2922, + "step": 24023 + }, + { + "epoch": 0.7, + "grad_norm": 1.379277448918874, + "learning_rate": 2.223076454674596e-06, + "loss": 0.2806, + "step": 24024 + }, + { + "epoch": 0.7, + "grad_norm": 1.403949373476405, + "learning_rate": 2.222685861077432e-06, + "loss": 0.2931, + "step": 24025 + }, + { + "epoch": 0.7, + "grad_norm": 1.4218139252247606, + "learning_rate": 2.222295291990096e-06, + "loss": 0.2755, + "step": 24026 + }, + { + "epoch": 0.7, + "grad_norm": 1.3246723819587232, + "learning_rate": 2.221904747416035e-06, + "loss": 0.2942, + "step": 24027 + }, + { + "epoch": 0.7, + "grad_norm": 1.3192842344493991, + "learning_rate": 2.2215142273586955e-06, + "loss": 0.2902, + "step": 24028 + }, + { + "epoch": 0.7, + "grad_norm": 1.3618110117933038, + "learning_rate": 2.221123731821526e-06, + "loss": 0.2904, + "step": 24029 + }, + { + "epoch": 0.7, + "grad_norm": 1.448067973076273, + "learning_rate": 2.2207332608079675e-06, + "loss": 0.3262, + "step": 24030 + }, + { + "epoch": 0.7, + "grad_norm": 1.47158947768343, + "learning_rate": 2.2203428143214696e-06, + "loss": 0.2876, + "step": 24031 + }, + { + "epoch": 0.7, + "grad_norm": 1.5648122468383794, + "learning_rate": 2.2199523923654765e-06, + "loss": 0.3015, + "step": 24032 + }, + { + "epoch": 0.7, + "grad_norm": 1.4352334707559007, + "learning_rate": 2.219561994943435e-06, + "loss": 0.2994, + "step": 24033 + }, + { + "epoch": 0.7, + "grad_norm": 1.5463817764667578, + "learning_rate": 2.2191716220587896e-06, + "loss": 0.2616, + "step": 24034 + }, + { + "epoch": 0.7, + "grad_norm": 1.4238387836551898, + "learning_rate": 2.2187812737149856e-06, + "loss": 0.283, + "step": 24035 + }, + { + "epoch": 0.7, + "grad_norm": 1.4850307613171971, + "learning_rate": 2.218390949915467e-06, + "loss": 0.285, + "step": 24036 + }, + { + "epoch": 0.7, + "grad_norm": 1.4430786212913942, + "learning_rate": 2.218000650663682e-06, + "loss": 0.2992, + "step": 24037 + }, + { + "epoch": 0.7, + "grad_norm": 1.2545423193280476, + "learning_rate": 2.2176103759630695e-06, + "loss": 0.2582, + "step": 24038 + }, + { + "epoch": 0.7, + "grad_norm": 1.412690274347806, + "learning_rate": 2.2172201258170782e-06, + "loss": 0.3048, + "step": 24039 + }, + { + "epoch": 0.7, + "grad_norm": 1.6324791051590504, + "learning_rate": 2.216829900229149e-06, + "loss": 0.3022, + "step": 24040 + }, + { + "epoch": 0.7, + "grad_norm": 1.3842914811786855, + "learning_rate": 2.216439699202726e-06, + "loss": 0.2648, + "step": 24041 + }, + { + "epoch": 0.7, + "grad_norm": 1.411811838633396, + "learning_rate": 2.216049522741254e-06, + "loss": 0.2839, + "step": 24042 + }, + { + "epoch": 0.7, + "grad_norm": 1.3331282163327383, + "learning_rate": 2.2156593708481756e-06, + "loss": 0.3038, + "step": 24043 + }, + { + "epoch": 0.7, + "grad_norm": 1.3663857895890723, + "learning_rate": 2.2152692435269347e-06, + "loss": 0.2937, + "step": 24044 + }, + { + "epoch": 0.7, + "grad_norm": 2.4774385999901316, + "learning_rate": 2.2148791407809727e-06, + "loss": 0.2945, + "step": 24045 + }, + { + "epoch": 0.7, + "grad_norm": 2.217347730669824, + "learning_rate": 2.214489062613734e-06, + "loss": 0.2992, + "step": 24046 + }, + { + "epoch": 0.7, + "grad_norm": 1.2346474729487038, + "learning_rate": 2.2140990090286596e-06, + "loss": 0.2915, + "step": 24047 + }, + { + "epoch": 0.7, + "grad_norm": 1.3453636788241972, + "learning_rate": 2.2137089800291943e-06, + "loss": 0.2926, + "step": 24048 + }, + { + "epoch": 0.7, + "grad_norm": 1.2178043642691005, + "learning_rate": 2.2133189756187766e-06, + "loss": 0.2938, + "step": 24049 + }, + { + "epoch": 0.7, + "grad_norm": 1.3333146886460618, + "learning_rate": 2.21292899580085e-06, + "loss": 0.2753, + "step": 24050 + }, + { + "epoch": 0.7, + "grad_norm": 1.4197047720075462, + "learning_rate": 2.2125390405788557e-06, + "loss": 0.2785, + "step": 24051 + }, + { + "epoch": 0.7, + "grad_norm": 1.1722012461828453, + "learning_rate": 2.2121491099562355e-06, + "loss": 0.2523, + "step": 24052 + }, + { + "epoch": 0.7, + "grad_norm": 1.4131217481563239, + "learning_rate": 2.21175920393643e-06, + "loss": 0.2627, + "step": 24053 + }, + { + "epoch": 0.7, + "grad_norm": 1.2760875009051336, + "learning_rate": 2.2113693225228806e-06, + "loss": 0.3068, + "step": 24054 + }, + { + "epoch": 0.7, + "grad_norm": 1.644681184673278, + "learning_rate": 2.2109794657190277e-06, + "loss": 0.2782, + "step": 24055 + }, + { + "epoch": 0.7, + "grad_norm": 1.3818174952568913, + "learning_rate": 2.210589633528314e-06, + "loss": 0.2849, + "step": 24056 + }, + { + "epoch": 0.7, + "grad_norm": 1.8516889751035575, + "learning_rate": 2.2101998259541756e-06, + "loss": 0.2815, + "step": 24057 + }, + { + "epoch": 0.7, + "grad_norm": 1.4262938102238256, + "learning_rate": 2.209810043000055e-06, + "loss": 0.3064, + "step": 24058 + }, + { + "epoch": 0.7, + "grad_norm": 1.6973787588762916, + "learning_rate": 2.2094202846693918e-06, + "loss": 0.2939, + "step": 24059 + }, + { + "epoch": 0.7, + "grad_norm": 1.3109515426954204, + "learning_rate": 2.209030550965625e-06, + "loss": 0.276, + "step": 24060 + }, + { + "epoch": 0.7, + "grad_norm": 1.3400706131719493, + "learning_rate": 2.208640841892195e-06, + "loss": 0.2829, + "step": 24061 + }, + { + "epoch": 0.7, + "grad_norm": 2.066405671084596, + "learning_rate": 2.2082511574525408e-06, + "loss": 0.2894, + "step": 24062 + }, + { + "epoch": 0.7, + "grad_norm": 1.3465739467085953, + "learning_rate": 2.207861497650101e-06, + "loss": 0.2899, + "step": 24063 + }, + { + "epoch": 0.7, + "grad_norm": 1.2893367437501861, + "learning_rate": 2.207471862488314e-06, + "loss": 0.2942, + "step": 24064 + }, + { + "epoch": 0.7, + "grad_norm": 1.5678951940710324, + "learning_rate": 2.2070822519706207e-06, + "loss": 0.2772, + "step": 24065 + }, + { + "epoch": 0.7, + "grad_norm": 1.3337697445867134, + "learning_rate": 2.206692666100456e-06, + "loss": 0.2792, + "step": 24066 + }, + { + "epoch": 0.7, + "grad_norm": 1.5187382752297407, + "learning_rate": 2.206303104881261e-06, + "loss": 0.3034, + "step": 24067 + }, + { + "epoch": 0.7, + "grad_norm": 1.3134362134230966, + "learning_rate": 2.20591356831647e-06, + "loss": 0.2866, + "step": 24068 + }, + { + "epoch": 0.7, + "grad_norm": 1.0042294843597923, + "learning_rate": 2.205524056409524e-06, + "loss": 0.5755, + "step": 24069 + }, + { + "epoch": 0.7, + "grad_norm": 2.126533556238368, + "learning_rate": 2.2051345691638575e-06, + "loss": 0.2694, + "step": 24070 + }, + { + "epoch": 0.7, + "grad_norm": 1.2037579270725067, + "learning_rate": 2.2047451065829105e-06, + "loss": 0.2723, + "step": 24071 + }, + { + "epoch": 0.7, + "grad_norm": 1.3347783161743143, + "learning_rate": 2.204355668670119e-06, + "loss": 0.2953, + "step": 24072 + }, + { + "epoch": 0.7, + "grad_norm": 1.356256531785304, + "learning_rate": 2.2039662554289194e-06, + "loss": 0.2643, + "step": 24073 + }, + { + "epoch": 0.7, + "grad_norm": 1.3594304937322257, + "learning_rate": 2.203576866862749e-06, + "loss": 0.2845, + "step": 24074 + }, + { + "epoch": 0.7, + "grad_norm": 1.4559138870338169, + "learning_rate": 2.2031875029750447e-06, + "loss": 0.2994, + "step": 24075 + }, + { + "epoch": 0.7, + "grad_norm": 1.4373413751535455, + "learning_rate": 2.2027981637692407e-06, + "loss": 0.26, + "step": 24076 + }, + { + "epoch": 0.7, + "grad_norm": 1.3473753998004219, + "learning_rate": 2.2024088492487737e-06, + "loss": 0.2835, + "step": 24077 + }, + { + "epoch": 0.7, + "grad_norm": 1.2268084176944314, + "learning_rate": 2.2020195594170796e-06, + "loss": 0.2976, + "step": 24078 + }, + { + "epoch": 0.7, + "grad_norm": 1.408882854554601, + "learning_rate": 2.201630294277594e-06, + "loss": 0.2972, + "step": 24079 + }, + { + "epoch": 0.7, + "grad_norm": 1.3116997664347734, + "learning_rate": 2.201241053833752e-06, + "loss": 0.3005, + "step": 24080 + }, + { + "epoch": 0.7, + "grad_norm": 1.3013565022191904, + "learning_rate": 2.2008518380889892e-06, + "loss": 0.3078, + "step": 24081 + }, + { + "epoch": 0.7, + "grad_norm": 1.298326827127612, + "learning_rate": 2.2004626470467393e-06, + "loss": 0.2958, + "step": 24082 + }, + { + "epoch": 0.7, + "grad_norm": 1.3574931242725397, + "learning_rate": 2.2000734807104377e-06, + "loss": 0.2552, + "step": 24083 + }, + { + "epoch": 0.7, + "grad_norm": 1.2486554621533827, + "learning_rate": 2.1996843390835208e-06, + "loss": 0.2759, + "step": 24084 + }, + { + "epoch": 0.7, + "grad_norm": 1.4186035657038265, + "learning_rate": 2.199295222169419e-06, + "loss": 0.2726, + "step": 24085 + }, + { + "epoch": 0.7, + "grad_norm": 1.388319215648611, + "learning_rate": 2.198906129971568e-06, + "loss": 0.2892, + "step": 24086 + }, + { + "epoch": 0.7, + "grad_norm": 1.8999099792998235, + "learning_rate": 2.198517062493401e-06, + "loss": 0.2989, + "step": 24087 + }, + { + "epoch": 0.7, + "grad_norm": 1.3948381248904502, + "learning_rate": 2.198128019738352e-06, + "loss": 0.2653, + "step": 24088 + }, + { + "epoch": 0.7, + "grad_norm": 3.5458443737966903, + "learning_rate": 2.1977390017098548e-06, + "loss": 0.286, + "step": 24089 + }, + { + "epoch": 0.7, + "grad_norm": 1.3982835194005856, + "learning_rate": 2.1973500084113415e-06, + "loss": 0.2779, + "step": 24090 + }, + { + "epoch": 0.7, + "grad_norm": 1.3528135150143312, + "learning_rate": 2.196961039846246e-06, + "loss": 0.3353, + "step": 24091 + }, + { + "epoch": 0.7, + "grad_norm": 1.4390987151134036, + "learning_rate": 2.1965720960180014e-06, + "loss": 0.2812, + "step": 24092 + }, + { + "epoch": 0.7, + "grad_norm": 1.439707544629207, + "learning_rate": 2.1961831769300377e-06, + "loss": 0.2875, + "step": 24093 + }, + { + "epoch": 0.7, + "grad_norm": 1.3780308283345235, + "learning_rate": 2.195794282585791e-06, + "loss": 0.2944, + "step": 24094 + }, + { + "epoch": 0.7, + "grad_norm": 1.398439958881699, + "learning_rate": 2.1954054129886885e-06, + "loss": 0.3045, + "step": 24095 + }, + { + "epoch": 0.7, + "grad_norm": 1.3370914731268144, + "learning_rate": 2.1950165681421644e-06, + "loss": 0.2822, + "step": 24096 + }, + { + "epoch": 0.7, + "grad_norm": 1.333511180817603, + "learning_rate": 2.19462774804965e-06, + "loss": 0.2917, + "step": 24097 + }, + { + "epoch": 0.7, + "grad_norm": 1.451186184408764, + "learning_rate": 2.1942389527145773e-06, + "loss": 0.2871, + "step": 24098 + }, + { + "epoch": 0.7, + "grad_norm": 1.3897000245257713, + "learning_rate": 2.193850182140377e-06, + "loss": 0.2757, + "step": 24099 + }, + { + "epoch": 0.7, + "grad_norm": 1.285603997226525, + "learning_rate": 2.1934614363304798e-06, + "loss": 0.2916, + "step": 24100 + }, + { + "epoch": 0.7, + "grad_norm": 2.6267757463180677, + "learning_rate": 2.1930727152883157e-06, + "loss": 0.3014, + "step": 24101 + }, + { + "epoch": 0.7, + "grad_norm": 1.2301119565320358, + "learning_rate": 2.192684019017317e-06, + "loss": 0.267, + "step": 24102 + }, + { + "epoch": 0.7, + "grad_norm": 1.8261607492076968, + "learning_rate": 2.1922953475209146e-06, + "loss": 0.2966, + "step": 24103 + }, + { + "epoch": 0.7, + "grad_norm": 1.3234813227290825, + "learning_rate": 2.1919067008025352e-06, + "loss": 0.2763, + "step": 24104 + }, + { + "epoch": 0.7, + "grad_norm": 1.2250027097315652, + "learning_rate": 2.19151807886561e-06, + "loss": 0.2525, + "step": 24105 + }, + { + "epoch": 0.7, + "grad_norm": 1.26912796121531, + "learning_rate": 2.191129481713569e-06, + "loss": 0.2688, + "step": 24106 + }, + { + "epoch": 0.7, + "grad_norm": 1.4462493480456635, + "learning_rate": 2.1907409093498417e-06, + "loss": 0.2858, + "step": 24107 + }, + { + "epoch": 0.7, + "grad_norm": 1.5874038082302289, + "learning_rate": 2.190352361777857e-06, + "loss": 0.3025, + "step": 24108 + }, + { + "epoch": 0.7, + "grad_norm": 1.281412820054881, + "learning_rate": 2.1899638390010436e-06, + "loss": 0.2999, + "step": 24109 + }, + { + "epoch": 0.7, + "grad_norm": 1.3892609781318852, + "learning_rate": 2.189575341022831e-06, + "loss": 0.2767, + "step": 24110 + }, + { + "epoch": 0.7, + "grad_norm": 1.2773855456801446, + "learning_rate": 2.1891868678466484e-06, + "loss": 0.2714, + "step": 24111 + }, + { + "epoch": 0.7, + "grad_norm": 1.4725291311917696, + "learning_rate": 2.1887984194759215e-06, + "loss": 0.3061, + "step": 24112 + }, + { + "epoch": 0.7, + "grad_norm": 1.33360338735462, + "learning_rate": 2.18840999591408e-06, + "loss": 0.2865, + "step": 24113 + }, + { + "epoch": 0.7, + "grad_norm": 1.3914649600085114, + "learning_rate": 2.188021597164551e-06, + "loss": 0.2815, + "step": 24114 + }, + { + "epoch": 0.7, + "grad_norm": 1.2664710672598416, + "learning_rate": 2.1876332232307622e-06, + "loss": 0.2725, + "step": 24115 + }, + { + "epoch": 0.7, + "grad_norm": 1.4479501721282777, + "learning_rate": 2.1872448741161417e-06, + "loss": 0.2883, + "step": 24116 + }, + { + "epoch": 0.7, + "grad_norm": 1.7974088915141126, + "learning_rate": 2.186856549824117e-06, + "loss": 0.2829, + "step": 24117 + }, + { + "epoch": 0.7, + "grad_norm": 1.3090355889583578, + "learning_rate": 2.1864682503581136e-06, + "loss": 0.283, + "step": 24118 + }, + { + "epoch": 0.7, + "grad_norm": 1.594298271698875, + "learning_rate": 2.1860799757215594e-06, + "loss": 0.289, + "step": 24119 + }, + { + "epoch": 0.7, + "grad_norm": 2.6799646266886086, + "learning_rate": 2.1856917259178826e-06, + "loss": 0.2695, + "step": 24120 + }, + { + "epoch": 0.7, + "grad_norm": 1.2421316161065556, + "learning_rate": 2.1853035009505057e-06, + "loss": 0.2818, + "step": 24121 + }, + { + "epoch": 0.7, + "grad_norm": 1.5440516764883314, + "learning_rate": 2.1849153008228587e-06, + "loss": 0.2691, + "step": 24122 + }, + { + "epoch": 0.7, + "grad_norm": 0.9051705820081916, + "learning_rate": 2.184527125538363e-06, + "loss": 0.5586, + "step": 24123 + }, + { + "epoch": 0.7, + "grad_norm": 1.5183970651275234, + "learning_rate": 2.184138975100447e-06, + "loss": 0.3093, + "step": 24124 + }, + { + "epoch": 0.7, + "grad_norm": 1.363109828584578, + "learning_rate": 2.183750849512536e-06, + "loss": 0.3083, + "step": 24125 + }, + { + "epoch": 0.7, + "grad_norm": 1.9280068014721141, + "learning_rate": 2.1833627487780545e-06, + "loss": 0.2672, + "step": 24126 + }, + { + "epoch": 0.7, + "grad_norm": 1.456805038371524, + "learning_rate": 2.182974672900428e-06, + "loss": 0.2851, + "step": 24127 + }, + { + "epoch": 0.7, + "grad_norm": 0.8954245656860362, + "learning_rate": 2.1825866218830815e-06, + "loss": 0.5097, + "step": 24128 + }, + { + "epoch": 0.7, + "grad_norm": 1.46586584028096, + "learning_rate": 2.1821985957294395e-06, + "loss": 0.2802, + "step": 24129 + }, + { + "epoch": 0.7, + "grad_norm": 1.280385472588492, + "learning_rate": 2.181810594442928e-06, + "loss": 0.2779, + "step": 24130 + }, + { + "epoch": 0.7, + "grad_norm": 1.2924263952345523, + "learning_rate": 2.181422618026967e-06, + "loss": 0.2823, + "step": 24131 + }, + { + "epoch": 0.7, + "grad_norm": 1.5453505733676562, + "learning_rate": 2.181034666484983e-06, + "loss": 0.2842, + "step": 24132 + }, + { + "epoch": 0.7, + "grad_norm": 1.3577698457312226, + "learning_rate": 2.180646739820399e-06, + "loss": 0.2762, + "step": 24133 + }, + { + "epoch": 0.7, + "grad_norm": 1.388453331294929, + "learning_rate": 2.180258838036639e-06, + "loss": 0.3123, + "step": 24134 + }, + { + "epoch": 0.7, + "grad_norm": 1.2700817562448306, + "learning_rate": 2.179870961137126e-06, + "loss": 0.2879, + "step": 24135 + }, + { + "epoch": 0.7, + "grad_norm": 1.5489914145622095, + "learning_rate": 2.1794831091252835e-06, + "loss": 0.2824, + "step": 24136 + }, + { + "epoch": 0.7, + "grad_norm": 2.194630573144972, + "learning_rate": 2.1790952820045334e-06, + "loss": 0.2898, + "step": 24137 + }, + { + "epoch": 0.7, + "grad_norm": 1.2463115562599008, + "learning_rate": 2.1787074797782985e-06, + "loss": 0.2801, + "step": 24138 + }, + { + "epoch": 0.7, + "grad_norm": 2.5962633191255633, + "learning_rate": 2.1783197024500037e-06, + "loss": 0.3085, + "step": 24139 + }, + { + "epoch": 0.7, + "grad_norm": 1.284650266097628, + "learning_rate": 2.177931950023067e-06, + "loss": 0.2845, + "step": 24140 + }, + { + "epoch": 0.7, + "grad_norm": 1.2348889450370537, + "learning_rate": 2.177544222500912e-06, + "loss": 0.2478, + "step": 24141 + }, + { + "epoch": 0.7, + "grad_norm": 1.4718220214267745, + "learning_rate": 2.1771565198869604e-06, + "loss": 0.2859, + "step": 24142 + }, + { + "epoch": 0.7, + "grad_norm": 1.4302829034636284, + "learning_rate": 2.176768842184634e-06, + "loss": 0.2905, + "step": 24143 + }, + { + "epoch": 0.7, + "grad_norm": 1.4483608145360471, + "learning_rate": 2.1763811893973536e-06, + "loss": 0.3135, + "step": 24144 + }, + { + "epoch": 0.7, + "grad_norm": 0.8910831604694424, + "learning_rate": 2.1759935615285403e-06, + "loss": 0.5905, + "step": 24145 + }, + { + "epoch": 0.7, + "grad_norm": 1.6255446891615493, + "learning_rate": 2.175605958581616e-06, + "loss": 0.2671, + "step": 24146 + }, + { + "epoch": 0.7, + "grad_norm": 1.374120524709925, + "learning_rate": 2.175218380560002e-06, + "loss": 0.2804, + "step": 24147 + }, + { + "epoch": 0.7, + "grad_norm": 1.3251605968989568, + "learning_rate": 2.174830827467115e-06, + "loss": 0.2954, + "step": 24148 + }, + { + "epoch": 0.7, + "grad_norm": 1.2340440652851457, + "learning_rate": 2.1744432993063773e-06, + "loss": 0.2689, + "step": 24149 + }, + { + "epoch": 0.7, + "grad_norm": 1.5154859631260613, + "learning_rate": 2.174055796081211e-06, + "loss": 0.2968, + "step": 24150 + }, + { + "epoch": 0.7, + "grad_norm": 1.4640931013885194, + "learning_rate": 2.1736683177950317e-06, + "loss": 0.2981, + "step": 24151 + }, + { + "epoch": 0.7, + "grad_norm": 1.3069024868843793, + "learning_rate": 2.173280864451261e-06, + "loss": 0.2719, + "step": 24152 + }, + { + "epoch": 0.7, + "grad_norm": 1.9839296163850941, + "learning_rate": 2.1728934360533176e-06, + "loss": 0.2833, + "step": 24153 + }, + { + "epoch": 0.7, + "grad_norm": 1.3978256578298656, + "learning_rate": 2.172506032604621e-06, + "loss": 0.2821, + "step": 24154 + }, + { + "epoch": 0.7, + "grad_norm": 1.3483825352614216, + "learning_rate": 2.1721186541085905e-06, + "loss": 0.2893, + "step": 24155 + }, + { + "epoch": 0.7, + "grad_norm": 1.3836616859363498, + "learning_rate": 2.1717313005686436e-06, + "loss": 0.2817, + "step": 24156 + }, + { + "epoch": 0.7, + "grad_norm": 1.5053925188853587, + "learning_rate": 2.1713439719882e-06, + "loss": 0.2827, + "step": 24157 + }, + { + "epoch": 0.7, + "grad_norm": 1.356698393537771, + "learning_rate": 2.1709566683706785e-06, + "loss": 0.2754, + "step": 24158 + }, + { + "epoch": 0.7, + "grad_norm": 1.28267466894634, + "learning_rate": 2.1705693897194946e-06, + "loss": 0.2646, + "step": 24159 + }, + { + "epoch": 0.7, + "grad_norm": 1.597904712998626, + "learning_rate": 2.1701821360380665e-06, + "loss": 0.2956, + "step": 24160 + }, + { + "epoch": 0.7, + "grad_norm": 1.3917574563118722, + "learning_rate": 2.169794907329813e-06, + "loss": 0.2842, + "step": 24161 + }, + { + "epoch": 0.7, + "grad_norm": 1.3947938325208347, + "learning_rate": 2.16940770359815e-06, + "loss": 0.2924, + "step": 24162 + }, + { + "epoch": 0.7, + "grad_norm": 1.301729294266567, + "learning_rate": 2.169020524846497e-06, + "loss": 0.2842, + "step": 24163 + }, + { + "epoch": 0.7, + "grad_norm": 1.3133934851849132, + "learning_rate": 2.1686333710782683e-06, + "loss": 0.2783, + "step": 24164 + }, + { + "epoch": 0.7, + "grad_norm": 2.043304037994362, + "learning_rate": 2.1682462422968813e-06, + "loss": 0.2799, + "step": 24165 + }, + { + "epoch": 0.7, + "grad_norm": 1.5417053150273212, + "learning_rate": 2.1678591385057547e-06, + "loss": 0.3079, + "step": 24166 + }, + { + "epoch": 0.7, + "grad_norm": 1.3565062984458167, + "learning_rate": 2.1674720597083004e-06, + "loss": 0.2676, + "step": 24167 + }, + { + "epoch": 0.7, + "grad_norm": 1.2341548902795902, + "learning_rate": 2.1670850059079373e-06, + "loss": 0.2645, + "step": 24168 + }, + { + "epoch": 0.7, + "grad_norm": 1.6853048691079147, + "learning_rate": 2.16669797710808e-06, + "loss": 0.3201, + "step": 24169 + }, + { + "epoch": 0.7, + "grad_norm": 1.4090984595537606, + "learning_rate": 2.1663109733121446e-06, + "loss": 0.2844, + "step": 24170 + }, + { + "epoch": 0.7, + "grad_norm": 1.2908920797022787, + "learning_rate": 2.1659239945235467e-06, + "loss": 0.2723, + "step": 24171 + }, + { + "epoch": 0.7, + "grad_norm": 1.2216508135863071, + "learning_rate": 2.1655370407457007e-06, + "loss": 0.2863, + "step": 24172 + }, + { + "epoch": 0.7, + "grad_norm": 1.5450097116607506, + "learning_rate": 2.1651501119820212e-06, + "loss": 0.3008, + "step": 24173 + }, + { + "epoch": 0.7, + "grad_norm": 1.4733959170281778, + "learning_rate": 2.1647632082359236e-06, + "loss": 0.2783, + "step": 24174 + }, + { + "epoch": 0.7, + "grad_norm": 1.5246672517944875, + "learning_rate": 2.1643763295108244e-06, + "loss": 0.2923, + "step": 24175 + }, + { + "epoch": 0.7, + "grad_norm": 1.2724567052651117, + "learning_rate": 2.1639894758101336e-06, + "loss": 0.2862, + "step": 24176 + }, + { + "epoch": 0.7, + "grad_norm": 1.4788991561488622, + "learning_rate": 2.1636026471372675e-06, + "loss": 0.2926, + "step": 24177 + }, + { + "epoch": 0.7, + "grad_norm": 1.376989549918952, + "learning_rate": 2.1632158434956408e-06, + "loss": 0.2775, + "step": 24178 + }, + { + "epoch": 0.7, + "grad_norm": 1.3512761350513702, + "learning_rate": 2.162829064888664e-06, + "loss": 0.2783, + "step": 24179 + }, + { + "epoch": 0.7, + "grad_norm": 0.934697593017749, + "learning_rate": 2.1624423113197525e-06, + "loss": 0.6219, + "step": 24180 + }, + { + "epoch": 0.7, + "grad_norm": 1.3108521530874335, + "learning_rate": 2.1620555827923185e-06, + "loss": 0.2716, + "step": 24181 + }, + { + "epoch": 0.7, + "grad_norm": 1.3869299388136895, + "learning_rate": 2.1616688793097755e-06, + "loss": 0.2772, + "step": 24182 + }, + { + "epoch": 0.7, + "grad_norm": 1.268907036304445, + "learning_rate": 2.1612822008755362e-06, + "loss": 0.2819, + "step": 24183 + }, + { + "epoch": 0.7, + "grad_norm": 1.3425580650692996, + "learning_rate": 2.160895547493014e-06, + "loss": 0.2865, + "step": 24184 + }, + { + "epoch": 0.7, + "grad_norm": 1.9910811160158464, + "learning_rate": 2.1605089191656205e-06, + "loss": 0.271, + "step": 24185 + }, + { + "epoch": 0.7, + "grad_norm": 1.342137814313562, + "learning_rate": 2.160122315896766e-06, + "loss": 0.2907, + "step": 24186 + }, + { + "epoch": 0.7, + "grad_norm": 1.1957835029585575, + "learning_rate": 2.1597357376898636e-06, + "loss": 0.2675, + "step": 24187 + }, + { + "epoch": 0.7, + "grad_norm": 1.3909284349619233, + "learning_rate": 2.159349184548325e-06, + "loss": 0.2936, + "step": 24188 + }, + { + "epoch": 0.7, + "grad_norm": 1.3908757854794604, + "learning_rate": 2.158962656475561e-06, + "loss": 0.2897, + "step": 24189 + }, + { + "epoch": 0.7, + "grad_norm": 1.5522571190985313, + "learning_rate": 2.1585761534749832e-06, + "loss": 0.3206, + "step": 24190 + }, + { + "epoch": 0.7, + "grad_norm": 1.3358901836487025, + "learning_rate": 2.1581896755500025e-06, + "loss": 0.3076, + "step": 24191 + }, + { + "epoch": 0.7, + "grad_norm": 1.3613724350702396, + "learning_rate": 2.1578032227040296e-06, + "loss": 0.2956, + "step": 24192 + }, + { + "epoch": 0.7, + "grad_norm": 1.3433301051404956, + "learning_rate": 2.1574167949404747e-06, + "loss": 0.2829, + "step": 24193 + }, + { + "epoch": 0.7, + "grad_norm": 1.3614206339188586, + "learning_rate": 2.1570303922627496e-06, + "loss": 0.2669, + "step": 24194 + }, + { + "epoch": 0.7, + "grad_norm": 1.273026900034714, + "learning_rate": 2.156644014674261e-06, + "loss": 0.2875, + "step": 24195 + }, + { + "epoch": 0.7, + "grad_norm": 1.2051546977705294, + "learning_rate": 2.156257662178421e-06, + "loss": 0.2539, + "step": 24196 + }, + { + "epoch": 0.7, + "grad_norm": 1.7390216492975645, + "learning_rate": 2.155871334778638e-06, + "loss": 0.2904, + "step": 24197 + }, + { + "epoch": 0.7, + "grad_norm": 1.2390338729514827, + "learning_rate": 2.1554850324783228e-06, + "loss": 0.2965, + "step": 24198 + }, + { + "epoch": 0.7, + "grad_norm": 1.251919901505443, + "learning_rate": 2.1550987552808835e-06, + "loss": 0.2917, + "step": 24199 + }, + { + "epoch": 0.7, + "grad_norm": 0.9387985206348757, + "learning_rate": 2.1547125031897286e-06, + "loss": 0.6308, + "step": 24200 + }, + { + "epoch": 0.7, + "grad_norm": 1.3326378139717896, + "learning_rate": 2.1543262762082683e-06, + "loss": 0.2681, + "step": 24201 + }, + { + "epoch": 0.7, + "grad_norm": 1.4153092434419314, + "learning_rate": 2.1539400743399115e-06, + "loss": 0.282, + "step": 24202 + }, + { + "epoch": 0.7, + "grad_norm": 1.8442534114860583, + "learning_rate": 2.153553897588063e-06, + "loss": 0.3071, + "step": 24203 + }, + { + "epoch": 0.7, + "grad_norm": 1.2521822476557212, + "learning_rate": 2.153167745956134e-06, + "loss": 0.2864, + "step": 24204 + }, + { + "epoch": 0.7, + "grad_norm": 1.7114465222451047, + "learning_rate": 2.1527816194475306e-06, + "loss": 0.2917, + "step": 24205 + }, + { + "epoch": 0.7, + "grad_norm": 1.279644168725427, + "learning_rate": 2.1523955180656624e-06, + "loss": 0.2828, + "step": 24206 + }, + { + "epoch": 0.7, + "grad_norm": 1.2457664041073369, + "learning_rate": 2.1520094418139344e-06, + "loss": 0.267, + "step": 24207 + }, + { + "epoch": 0.7, + "grad_norm": 1.2979827423459356, + "learning_rate": 2.151623390695754e-06, + "loss": 0.2708, + "step": 24208 + }, + { + "epoch": 0.7, + "grad_norm": 1.3421209037373683, + "learning_rate": 2.151237364714529e-06, + "loss": 0.2858, + "step": 24209 + }, + { + "epoch": 0.7, + "grad_norm": 1.6033787929514225, + "learning_rate": 2.1508513638736657e-06, + "loss": 0.2802, + "step": 24210 + }, + { + "epoch": 0.7, + "grad_norm": 1.8929337059299303, + "learning_rate": 2.1504653881765706e-06, + "loss": 0.2907, + "step": 24211 + }, + { + "epoch": 0.7, + "grad_norm": 1.3171545757641878, + "learning_rate": 2.150079437626652e-06, + "loss": 0.2852, + "step": 24212 + }, + { + "epoch": 0.7, + "grad_norm": 0.9531609365208292, + "learning_rate": 2.149693512227313e-06, + "loss": 0.5394, + "step": 24213 + }, + { + "epoch": 0.7, + "grad_norm": 1.4110155538917006, + "learning_rate": 2.1493076119819595e-06, + "loss": 0.2718, + "step": 24214 + }, + { + "epoch": 0.7, + "grad_norm": 1.952134253379996, + "learning_rate": 2.148921736893998e-06, + "loss": 0.272, + "step": 24215 + }, + { + "epoch": 0.7, + "grad_norm": 1.3704795004180699, + "learning_rate": 2.1485358869668337e-06, + "loss": 0.3261, + "step": 24216 + }, + { + "epoch": 0.7, + "grad_norm": 1.2876829141314496, + "learning_rate": 2.148150062203872e-06, + "loss": 0.2913, + "step": 24217 + }, + { + "epoch": 0.7, + "grad_norm": 1.3218554904623145, + "learning_rate": 2.1477642626085175e-06, + "loss": 0.2642, + "step": 24218 + }, + { + "epoch": 0.7, + "grad_norm": 2.1880578847277103, + "learning_rate": 2.1473784881841753e-06, + "loss": 0.2794, + "step": 24219 + }, + { + "epoch": 0.7, + "grad_norm": 1.2948529641342625, + "learning_rate": 2.1469927389342495e-06, + "loss": 0.2838, + "step": 24220 + }, + { + "epoch": 0.7, + "grad_norm": 1.6856122385295313, + "learning_rate": 2.1466070148621467e-06, + "loss": 0.2875, + "step": 24221 + }, + { + "epoch": 0.7, + "grad_norm": 1.3027476343061288, + "learning_rate": 2.146221315971266e-06, + "loss": 0.2918, + "step": 24222 + }, + { + "epoch": 0.7, + "grad_norm": 1.8520406186276746, + "learning_rate": 2.145835642265014e-06, + "loss": 0.2666, + "step": 24223 + }, + { + "epoch": 0.7, + "grad_norm": 1.190025364070403, + "learning_rate": 2.1454499937467944e-06, + "loss": 0.28, + "step": 24224 + }, + { + "epoch": 0.7, + "grad_norm": 1.3550226428265426, + "learning_rate": 2.1450643704200103e-06, + "loss": 0.2633, + "step": 24225 + }, + { + "epoch": 0.7, + "grad_norm": 1.375149116285312, + "learning_rate": 2.1446787722880645e-06, + "loss": 0.2898, + "step": 24226 + }, + { + "epoch": 0.7, + "grad_norm": 1.4281560258486823, + "learning_rate": 2.1442931993543604e-06, + "loss": 0.2813, + "step": 24227 + }, + { + "epoch": 0.7, + "grad_norm": 1.5090946880268141, + "learning_rate": 2.1439076516223005e-06, + "loss": 0.3174, + "step": 24228 + }, + { + "epoch": 0.7, + "grad_norm": 1.77838091363159, + "learning_rate": 2.1435221290952874e-06, + "loss": 0.3092, + "step": 24229 + }, + { + "epoch": 0.7, + "grad_norm": 1.624239580399685, + "learning_rate": 2.143136631776725e-06, + "loss": 0.2599, + "step": 24230 + }, + { + "epoch": 0.7, + "grad_norm": 2.053535111498994, + "learning_rate": 2.142751159670011e-06, + "loss": 0.2855, + "step": 24231 + }, + { + "epoch": 0.7, + "grad_norm": 1.2713641256350081, + "learning_rate": 2.142365712778551e-06, + "loss": 0.2971, + "step": 24232 + }, + { + "epoch": 0.7, + "grad_norm": 1.244078394487478, + "learning_rate": 2.141980291105744e-06, + "loss": 0.279, + "step": 24233 + }, + { + "epoch": 0.7, + "grad_norm": 1.4938471168647425, + "learning_rate": 2.141594894654995e-06, + "loss": 0.279, + "step": 24234 + }, + { + "epoch": 0.7, + "grad_norm": 1.3665036811376652, + "learning_rate": 2.1412095234297014e-06, + "loss": 0.3127, + "step": 24235 + }, + { + "epoch": 0.7, + "grad_norm": 1.366989540421058, + "learning_rate": 2.140824177433265e-06, + "loss": 0.2942, + "step": 24236 + }, + { + "epoch": 0.7, + "grad_norm": 1.2654276344107769, + "learning_rate": 2.140438856669087e-06, + "loss": 0.275, + "step": 24237 + }, + { + "epoch": 0.7, + "grad_norm": 1.307813307736005, + "learning_rate": 2.1400535611405677e-06, + "loss": 0.2836, + "step": 24238 + }, + { + "epoch": 0.7, + "grad_norm": 1.264096154930449, + "learning_rate": 2.1396682908511076e-06, + "loss": 0.2671, + "step": 24239 + }, + { + "epoch": 0.7, + "grad_norm": 0.9069706560042727, + "learning_rate": 2.1392830458041085e-06, + "loss": 0.5467, + "step": 24240 + }, + { + "epoch": 0.7, + "grad_norm": 1.2575895019968681, + "learning_rate": 2.1388978260029662e-06, + "loss": 0.286, + "step": 24241 + }, + { + "epoch": 0.7, + "grad_norm": 2.5550888413017385, + "learning_rate": 2.1385126314510824e-06, + "loss": 0.297, + "step": 24242 + }, + { + "epoch": 0.7, + "grad_norm": 0.9503253089331423, + "learning_rate": 2.1381274621518563e-06, + "loss": 0.6184, + "step": 24243 + }, + { + "epoch": 0.7, + "grad_norm": 1.4870187323872466, + "learning_rate": 2.137742318108687e-06, + "loss": 0.2722, + "step": 24244 + }, + { + "epoch": 0.7, + "grad_norm": 1.3391845355928047, + "learning_rate": 2.137357199324974e-06, + "loss": 0.2909, + "step": 24245 + }, + { + "epoch": 0.7, + "grad_norm": 1.3373349641612227, + "learning_rate": 2.136972105804115e-06, + "loss": 0.3039, + "step": 24246 + }, + { + "epoch": 0.7, + "grad_norm": 1.2783782414677645, + "learning_rate": 2.136587037549509e-06, + "loss": 0.2739, + "step": 24247 + }, + { + "epoch": 0.7, + "grad_norm": 1.7516835023678858, + "learning_rate": 2.136201994564556e-06, + "loss": 0.2834, + "step": 24248 + }, + { + "epoch": 0.7, + "grad_norm": 1.2867748896730091, + "learning_rate": 2.1358169768526506e-06, + "loss": 0.2848, + "step": 24249 + }, + { + "epoch": 0.7, + "grad_norm": 1.299462094228315, + "learning_rate": 2.1354319844171916e-06, + "loss": 0.2839, + "step": 24250 + }, + { + "epoch": 0.7, + "grad_norm": 1.5125278227345786, + "learning_rate": 2.135047017261578e-06, + "loss": 0.293, + "step": 24251 + }, + { + "epoch": 0.7, + "grad_norm": 1.5754383145439321, + "learning_rate": 2.1346620753892055e-06, + "loss": 0.2673, + "step": 24252 + }, + { + "epoch": 0.7, + "grad_norm": 1.2983120818746643, + "learning_rate": 2.134277158803472e-06, + "loss": 0.2732, + "step": 24253 + }, + { + "epoch": 0.7, + "grad_norm": 1.3161894389725035, + "learning_rate": 2.1338922675077743e-06, + "loss": 0.2871, + "step": 24254 + }, + { + "epoch": 0.7, + "grad_norm": 1.2919071311777472, + "learning_rate": 2.1335074015055097e-06, + "loss": 0.2827, + "step": 24255 + }, + { + "epoch": 0.7, + "grad_norm": 1.3715114953453473, + "learning_rate": 2.133122560800074e-06, + "loss": 0.2964, + "step": 24256 + }, + { + "epoch": 0.7, + "grad_norm": 0.9326522094758389, + "learning_rate": 2.132737745394865e-06, + "loss": 0.5284, + "step": 24257 + }, + { + "epoch": 0.7, + "grad_norm": 1.2957556660856688, + "learning_rate": 2.1323529552932752e-06, + "loss": 0.2783, + "step": 24258 + }, + { + "epoch": 0.7, + "grad_norm": 1.5259450280021876, + "learning_rate": 2.1319681904987026e-06, + "loss": 0.305, + "step": 24259 + }, + { + "epoch": 0.7, + "grad_norm": 3.0336569388151866, + "learning_rate": 2.131583451014543e-06, + "loss": 0.2813, + "step": 24260 + }, + { + "epoch": 0.7, + "grad_norm": 1.3682042131964858, + "learning_rate": 2.1311987368441904e-06, + "loss": 0.3232, + "step": 24261 + }, + { + "epoch": 0.7, + "grad_norm": 1.331407928134742, + "learning_rate": 2.130814047991043e-06, + "loss": 0.2861, + "step": 24262 + }, + { + "epoch": 0.7, + "grad_norm": 1.284508092707561, + "learning_rate": 2.1304293844584916e-06, + "loss": 0.2953, + "step": 24263 + }, + { + "epoch": 0.7, + "grad_norm": 1.4650306862398657, + "learning_rate": 2.130044746249933e-06, + "loss": 0.2844, + "step": 24264 + }, + { + "epoch": 0.7, + "grad_norm": 1.3693825156479371, + "learning_rate": 2.129660133368761e-06, + "loss": 0.2804, + "step": 24265 + }, + { + "epoch": 0.7, + "grad_norm": 1.5450541160819842, + "learning_rate": 2.1292755458183704e-06, + "loss": 0.2841, + "step": 24266 + }, + { + "epoch": 0.7, + "grad_norm": 1.5874318142699981, + "learning_rate": 2.128890983602157e-06, + "loss": 0.3113, + "step": 24267 + }, + { + "epoch": 0.7, + "grad_norm": 1.6736053675892788, + "learning_rate": 2.12850644672351e-06, + "loss": 0.3116, + "step": 24268 + }, + { + "epoch": 0.7, + "grad_norm": 1.2993140891571664, + "learning_rate": 2.1281219351858264e-06, + "loss": 0.2822, + "step": 24269 + }, + { + "epoch": 0.7, + "grad_norm": 1.7026384758023323, + "learning_rate": 2.127737448992498e-06, + "loss": 0.2797, + "step": 24270 + }, + { + "epoch": 0.7, + "grad_norm": 1.2707743358877186, + "learning_rate": 2.1273529881469177e-06, + "loss": 0.2792, + "step": 24271 + }, + { + "epoch": 0.7, + "grad_norm": 1.2616758409614743, + "learning_rate": 2.12696855265248e-06, + "loss": 0.2783, + "step": 24272 + }, + { + "epoch": 0.7, + "grad_norm": 1.6100286817232294, + "learning_rate": 2.1265841425125773e-06, + "loss": 0.2751, + "step": 24273 + }, + { + "epoch": 0.7, + "grad_norm": 1.4769350410952966, + "learning_rate": 2.1261997577306006e-06, + "loss": 0.2759, + "step": 24274 + }, + { + "epoch": 0.7, + "grad_norm": 1.7671242578860469, + "learning_rate": 2.1258153983099427e-06, + "loss": 0.288, + "step": 24275 + }, + { + "epoch": 0.7, + "grad_norm": 1.2843259830743923, + "learning_rate": 2.125431064253998e-06, + "loss": 0.2904, + "step": 24276 + }, + { + "epoch": 0.7, + "grad_norm": 1.3110759588679326, + "learning_rate": 2.1250467555661546e-06, + "loss": 0.2916, + "step": 24277 + }, + { + "epoch": 0.7, + "grad_norm": 1.441291684380363, + "learning_rate": 2.124662472249805e-06, + "loss": 0.2745, + "step": 24278 + }, + { + "epoch": 0.7, + "grad_norm": 1.3927716253959523, + "learning_rate": 2.1242782143083413e-06, + "loss": 0.3087, + "step": 24279 + }, + { + "epoch": 0.7, + "grad_norm": 1.4411544230521336, + "learning_rate": 2.1238939817451537e-06, + "loss": 0.2768, + "step": 24280 + }, + { + "epoch": 0.7, + "grad_norm": 1.3678803988655486, + "learning_rate": 2.123509774563634e-06, + "loss": 0.2724, + "step": 24281 + }, + { + "epoch": 0.7, + "grad_norm": 1.440285016092016, + "learning_rate": 2.1231255927671723e-06, + "loss": 0.2882, + "step": 24282 + }, + { + "epoch": 0.7, + "grad_norm": 1.3980827850762392, + "learning_rate": 2.1227414363591597e-06, + "loss": 0.2753, + "step": 24283 + }, + { + "epoch": 0.7, + "grad_norm": 1.5131293014922473, + "learning_rate": 2.1223573053429867e-06, + "loss": 0.2662, + "step": 24284 + }, + { + "epoch": 0.7, + "grad_norm": 1.2509113647220722, + "learning_rate": 2.1219731997220414e-06, + "loss": 0.264, + "step": 24285 + }, + { + "epoch": 0.7, + "grad_norm": 1.466671668553565, + "learning_rate": 2.121589119499714e-06, + "loss": 0.3042, + "step": 24286 + }, + { + "epoch": 0.7, + "grad_norm": 1.3260186025079206, + "learning_rate": 2.1212050646793952e-06, + "loss": 0.2935, + "step": 24287 + }, + { + "epoch": 0.7, + "grad_norm": 1.2578660619249307, + "learning_rate": 2.120821035264473e-06, + "loss": 0.2731, + "step": 24288 + }, + { + "epoch": 0.7, + "grad_norm": 1.2313787902710591, + "learning_rate": 2.1204370312583372e-06, + "loss": 0.277, + "step": 24289 + }, + { + "epoch": 0.7, + "grad_norm": 1.8981144416921707, + "learning_rate": 2.1200530526643785e-06, + "loss": 0.2944, + "step": 24290 + }, + { + "epoch": 0.7, + "grad_norm": 1.328178239637908, + "learning_rate": 2.119669099485982e-06, + "loss": 0.2654, + "step": 24291 + }, + { + "epoch": 0.7, + "grad_norm": 1.5972962889839848, + "learning_rate": 2.1192851717265373e-06, + "loss": 0.2681, + "step": 24292 + }, + { + "epoch": 0.7, + "grad_norm": 1.409197300576649, + "learning_rate": 2.1189012693894336e-06, + "loss": 0.2847, + "step": 24293 + }, + { + "epoch": 0.7, + "grad_norm": 1.270193971408535, + "learning_rate": 2.118517392478057e-06, + "loss": 0.2784, + "step": 24294 + }, + { + "epoch": 0.7, + "grad_norm": 1.3516706029307757, + "learning_rate": 2.118133540995799e-06, + "loss": 0.2813, + "step": 24295 + }, + { + "epoch": 0.7, + "grad_norm": 1.3906537789046183, + "learning_rate": 2.1177497149460424e-06, + "loss": 0.2974, + "step": 24296 + }, + { + "epoch": 0.7, + "grad_norm": 1.5034271959119248, + "learning_rate": 2.1173659143321767e-06, + "loss": 0.2665, + "step": 24297 + }, + { + "epoch": 0.7, + "grad_norm": 1.3962848564587478, + "learning_rate": 2.1169821391575883e-06, + "loss": 0.2891, + "step": 24298 + }, + { + "epoch": 0.7, + "grad_norm": 1.3473086919915687, + "learning_rate": 2.1165983894256647e-06, + "loss": 0.2804, + "step": 24299 + }, + { + "epoch": 0.7, + "grad_norm": 1.5168152182141679, + "learning_rate": 2.1162146651397925e-06, + "loss": 0.3034, + "step": 24300 + }, + { + "epoch": 0.7, + "grad_norm": 1.4051488131086007, + "learning_rate": 2.115830966303358e-06, + "loss": 0.272, + "step": 24301 + }, + { + "epoch": 0.7, + "grad_norm": 1.3318643804254413, + "learning_rate": 2.1154472929197475e-06, + "loss": 0.2827, + "step": 24302 + }, + { + "epoch": 0.7, + "grad_norm": 1.262588806890428, + "learning_rate": 2.115063644992348e-06, + "loss": 0.2679, + "step": 24303 + }, + { + "epoch": 0.7, + "grad_norm": 1.2902419787651622, + "learning_rate": 2.1146800225245417e-06, + "loss": 0.2907, + "step": 24304 + }, + { + "epoch": 0.7, + "grad_norm": 1.3060671188758324, + "learning_rate": 2.1142964255197164e-06, + "loss": 0.282, + "step": 24305 + }, + { + "epoch": 0.7, + "grad_norm": 1.3182269563545166, + "learning_rate": 2.1139128539812577e-06, + "loss": 0.2747, + "step": 24306 + }, + { + "epoch": 0.71, + "grad_norm": 1.3714968762161104, + "learning_rate": 2.113529307912549e-06, + "loss": 0.2798, + "step": 24307 + }, + { + "epoch": 0.71, + "grad_norm": 1.2927331166908933, + "learning_rate": 2.113145787316977e-06, + "loss": 0.2886, + "step": 24308 + }, + { + "epoch": 0.71, + "grad_norm": 1.3364073638170761, + "learning_rate": 2.1127622921979253e-06, + "loss": 0.2908, + "step": 24309 + }, + { + "epoch": 0.71, + "grad_norm": 1.3946547661055488, + "learning_rate": 2.1123788225587782e-06, + "loss": 0.2851, + "step": 24310 + }, + { + "epoch": 0.71, + "grad_norm": 1.2647844635285095, + "learning_rate": 2.1119953784029207e-06, + "loss": 0.2693, + "step": 24311 + }, + { + "epoch": 0.71, + "grad_norm": 1.496824324746633, + "learning_rate": 2.111611959733737e-06, + "loss": 0.3306, + "step": 24312 + }, + { + "epoch": 0.71, + "grad_norm": 1.7399396792721782, + "learning_rate": 2.1112285665546082e-06, + "loss": 0.2692, + "step": 24313 + }, + { + "epoch": 0.71, + "grad_norm": 1.1855990572383757, + "learning_rate": 2.11084519886892e-06, + "loss": 0.2547, + "step": 24314 + }, + { + "epoch": 0.71, + "grad_norm": 1.373484590991857, + "learning_rate": 2.1104618566800545e-06, + "loss": 0.3102, + "step": 24315 + }, + { + "epoch": 0.71, + "grad_norm": 1.368292600951441, + "learning_rate": 2.1100785399913946e-06, + "loss": 0.309, + "step": 24316 + }, + { + "epoch": 0.71, + "grad_norm": 1.6741709983996618, + "learning_rate": 2.1096952488063246e-06, + "loss": 0.2952, + "step": 24317 + }, + { + "epoch": 0.71, + "grad_norm": 1.734631878280845, + "learning_rate": 2.109311983128227e-06, + "loss": 0.2957, + "step": 24318 + }, + { + "epoch": 0.71, + "grad_norm": 1.2041025654252653, + "learning_rate": 2.108928742960482e-06, + "loss": 0.2679, + "step": 24319 + }, + { + "epoch": 0.71, + "grad_norm": 1.3036567120848188, + "learning_rate": 2.1085455283064732e-06, + "loss": 0.2799, + "step": 24320 + }, + { + "epoch": 0.71, + "grad_norm": 4.245307022378046, + "learning_rate": 2.1081623391695818e-06, + "loss": 0.2826, + "step": 24321 + }, + { + "epoch": 0.71, + "grad_norm": 1.3312058867153147, + "learning_rate": 2.107779175553191e-06, + "loss": 0.2668, + "step": 24322 + }, + { + "epoch": 0.71, + "grad_norm": 1.2600429703945593, + "learning_rate": 2.1073960374606807e-06, + "loss": 0.3076, + "step": 24323 + }, + { + "epoch": 0.71, + "grad_norm": 1.238979454528688, + "learning_rate": 2.1070129248954313e-06, + "loss": 0.2759, + "step": 24324 + }, + { + "epoch": 0.71, + "grad_norm": 1.9602425962338754, + "learning_rate": 2.106629837860825e-06, + "loss": 0.3058, + "step": 24325 + }, + { + "epoch": 0.71, + "grad_norm": 1.4306470812579397, + "learning_rate": 2.106246776360243e-06, + "loss": 0.2804, + "step": 24326 + }, + { + "epoch": 0.71, + "grad_norm": 1.3548650947234624, + "learning_rate": 2.105863740397065e-06, + "loss": 0.2819, + "step": 24327 + }, + { + "epoch": 0.71, + "grad_norm": 1.4562136911767616, + "learning_rate": 2.1054807299746714e-06, + "loss": 0.279, + "step": 24328 + }, + { + "epoch": 0.71, + "grad_norm": 1.2614561412061482, + "learning_rate": 2.105097745096443e-06, + "loss": 0.3134, + "step": 24329 + }, + { + "epoch": 0.71, + "grad_norm": 1.5264166748504735, + "learning_rate": 2.104714785765759e-06, + "loss": 0.2888, + "step": 24330 + }, + { + "epoch": 0.71, + "grad_norm": 1.384499880549198, + "learning_rate": 2.1043318519860005e-06, + "loss": 0.2818, + "step": 24331 + }, + { + "epoch": 0.71, + "grad_norm": 1.4546477109747051, + "learning_rate": 2.103948943760544e-06, + "loss": 0.2895, + "step": 24332 + }, + { + "epoch": 0.71, + "grad_norm": 1.8316597525755551, + "learning_rate": 2.1035660610927705e-06, + "loss": 0.2781, + "step": 24333 + }, + { + "epoch": 0.71, + "grad_norm": 1.2771821607344638, + "learning_rate": 2.103183203986058e-06, + "loss": 0.2867, + "step": 24334 + }, + { + "epoch": 0.71, + "grad_norm": 1.5618157630703549, + "learning_rate": 2.102800372443786e-06, + "loss": 0.2953, + "step": 24335 + }, + { + "epoch": 0.71, + "grad_norm": 1.340584352625184, + "learning_rate": 2.102417566469333e-06, + "loss": 0.2761, + "step": 24336 + }, + { + "epoch": 0.71, + "grad_norm": 1.3077525545067028, + "learning_rate": 2.102034786066077e-06, + "loss": 0.2888, + "step": 24337 + }, + { + "epoch": 0.71, + "grad_norm": 1.2277523527396426, + "learning_rate": 2.1016520312373965e-06, + "loss": 0.2627, + "step": 24338 + }, + { + "epoch": 0.71, + "grad_norm": 2.5967825855584676, + "learning_rate": 2.101269301986671e-06, + "loss": 0.2817, + "step": 24339 + }, + { + "epoch": 0.71, + "grad_norm": 1.2965981318801665, + "learning_rate": 2.100886598317274e-06, + "loss": 0.2686, + "step": 24340 + }, + { + "epoch": 0.71, + "grad_norm": 1.6978021671588441, + "learning_rate": 2.1005039202325845e-06, + "loss": 0.2846, + "step": 24341 + }, + { + "epoch": 0.71, + "grad_norm": 0.9032910077058517, + "learning_rate": 2.1001212677359808e-06, + "loss": 0.5671, + "step": 24342 + }, + { + "epoch": 0.71, + "grad_norm": 1.2380792779677134, + "learning_rate": 2.0997386408308385e-06, + "loss": 0.2849, + "step": 24343 + }, + { + "epoch": 0.71, + "grad_norm": 1.2181022400984636, + "learning_rate": 2.099356039520535e-06, + "loss": 0.2708, + "step": 24344 + }, + { + "epoch": 0.71, + "grad_norm": 1.2497025501877135, + "learning_rate": 2.0989734638084465e-06, + "loss": 0.2695, + "step": 24345 + }, + { + "epoch": 0.71, + "grad_norm": 1.332177422560449, + "learning_rate": 2.0985909136979493e-06, + "loss": 0.2761, + "step": 24346 + }, + { + "epoch": 0.71, + "grad_norm": 1.345802789151704, + "learning_rate": 2.0982083891924217e-06, + "loss": 0.2597, + "step": 24347 + }, + { + "epoch": 0.71, + "grad_norm": 1.5666728836202508, + "learning_rate": 2.097825890295235e-06, + "loss": 0.2939, + "step": 24348 + }, + { + "epoch": 0.71, + "grad_norm": 1.3043829290921032, + "learning_rate": 2.0974434170097674e-06, + "loss": 0.2705, + "step": 24349 + }, + { + "epoch": 0.71, + "grad_norm": 1.522011359722622, + "learning_rate": 2.0970609693393955e-06, + "loss": 0.2811, + "step": 24350 + }, + { + "epoch": 0.71, + "grad_norm": 1.2155293430157448, + "learning_rate": 2.096678547287491e-06, + "loss": 0.2966, + "step": 24351 + }, + { + "epoch": 0.71, + "grad_norm": 1.8954351940904326, + "learning_rate": 2.0962961508574302e-06, + "loss": 0.2739, + "step": 24352 + }, + { + "epoch": 0.71, + "grad_norm": 1.328826782010594, + "learning_rate": 2.0959137800525886e-06, + "loss": 0.268, + "step": 24353 + }, + { + "epoch": 0.71, + "grad_norm": 1.3557695504544591, + "learning_rate": 2.09553143487634e-06, + "loss": 0.2846, + "step": 24354 + }, + { + "epoch": 0.71, + "grad_norm": 1.225219257053574, + "learning_rate": 2.0951491153320586e-06, + "loss": 0.2724, + "step": 24355 + }, + { + "epoch": 0.71, + "grad_norm": 1.2846970388876635, + "learning_rate": 2.094766821423118e-06, + "loss": 0.2889, + "step": 24356 + }, + { + "epoch": 0.71, + "grad_norm": 1.2755745662704931, + "learning_rate": 2.0943845531528932e-06, + "loss": 0.275, + "step": 24357 + }, + { + "epoch": 0.71, + "grad_norm": 1.3570184811421884, + "learning_rate": 2.094002310524759e-06, + "loss": 0.2857, + "step": 24358 + }, + { + "epoch": 0.71, + "grad_norm": 1.292698836579218, + "learning_rate": 2.093620093542084e-06, + "loss": 0.2776, + "step": 24359 + }, + { + "epoch": 0.71, + "grad_norm": 1.6645721233984592, + "learning_rate": 2.0932379022082437e-06, + "loss": 0.3038, + "step": 24360 + }, + { + "epoch": 0.71, + "grad_norm": 1.6132944684256572, + "learning_rate": 2.092855736526612e-06, + "loss": 0.2664, + "step": 24361 + }, + { + "epoch": 0.71, + "grad_norm": 1.2635485443900754, + "learning_rate": 2.0924735965005603e-06, + "loss": 0.2782, + "step": 24362 + }, + { + "epoch": 0.71, + "grad_norm": 1.465557677508429, + "learning_rate": 2.0920914821334615e-06, + "loss": 0.2991, + "step": 24363 + }, + { + "epoch": 0.71, + "grad_norm": 1.49549984543282, + "learning_rate": 2.0917093934286875e-06, + "loss": 0.2735, + "step": 24364 + }, + { + "epoch": 0.71, + "grad_norm": 1.48442242346087, + "learning_rate": 2.09132733038961e-06, + "loss": 0.2976, + "step": 24365 + }, + { + "epoch": 0.71, + "grad_norm": 1.332096914656607, + "learning_rate": 2.0909452930196017e-06, + "loss": 0.2761, + "step": 24366 + }, + { + "epoch": 0.71, + "grad_norm": 1.324550203782894, + "learning_rate": 2.090563281322035e-06, + "loss": 0.2705, + "step": 24367 + }, + { + "epoch": 0.71, + "grad_norm": 1.2594478377143834, + "learning_rate": 2.090181295300278e-06, + "loss": 0.2941, + "step": 24368 + }, + { + "epoch": 0.71, + "grad_norm": 1.3394621000023628, + "learning_rate": 2.0897993349577035e-06, + "loss": 0.2723, + "step": 24369 + }, + { + "epoch": 0.71, + "grad_norm": 1.6781817739014147, + "learning_rate": 2.0894174002976815e-06, + "loss": 0.2868, + "step": 24370 + }, + { + "epoch": 0.71, + "grad_norm": 1.4435396084728367, + "learning_rate": 2.089035491323584e-06, + "loss": 0.2834, + "step": 24371 + }, + { + "epoch": 0.71, + "grad_norm": 1.297807700816256, + "learning_rate": 2.08865360803878e-06, + "loss": 0.2853, + "step": 24372 + }, + { + "epoch": 0.71, + "grad_norm": 1.398829695464175, + "learning_rate": 2.0882717504466403e-06, + "loss": 0.268, + "step": 24373 + }, + { + "epoch": 0.71, + "grad_norm": 1.2112952627357536, + "learning_rate": 2.0878899185505346e-06, + "loss": 0.2884, + "step": 24374 + }, + { + "epoch": 0.71, + "grad_norm": 1.2060623525601946, + "learning_rate": 2.087508112353835e-06, + "loss": 0.28, + "step": 24375 + }, + { + "epoch": 0.71, + "grad_norm": 1.2963896827514878, + "learning_rate": 2.087126331859906e-06, + "loss": 0.291, + "step": 24376 + }, + { + "epoch": 0.71, + "grad_norm": 1.287812562937987, + "learning_rate": 2.0867445770721216e-06, + "loss": 0.2811, + "step": 24377 + }, + { + "epoch": 0.71, + "grad_norm": 1.3868383585382622, + "learning_rate": 2.0863628479938475e-06, + "loss": 0.2877, + "step": 24378 + }, + { + "epoch": 0.71, + "grad_norm": 2.1543657157413856, + "learning_rate": 2.085981144628453e-06, + "loss": 0.3, + "step": 24379 + }, + { + "epoch": 0.71, + "grad_norm": 1.3775472783557114, + "learning_rate": 2.085599466979307e-06, + "loss": 0.2592, + "step": 24380 + }, + { + "epoch": 0.71, + "grad_norm": 1.2715691166658534, + "learning_rate": 2.0852178150497783e-06, + "loss": 0.2695, + "step": 24381 + }, + { + "epoch": 0.71, + "grad_norm": 1.3919007418280827, + "learning_rate": 2.0848361888432343e-06, + "loss": 0.2652, + "step": 24382 + }, + { + "epoch": 0.71, + "grad_norm": 1.5149795311597143, + "learning_rate": 2.084454588363044e-06, + "loss": 0.2818, + "step": 24383 + }, + { + "epoch": 0.71, + "grad_norm": 1.2936212423100075, + "learning_rate": 2.084073013612574e-06, + "loss": 0.3005, + "step": 24384 + }, + { + "epoch": 0.71, + "grad_norm": 1.3536677547292344, + "learning_rate": 2.083691464595192e-06, + "loss": 0.2712, + "step": 24385 + }, + { + "epoch": 0.71, + "grad_norm": 1.2985013743390608, + "learning_rate": 2.083309941314267e-06, + "loss": 0.2765, + "step": 24386 + }, + { + "epoch": 0.71, + "grad_norm": 1.250681911930591, + "learning_rate": 2.0829284437731623e-06, + "loss": 0.2827, + "step": 24387 + }, + { + "epoch": 0.71, + "grad_norm": 1.487848843883479, + "learning_rate": 2.082546971975246e-06, + "loss": 0.3233, + "step": 24388 + }, + { + "epoch": 0.71, + "grad_norm": 1.3577150511632288, + "learning_rate": 2.082165525923886e-06, + "loss": 0.2769, + "step": 24389 + }, + { + "epoch": 0.71, + "grad_norm": 1.888566713838928, + "learning_rate": 2.0817841056224468e-06, + "loss": 0.273, + "step": 24390 + }, + { + "epoch": 0.71, + "grad_norm": 0.9290265940223036, + "learning_rate": 2.0814027110742957e-06, + "loss": 0.5637, + "step": 24391 + }, + { + "epoch": 0.71, + "grad_norm": 1.8832518112589776, + "learning_rate": 2.0810213422827974e-06, + "loss": 0.2972, + "step": 24392 + }, + { + "epoch": 0.71, + "grad_norm": 1.2083580276027877, + "learning_rate": 2.080639999251319e-06, + "loss": 0.2722, + "step": 24393 + }, + { + "epoch": 0.71, + "grad_norm": 1.2141811718251867, + "learning_rate": 2.080258681983226e-06, + "loss": 0.2795, + "step": 24394 + }, + { + "epoch": 0.71, + "grad_norm": 1.3109935847714604, + "learning_rate": 2.0798773904818814e-06, + "loss": 0.2611, + "step": 24395 + }, + { + "epoch": 0.71, + "grad_norm": 1.2950502721183887, + "learning_rate": 2.079496124750651e-06, + "loss": 0.2711, + "step": 24396 + }, + { + "epoch": 0.71, + "grad_norm": 1.4155091849747257, + "learning_rate": 2.0791148847928994e-06, + "loss": 0.2923, + "step": 24397 + }, + { + "epoch": 0.71, + "grad_norm": 1.2828432029916257, + "learning_rate": 2.0787336706119915e-06, + "loss": 0.283, + "step": 24398 + }, + { + "epoch": 0.71, + "grad_norm": 1.3316142790136085, + "learning_rate": 2.0783524822112916e-06, + "loss": 0.2539, + "step": 24399 + }, + { + "epoch": 0.71, + "grad_norm": 1.3998754166971557, + "learning_rate": 2.077971319594163e-06, + "loss": 0.2829, + "step": 24400 + }, + { + "epoch": 0.71, + "grad_norm": 1.6600179916076876, + "learning_rate": 2.0775901827639703e-06, + "loss": 0.2851, + "step": 24401 + }, + { + "epoch": 0.71, + "grad_norm": 1.2416430733450075, + "learning_rate": 2.0772090717240766e-06, + "loss": 0.2637, + "step": 24402 + }, + { + "epoch": 0.71, + "grad_norm": 1.318305739829436, + "learning_rate": 2.0768279864778475e-06, + "loss": 0.3057, + "step": 24403 + }, + { + "epoch": 0.71, + "grad_norm": 1.2844850415354863, + "learning_rate": 2.0764469270286414e-06, + "loss": 0.2735, + "step": 24404 + }, + { + "epoch": 0.71, + "grad_norm": 1.2594861933111812, + "learning_rate": 2.0760658933798254e-06, + "loss": 0.2579, + "step": 24405 + }, + { + "epoch": 0.71, + "grad_norm": 1.3434425533469554, + "learning_rate": 2.075684885534759e-06, + "loss": 0.2847, + "step": 24406 + }, + { + "epoch": 0.71, + "grad_norm": 1.2641475171928385, + "learning_rate": 2.0753039034968053e-06, + "loss": 0.2671, + "step": 24407 + }, + { + "epoch": 0.71, + "grad_norm": 1.3344154209761527, + "learning_rate": 2.074922947269327e-06, + "loss": 0.2838, + "step": 24408 + }, + { + "epoch": 0.71, + "grad_norm": 1.3089002670638088, + "learning_rate": 2.074542016855687e-06, + "loss": 0.305, + "step": 24409 + }, + { + "epoch": 0.71, + "grad_norm": 1.3168752159048374, + "learning_rate": 2.074161112259246e-06, + "loss": 0.2645, + "step": 24410 + }, + { + "epoch": 0.71, + "grad_norm": 0.9033730306493992, + "learning_rate": 2.073780233483365e-06, + "loss": 0.5026, + "step": 24411 + }, + { + "epoch": 0.71, + "grad_norm": 2.1240699270309538, + "learning_rate": 2.0733993805314062e-06, + "loss": 0.2886, + "step": 24412 + }, + { + "epoch": 0.71, + "grad_norm": 2.435688198032663, + "learning_rate": 2.0730185534067323e-06, + "loss": 0.2619, + "step": 24413 + }, + { + "epoch": 0.71, + "grad_norm": 1.397729081222176, + "learning_rate": 2.0726377521127e-06, + "loss": 0.2857, + "step": 24414 + }, + { + "epoch": 0.71, + "grad_norm": 1.3927088986004468, + "learning_rate": 2.072256976652672e-06, + "loss": 0.2787, + "step": 24415 + }, + { + "epoch": 0.71, + "grad_norm": 1.3693409793631346, + "learning_rate": 2.071876227030009e-06, + "loss": 0.262, + "step": 24416 + }, + { + "epoch": 0.71, + "grad_norm": 1.6861432692489644, + "learning_rate": 2.071495503248071e-06, + "loss": 0.2739, + "step": 24417 + }, + { + "epoch": 0.71, + "grad_norm": 1.2709289235975942, + "learning_rate": 2.0711148053102175e-06, + "loss": 0.2656, + "step": 24418 + }, + { + "epoch": 0.71, + "grad_norm": 1.4164871013632463, + "learning_rate": 2.0707341332198076e-06, + "loss": 0.3145, + "step": 24419 + }, + { + "epoch": 0.71, + "grad_norm": 1.341982423971294, + "learning_rate": 2.0703534869802024e-06, + "loss": 0.2907, + "step": 24420 + }, + { + "epoch": 0.71, + "grad_norm": 2.6357336578931125, + "learning_rate": 2.06997286659476e-06, + "loss": 0.2719, + "step": 24421 + }, + { + "epoch": 0.71, + "grad_norm": 0.9695471262240003, + "learning_rate": 2.069592272066841e-06, + "loss": 0.5522, + "step": 24422 + }, + { + "epoch": 0.71, + "grad_norm": 1.2750930495197912, + "learning_rate": 2.069211703399801e-06, + "loss": 0.2663, + "step": 24423 + }, + { + "epoch": 0.71, + "grad_norm": 1.7325496103165583, + "learning_rate": 2.0688311605970006e-06, + "loss": 0.2714, + "step": 24424 + }, + { + "epoch": 0.71, + "grad_norm": 1.4775358213613552, + "learning_rate": 2.0684506436617974e-06, + "loss": 0.2985, + "step": 24425 + }, + { + "epoch": 0.71, + "grad_norm": 1.4725829945165503, + "learning_rate": 2.06807015259755e-06, + "loss": 0.276, + "step": 24426 + }, + { + "epoch": 0.71, + "grad_norm": 1.3788667907232532, + "learning_rate": 2.0676896874076156e-06, + "loss": 0.2607, + "step": 24427 + }, + { + "epoch": 0.71, + "grad_norm": 1.2811106647910753, + "learning_rate": 2.067309248095353e-06, + "loss": 0.3069, + "step": 24428 + }, + { + "epoch": 0.71, + "grad_norm": 1.4977142055593264, + "learning_rate": 2.066928834664118e-06, + "loss": 0.2946, + "step": 24429 + }, + { + "epoch": 0.71, + "grad_norm": 1.2662381005044818, + "learning_rate": 2.066548447117271e-06, + "loss": 0.2768, + "step": 24430 + }, + { + "epoch": 0.71, + "grad_norm": 1.3149244832497768, + "learning_rate": 2.066168085458164e-06, + "loss": 0.2572, + "step": 24431 + }, + { + "epoch": 0.71, + "grad_norm": 1.3335783506540324, + "learning_rate": 2.0657877496901583e-06, + "loss": 0.2896, + "step": 24432 + }, + { + "epoch": 0.71, + "grad_norm": 1.4498434109753797, + "learning_rate": 2.065407439816607e-06, + "loss": 0.2702, + "step": 24433 + }, + { + "epoch": 0.71, + "grad_norm": 1.3024226838276802, + "learning_rate": 2.065027155840867e-06, + "loss": 0.3045, + "step": 24434 + }, + { + "epoch": 0.71, + "grad_norm": 1.2618895060206161, + "learning_rate": 2.064646897766295e-06, + "loss": 0.2869, + "step": 24435 + }, + { + "epoch": 0.71, + "grad_norm": 1.5309181829321872, + "learning_rate": 2.0642666655962468e-06, + "loss": 0.2958, + "step": 24436 + }, + { + "epoch": 0.71, + "grad_norm": 1.2174159841988805, + "learning_rate": 2.0638864593340774e-06, + "loss": 0.2804, + "step": 24437 + }, + { + "epoch": 0.71, + "grad_norm": 1.3144841301196493, + "learning_rate": 2.0635062789831434e-06, + "loss": 0.2656, + "step": 24438 + }, + { + "epoch": 0.71, + "grad_norm": 1.4741072959724593, + "learning_rate": 2.063126124546798e-06, + "loss": 0.2854, + "step": 24439 + }, + { + "epoch": 0.71, + "grad_norm": 1.3474357727335504, + "learning_rate": 2.0627459960283978e-06, + "loss": 0.2768, + "step": 24440 + }, + { + "epoch": 0.71, + "grad_norm": 1.2574488467887994, + "learning_rate": 2.0623658934312992e-06, + "loss": 0.2821, + "step": 24441 + }, + { + "epoch": 0.71, + "grad_norm": 1.4707200564575118, + "learning_rate": 2.0619858167588514e-06, + "loss": 0.3213, + "step": 24442 + }, + { + "epoch": 0.71, + "grad_norm": 1.256813483743948, + "learning_rate": 2.0616057660144117e-06, + "loss": 0.2612, + "step": 24443 + }, + { + "epoch": 0.71, + "grad_norm": 1.5150784585113661, + "learning_rate": 2.0612257412013337e-06, + "loss": 0.2792, + "step": 24444 + }, + { + "epoch": 0.71, + "grad_norm": 1.362927429634907, + "learning_rate": 2.060845742322971e-06, + "loss": 0.2787, + "step": 24445 + }, + { + "epoch": 0.71, + "grad_norm": 1.1951805201118308, + "learning_rate": 2.0604657693826774e-06, + "loss": 0.2748, + "step": 24446 + }, + { + "epoch": 0.71, + "grad_norm": 1.3822838839129945, + "learning_rate": 2.060085822383806e-06, + "loss": 0.2772, + "step": 24447 + }, + { + "epoch": 0.71, + "grad_norm": 1.2835124890514114, + "learning_rate": 2.0597059013297093e-06, + "loss": 0.2728, + "step": 24448 + }, + { + "epoch": 0.71, + "grad_norm": 1.380682849874334, + "learning_rate": 2.059326006223743e-06, + "loss": 0.2655, + "step": 24449 + }, + { + "epoch": 0.71, + "grad_norm": 1.5204333235929217, + "learning_rate": 2.0589461370692554e-06, + "loss": 0.2648, + "step": 24450 + }, + { + "epoch": 0.71, + "grad_norm": 1.3016114590114487, + "learning_rate": 2.058566293869601e-06, + "loss": 0.2746, + "step": 24451 + }, + { + "epoch": 0.71, + "grad_norm": 1.2490998228853012, + "learning_rate": 2.0581864766281314e-06, + "loss": 0.271, + "step": 24452 + }, + { + "epoch": 0.71, + "grad_norm": 1.649106214419266, + "learning_rate": 2.0578066853481987e-06, + "loss": 0.2784, + "step": 24453 + }, + { + "epoch": 0.71, + "grad_norm": 1.442569800498196, + "learning_rate": 2.057426920033155e-06, + "loss": 0.271, + "step": 24454 + }, + { + "epoch": 0.71, + "grad_norm": 1.350221997979162, + "learning_rate": 2.0570471806863512e-06, + "loss": 0.3132, + "step": 24455 + }, + { + "epoch": 0.71, + "grad_norm": 1.5298128804335178, + "learning_rate": 2.0566674673111385e-06, + "loss": 0.2651, + "step": 24456 + }, + { + "epoch": 0.71, + "grad_norm": 1.560591511120245, + "learning_rate": 2.0562877799108683e-06, + "loss": 0.2659, + "step": 24457 + }, + { + "epoch": 0.71, + "grad_norm": 1.7880326051266444, + "learning_rate": 2.055908118488893e-06, + "loss": 0.2876, + "step": 24458 + }, + { + "epoch": 0.71, + "grad_norm": 1.2386124215420882, + "learning_rate": 2.055528483048559e-06, + "loss": 0.2692, + "step": 24459 + }, + { + "epoch": 0.71, + "grad_norm": 1.1736095505262858, + "learning_rate": 2.0551488735932205e-06, + "loss": 0.2736, + "step": 24460 + }, + { + "epoch": 0.71, + "grad_norm": 1.4192838644197883, + "learning_rate": 2.0547692901262244e-06, + "loss": 0.2613, + "step": 24461 + }, + { + "epoch": 0.71, + "grad_norm": 1.31874363972324, + "learning_rate": 2.054389732650922e-06, + "loss": 0.2807, + "step": 24462 + }, + { + "epoch": 0.71, + "grad_norm": 0.9871454869659543, + "learning_rate": 2.054010201170663e-06, + "loss": 0.5739, + "step": 24463 + }, + { + "epoch": 0.71, + "grad_norm": 1.4185592200429478, + "learning_rate": 2.0536306956887964e-06, + "loss": 0.2873, + "step": 24464 + }, + { + "epoch": 0.71, + "grad_norm": 1.3676438667007815, + "learning_rate": 2.053251216208672e-06, + "loss": 0.2972, + "step": 24465 + }, + { + "epoch": 0.71, + "grad_norm": 1.5093664740231152, + "learning_rate": 2.0528717627336382e-06, + "loss": 0.3018, + "step": 24466 + }, + { + "epoch": 0.71, + "grad_norm": 1.2319350002728546, + "learning_rate": 2.0524923352670433e-06, + "loss": 0.2658, + "step": 24467 + }, + { + "epoch": 0.71, + "grad_norm": 1.470028927583697, + "learning_rate": 2.052112933812238e-06, + "loss": 0.2961, + "step": 24468 + }, + { + "epoch": 0.71, + "grad_norm": 1.4421003398249448, + "learning_rate": 2.051733558372567e-06, + "loss": 0.2914, + "step": 24469 + }, + { + "epoch": 0.71, + "grad_norm": 1.5691297309111065, + "learning_rate": 2.051354208951381e-06, + "loss": 0.2931, + "step": 24470 + }, + { + "epoch": 0.71, + "grad_norm": 1.430329866520835, + "learning_rate": 2.050974885552026e-06, + "loss": 0.2806, + "step": 24471 + }, + { + "epoch": 0.71, + "grad_norm": 1.271866379326537, + "learning_rate": 2.0505955881778506e-06, + "loss": 0.2824, + "step": 24472 + }, + { + "epoch": 0.71, + "grad_norm": 2.1778437936053328, + "learning_rate": 2.0502163168322016e-06, + "loss": 0.2779, + "step": 24473 + }, + { + "epoch": 0.71, + "grad_norm": 1.4271027671367946, + "learning_rate": 2.0498370715184267e-06, + "loss": 0.2652, + "step": 24474 + }, + { + "epoch": 0.71, + "grad_norm": 1.243536609931687, + "learning_rate": 2.049457852239872e-06, + "loss": 0.273, + "step": 24475 + }, + { + "epoch": 0.71, + "grad_norm": 1.3121546824944692, + "learning_rate": 2.0490786589998846e-06, + "loss": 0.2563, + "step": 24476 + }, + { + "epoch": 0.71, + "grad_norm": 6.43212523378824, + "learning_rate": 2.0486994918018127e-06, + "loss": 0.2844, + "step": 24477 + }, + { + "epoch": 0.71, + "grad_norm": 1.3049658905761055, + "learning_rate": 2.0483203506489986e-06, + "loss": 0.2996, + "step": 24478 + }, + { + "epoch": 0.71, + "grad_norm": 1.31026478015871, + "learning_rate": 2.04794123554479e-06, + "loss": 0.2829, + "step": 24479 + }, + { + "epoch": 0.71, + "grad_norm": 1.345104267441397, + "learning_rate": 2.0475621464925334e-06, + "loss": 0.2723, + "step": 24480 + }, + { + "epoch": 0.71, + "grad_norm": 1.254007325994449, + "learning_rate": 2.047183083495573e-06, + "loss": 0.2892, + "step": 24481 + }, + { + "epoch": 0.71, + "grad_norm": 1.2767909952976382, + "learning_rate": 2.0468040465572546e-06, + "loss": 0.2787, + "step": 24482 + }, + { + "epoch": 0.71, + "grad_norm": 1.4058830568122438, + "learning_rate": 2.0464250356809233e-06, + "loss": 0.3001, + "step": 24483 + }, + { + "epoch": 0.71, + "grad_norm": 1.3710785219846255, + "learning_rate": 2.0460460508699244e-06, + "loss": 0.2986, + "step": 24484 + }, + { + "epoch": 0.71, + "grad_norm": 1.4041105368323703, + "learning_rate": 2.0456670921276028e-06, + "loss": 0.2874, + "step": 24485 + }, + { + "epoch": 0.71, + "grad_norm": 1.2370984804529264, + "learning_rate": 2.045288159457301e-06, + "loss": 0.2622, + "step": 24486 + }, + { + "epoch": 0.71, + "grad_norm": 1.4357761572869885, + "learning_rate": 2.044909252862365e-06, + "loss": 0.2934, + "step": 24487 + }, + { + "epoch": 0.71, + "grad_norm": 1.5812323336201703, + "learning_rate": 2.0445303723461363e-06, + "loss": 0.2882, + "step": 24488 + }, + { + "epoch": 0.71, + "grad_norm": 1.227600705970895, + "learning_rate": 2.0441515179119598e-06, + "loss": 0.2677, + "step": 24489 + }, + { + "epoch": 0.71, + "grad_norm": 1.2560873948497073, + "learning_rate": 2.043772689563179e-06, + "loss": 0.2938, + "step": 24490 + }, + { + "epoch": 0.71, + "grad_norm": 1.445019744565085, + "learning_rate": 2.0433938873031363e-06, + "loss": 0.268, + "step": 24491 + }, + { + "epoch": 0.71, + "grad_norm": 1.2293044504694288, + "learning_rate": 2.043015111135176e-06, + "loss": 0.2733, + "step": 24492 + }, + { + "epoch": 0.71, + "grad_norm": 1.2680504255748961, + "learning_rate": 2.04263636106264e-06, + "loss": 0.2581, + "step": 24493 + }, + { + "epoch": 0.71, + "grad_norm": 1.3853601763187073, + "learning_rate": 2.042257637088871e-06, + "loss": 0.2773, + "step": 24494 + }, + { + "epoch": 0.71, + "grad_norm": 1.5899437964498142, + "learning_rate": 2.0418789392172113e-06, + "loss": 0.2805, + "step": 24495 + }, + { + "epoch": 0.71, + "grad_norm": 1.512410211939626, + "learning_rate": 2.0415002674510045e-06, + "loss": 0.2744, + "step": 24496 + }, + { + "epoch": 0.71, + "grad_norm": 1.3733711286325643, + "learning_rate": 2.041121621793588e-06, + "loss": 0.2738, + "step": 24497 + }, + { + "epoch": 0.71, + "grad_norm": 0.9211782678357973, + "learning_rate": 2.040743002248307e-06, + "loss": 0.5383, + "step": 24498 + }, + { + "epoch": 0.71, + "grad_norm": 1.44064486570224, + "learning_rate": 2.0403644088185015e-06, + "loss": 0.2958, + "step": 24499 + }, + { + "epoch": 0.71, + "grad_norm": 1.395968708417612, + "learning_rate": 2.039985841507513e-06, + "loss": 0.2838, + "step": 24500 + }, + { + "epoch": 0.71, + "grad_norm": 1.396168632243311, + "learning_rate": 2.0396073003186823e-06, + "loss": 0.2722, + "step": 24501 + }, + { + "epoch": 0.71, + "grad_norm": 1.3364006211434685, + "learning_rate": 2.03922878525535e-06, + "loss": 0.3097, + "step": 24502 + }, + { + "epoch": 0.71, + "grad_norm": 1.8779381673196396, + "learning_rate": 2.0388502963208567e-06, + "loss": 0.2922, + "step": 24503 + }, + { + "epoch": 0.71, + "grad_norm": 1.2844185268959123, + "learning_rate": 2.0384718335185437e-06, + "loss": 0.2969, + "step": 24504 + }, + { + "epoch": 0.71, + "grad_norm": 1.4714981808978158, + "learning_rate": 2.0380933968517486e-06, + "loss": 0.2615, + "step": 24505 + }, + { + "epoch": 0.71, + "grad_norm": 1.5344801248305173, + "learning_rate": 2.0377149863238117e-06, + "loss": 0.2827, + "step": 24506 + }, + { + "epoch": 0.71, + "grad_norm": 1.330679654170139, + "learning_rate": 2.0373366019380725e-06, + "loss": 0.3044, + "step": 24507 + }, + { + "epoch": 0.71, + "grad_norm": 1.5107252628063923, + "learning_rate": 2.0369582436978713e-06, + "loss": 0.2845, + "step": 24508 + }, + { + "epoch": 0.71, + "grad_norm": 1.5265675014291396, + "learning_rate": 2.036579911606546e-06, + "loss": 0.2814, + "step": 24509 + }, + { + "epoch": 0.71, + "grad_norm": 1.3237209922425337, + "learning_rate": 2.036201605667436e-06, + "loss": 0.2661, + "step": 24510 + }, + { + "epoch": 0.71, + "grad_norm": 1.2957104187440118, + "learning_rate": 2.03582332588388e-06, + "loss": 0.2703, + "step": 24511 + }, + { + "epoch": 0.71, + "grad_norm": 0.9358027732816911, + "learning_rate": 2.0354450722592154e-06, + "loss": 0.6071, + "step": 24512 + }, + { + "epoch": 0.71, + "grad_norm": 1.6329265961560222, + "learning_rate": 2.035066844796783e-06, + "loss": 0.2653, + "step": 24513 + }, + { + "epoch": 0.71, + "grad_norm": 1.4428549079494695, + "learning_rate": 2.0346886434999173e-06, + "loss": 0.294, + "step": 24514 + }, + { + "epoch": 0.71, + "grad_norm": 8.224947631487305, + "learning_rate": 2.0343104683719584e-06, + "loss": 0.3134, + "step": 24515 + }, + { + "epoch": 0.71, + "grad_norm": 1.3747830880190695, + "learning_rate": 2.0339323194162406e-06, + "loss": 0.2793, + "step": 24516 + }, + { + "epoch": 0.71, + "grad_norm": 3.5028649626497184, + "learning_rate": 2.0335541966361037e-06, + "loss": 0.2886, + "step": 24517 + }, + { + "epoch": 0.71, + "grad_norm": 1.201453873753882, + "learning_rate": 2.0331761000348838e-06, + "loss": 0.2716, + "step": 24518 + }, + { + "epoch": 0.71, + "grad_norm": 1.9812515217705422, + "learning_rate": 2.0327980296159172e-06, + "loss": 0.3007, + "step": 24519 + }, + { + "epoch": 0.71, + "grad_norm": 3.5990711991033697, + "learning_rate": 2.032419985382541e-06, + "loss": 0.3008, + "step": 24520 + }, + { + "epoch": 0.71, + "grad_norm": 1.192682047583541, + "learning_rate": 2.032041967338092e-06, + "loss": 0.2749, + "step": 24521 + }, + { + "epoch": 0.71, + "grad_norm": 1.263967936010195, + "learning_rate": 2.0316639754859045e-06, + "loss": 0.2827, + "step": 24522 + }, + { + "epoch": 0.71, + "grad_norm": 1.3284054081383558, + "learning_rate": 2.031286009829318e-06, + "loss": 0.2678, + "step": 24523 + }, + { + "epoch": 0.71, + "grad_norm": 1.6417882809472424, + "learning_rate": 2.0309080703716634e-06, + "loss": 0.3013, + "step": 24524 + }, + { + "epoch": 0.71, + "grad_norm": 1.1735025731723328, + "learning_rate": 2.0305301571162776e-06, + "loss": 0.2757, + "step": 24525 + }, + { + "epoch": 0.71, + "grad_norm": 1.3662508571300638, + "learning_rate": 2.0301522700664967e-06, + "loss": 0.2725, + "step": 24526 + }, + { + "epoch": 0.71, + "grad_norm": 1.3268243335818841, + "learning_rate": 2.029774409225655e-06, + "loss": 0.2653, + "step": 24527 + }, + { + "epoch": 0.71, + "grad_norm": 1.4482547140692548, + "learning_rate": 2.0293965745970867e-06, + "loss": 0.2824, + "step": 24528 + }, + { + "epoch": 0.71, + "grad_norm": 1.3164734320063922, + "learning_rate": 2.0290187661841264e-06, + "loss": 0.2936, + "step": 24529 + }, + { + "epoch": 0.71, + "grad_norm": 1.514375935779233, + "learning_rate": 2.0286409839901087e-06, + "loss": 0.2659, + "step": 24530 + }, + { + "epoch": 0.71, + "grad_norm": 1.2985343229199304, + "learning_rate": 2.028263228018367e-06, + "loss": 0.2728, + "step": 24531 + }, + { + "epoch": 0.71, + "grad_norm": 1.306805560574469, + "learning_rate": 2.027885498272238e-06, + "loss": 0.2635, + "step": 24532 + }, + { + "epoch": 0.71, + "grad_norm": 1.2695516851411552, + "learning_rate": 2.0275077947550496e-06, + "loss": 0.2607, + "step": 24533 + }, + { + "epoch": 0.71, + "grad_norm": 1.2645975148136523, + "learning_rate": 2.0271301174701384e-06, + "loss": 0.2677, + "step": 24534 + }, + { + "epoch": 0.71, + "grad_norm": 1.1973328135706904, + "learning_rate": 2.0267524664208373e-06, + "loss": 0.2732, + "step": 24535 + }, + { + "epoch": 0.71, + "grad_norm": 2.9825069493886835, + "learning_rate": 2.0263748416104777e-06, + "loss": 0.2907, + "step": 24536 + }, + { + "epoch": 0.71, + "grad_norm": 1.4730147614921323, + "learning_rate": 2.0259972430423936e-06, + "loss": 0.2881, + "step": 24537 + }, + { + "epoch": 0.71, + "grad_norm": 1.2829994370152717, + "learning_rate": 2.025619670719917e-06, + "loss": 0.2782, + "step": 24538 + }, + { + "epoch": 0.71, + "grad_norm": 1.2690062625729346, + "learning_rate": 2.0252421246463793e-06, + "loss": 0.2703, + "step": 24539 + }, + { + "epoch": 0.71, + "grad_norm": 1.4793376634037252, + "learning_rate": 2.0248646048251153e-06, + "loss": 0.2955, + "step": 24540 + }, + { + "epoch": 0.71, + "grad_norm": 1.4102733028918613, + "learning_rate": 2.0244871112594523e-06, + "loss": 0.2686, + "step": 24541 + }, + { + "epoch": 0.71, + "grad_norm": 1.2819631391493458, + "learning_rate": 2.024109643952723e-06, + "loss": 0.2841, + "step": 24542 + }, + { + "epoch": 0.71, + "grad_norm": 1.2713322603832664, + "learning_rate": 2.023732202908261e-06, + "loss": 0.2837, + "step": 24543 + }, + { + "epoch": 0.71, + "grad_norm": 1.6328099757580588, + "learning_rate": 2.0233547881293943e-06, + "loss": 0.2898, + "step": 24544 + }, + { + "epoch": 0.71, + "grad_norm": 1.5227349552500038, + "learning_rate": 2.0229773996194536e-06, + "loss": 0.2889, + "step": 24545 + }, + { + "epoch": 0.71, + "grad_norm": 1.341086824766482, + "learning_rate": 2.0226000373817704e-06, + "loss": 0.2855, + "step": 24546 + }, + { + "epoch": 0.71, + "grad_norm": 1.3772009587260898, + "learning_rate": 2.0222227014196756e-06, + "loss": 0.28, + "step": 24547 + }, + { + "epoch": 0.71, + "grad_norm": 1.420225645099739, + "learning_rate": 2.0218453917364984e-06, + "loss": 0.2736, + "step": 24548 + }, + { + "epoch": 0.71, + "grad_norm": 1.2603016189357872, + "learning_rate": 2.0214681083355686e-06, + "loss": 0.281, + "step": 24549 + }, + { + "epoch": 0.71, + "grad_norm": 1.3580734009958626, + "learning_rate": 2.021090851220217e-06, + "loss": 0.2878, + "step": 24550 + }, + { + "epoch": 0.71, + "grad_norm": 1.5184834719622489, + "learning_rate": 2.0207136203937706e-06, + "loss": 0.2958, + "step": 24551 + }, + { + "epoch": 0.71, + "grad_norm": 1.3216858770410551, + "learning_rate": 2.020336415859559e-06, + "loss": 0.2794, + "step": 24552 + }, + { + "epoch": 0.71, + "grad_norm": 1.5657200581158996, + "learning_rate": 2.019959237620912e-06, + "loss": 0.2791, + "step": 24553 + }, + { + "epoch": 0.71, + "grad_norm": 1.4235438479995222, + "learning_rate": 2.0195820856811575e-06, + "loss": 0.2726, + "step": 24554 + }, + { + "epoch": 0.71, + "grad_norm": 1.3169683972540067, + "learning_rate": 2.0192049600436246e-06, + "loss": 0.2981, + "step": 24555 + }, + { + "epoch": 0.71, + "grad_norm": 1.277898267130132, + "learning_rate": 2.0188278607116406e-06, + "loss": 0.2801, + "step": 24556 + }, + { + "epoch": 0.71, + "grad_norm": 1.3965509130089995, + "learning_rate": 2.018450787688534e-06, + "loss": 0.2939, + "step": 24557 + }, + { + "epoch": 0.71, + "grad_norm": 1.325033462950293, + "learning_rate": 2.0180737409776317e-06, + "loss": 0.2844, + "step": 24558 + }, + { + "epoch": 0.71, + "grad_norm": 0.9069411964726714, + "learning_rate": 2.0176967205822646e-06, + "loss": 0.5358, + "step": 24559 + }, + { + "epoch": 0.71, + "grad_norm": 1.307363208026138, + "learning_rate": 2.0173197265057544e-06, + "loss": 0.2797, + "step": 24560 + }, + { + "epoch": 0.71, + "grad_norm": 1.6082874817190385, + "learning_rate": 2.0169427587514313e-06, + "loss": 0.2786, + "step": 24561 + }, + { + "epoch": 0.71, + "grad_norm": 1.3268894648115086, + "learning_rate": 2.0165658173226203e-06, + "loss": 0.3091, + "step": 24562 + }, + { + "epoch": 0.71, + "grad_norm": 0.9309038046444933, + "learning_rate": 2.01618890222265e-06, + "loss": 0.5679, + "step": 24563 + }, + { + "epoch": 0.71, + "grad_norm": 1.2856306968753417, + "learning_rate": 2.0158120134548458e-06, + "loss": 0.2962, + "step": 24564 + }, + { + "epoch": 0.71, + "grad_norm": 1.3801135885029012, + "learning_rate": 2.0154351510225336e-06, + "loss": 0.269, + "step": 24565 + }, + { + "epoch": 0.71, + "grad_norm": 1.3787767950515712, + "learning_rate": 2.015058314929039e-06, + "loss": 0.2684, + "step": 24566 + }, + { + "epoch": 0.71, + "grad_norm": 1.3111143721107146, + "learning_rate": 2.014681505177688e-06, + "loss": 0.2555, + "step": 24567 + }, + { + "epoch": 0.71, + "grad_norm": 1.016986070806363, + "learning_rate": 2.0143047217718075e-06, + "loss": 0.5626, + "step": 24568 + }, + { + "epoch": 0.71, + "grad_norm": 1.307042598073424, + "learning_rate": 2.0139279647147197e-06, + "loss": 0.2946, + "step": 24569 + }, + { + "epoch": 0.71, + "grad_norm": 1.387018564853579, + "learning_rate": 2.01355123400975e-06, + "loss": 0.2655, + "step": 24570 + }, + { + "epoch": 0.71, + "grad_norm": 1.3730919877425216, + "learning_rate": 2.013174529660226e-06, + "loss": 0.2822, + "step": 24571 + }, + { + "epoch": 0.71, + "grad_norm": 1.4468316474986684, + "learning_rate": 2.012797851669468e-06, + "loss": 0.2759, + "step": 24572 + }, + { + "epoch": 0.71, + "grad_norm": 0.9757527292408943, + "learning_rate": 2.0124212000408022e-06, + "loss": 0.5944, + "step": 24573 + }, + { + "epoch": 0.71, + "grad_norm": 1.2163309471894577, + "learning_rate": 2.0120445747775526e-06, + "loss": 0.2825, + "step": 24574 + }, + { + "epoch": 0.71, + "grad_norm": 1.4794595094378948, + "learning_rate": 2.0116679758830426e-06, + "loss": 0.2809, + "step": 24575 + }, + { + "epoch": 0.71, + "grad_norm": 1.3357132209022449, + "learning_rate": 2.011291403360596e-06, + "loss": 0.2663, + "step": 24576 + }, + { + "epoch": 0.71, + "grad_norm": 2.216198771610679, + "learning_rate": 2.010914857213536e-06, + "loss": 0.2692, + "step": 24577 + }, + { + "epoch": 0.71, + "grad_norm": 1.4820649205727727, + "learning_rate": 2.0105383374451867e-06, + "loss": 0.2841, + "step": 24578 + }, + { + "epoch": 0.71, + "grad_norm": 1.35917736009067, + "learning_rate": 2.0101618440588683e-06, + "loss": 0.2879, + "step": 24579 + }, + { + "epoch": 0.71, + "grad_norm": 1.898544922374628, + "learning_rate": 2.0097853770579045e-06, + "loss": 0.2883, + "step": 24580 + }, + { + "epoch": 0.71, + "grad_norm": 1.3260537712168747, + "learning_rate": 2.009408936445619e-06, + "loss": 0.2621, + "step": 24581 + }, + { + "epoch": 0.71, + "grad_norm": 1.1536889766911786, + "learning_rate": 2.0090325222253317e-06, + "loss": 0.2612, + "step": 24582 + }, + { + "epoch": 0.71, + "grad_norm": 1.5852946736024962, + "learning_rate": 2.0086561344003656e-06, + "loss": 0.2843, + "step": 24583 + }, + { + "epoch": 0.71, + "grad_norm": 0.9862538101886347, + "learning_rate": 2.0082797729740428e-06, + "loss": 0.5847, + "step": 24584 + }, + { + "epoch": 0.71, + "grad_norm": 1.2154526738730065, + "learning_rate": 2.0079034379496844e-06, + "loss": 0.269, + "step": 24585 + }, + { + "epoch": 0.71, + "grad_norm": 1.2070237341548637, + "learning_rate": 2.0075271293306125e-06, + "loss": 0.3096, + "step": 24586 + }, + { + "epoch": 0.71, + "grad_norm": 1.5875922662673265, + "learning_rate": 2.007150847120145e-06, + "loss": 0.2644, + "step": 24587 + }, + { + "epoch": 0.71, + "grad_norm": 1.3007422859354438, + "learning_rate": 2.006774591321605e-06, + "loss": 0.2779, + "step": 24588 + }, + { + "epoch": 0.71, + "grad_norm": 1.4781964181554983, + "learning_rate": 2.0063983619383125e-06, + "loss": 0.2806, + "step": 24589 + }, + { + "epoch": 0.71, + "grad_norm": 1.611516853120722, + "learning_rate": 2.0060221589735874e-06, + "loss": 0.3018, + "step": 24590 + }, + { + "epoch": 0.71, + "grad_norm": 1.3190734144168303, + "learning_rate": 2.00564598243075e-06, + "loss": 0.287, + "step": 24591 + }, + { + "epoch": 0.71, + "grad_norm": 1.3415332383006442, + "learning_rate": 2.0052698323131196e-06, + "loss": 0.2672, + "step": 24592 + }, + { + "epoch": 0.71, + "grad_norm": 1.3326350970841716, + "learning_rate": 2.0048937086240173e-06, + "loss": 0.3074, + "step": 24593 + }, + { + "epoch": 0.71, + "grad_norm": 1.0303046459285434, + "learning_rate": 2.0045176113667603e-06, + "loss": 0.5899, + "step": 24594 + }, + { + "epoch": 0.71, + "grad_norm": 1.2909060785758562, + "learning_rate": 2.004141540544671e-06, + "loss": 0.282, + "step": 24595 + }, + { + "epoch": 0.71, + "grad_norm": 1.3700578835378552, + "learning_rate": 2.0037654961610633e-06, + "loss": 0.2818, + "step": 24596 + }, + { + "epoch": 0.71, + "grad_norm": 1.362219843224218, + "learning_rate": 2.0033894782192586e-06, + "loss": 0.3126, + "step": 24597 + }, + { + "epoch": 0.71, + "grad_norm": 1.364649868826822, + "learning_rate": 2.003013486722575e-06, + "loss": 0.2726, + "step": 24598 + }, + { + "epoch": 0.71, + "grad_norm": 1.4397047009059798, + "learning_rate": 2.002637521674333e-06, + "loss": 0.2773, + "step": 24599 + }, + { + "epoch": 0.71, + "grad_norm": 1.3950151775261763, + "learning_rate": 2.0022615830778452e-06, + "loss": 0.2717, + "step": 24600 + }, + { + "epoch": 0.71, + "grad_norm": 1.3720420745504096, + "learning_rate": 2.001885670936433e-06, + "loss": 0.2712, + "step": 24601 + }, + { + "epoch": 0.71, + "grad_norm": 1.2636202608424698, + "learning_rate": 2.001509785253413e-06, + "loss": 0.2735, + "step": 24602 + }, + { + "epoch": 0.71, + "grad_norm": 1.31815718988467, + "learning_rate": 2.0011339260321024e-06, + "loss": 0.2823, + "step": 24603 + }, + { + "epoch": 0.71, + "grad_norm": 1.6673379706726845, + "learning_rate": 2.000758093275818e-06, + "loss": 0.3195, + "step": 24604 + }, + { + "epoch": 0.71, + "grad_norm": 2.4113622715159644, + "learning_rate": 2.000382286987878e-06, + "loss": 0.2916, + "step": 24605 + }, + { + "epoch": 0.71, + "grad_norm": 1.2926203678993173, + "learning_rate": 2.000006507171597e-06, + "loss": 0.2805, + "step": 24606 + }, + { + "epoch": 0.71, + "grad_norm": 1.7023720072202397, + "learning_rate": 1.9996307538302904e-06, + "loss": 0.289, + "step": 24607 + }, + { + "epoch": 0.71, + "grad_norm": 1.3305601580302326, + "learning_rate": 1.9992550269672767e-06, + "loss": 0.2729, + "step": 24608 + }, + { + "epoch": 0.71, + "grad_norm": 1.4204692430185204, + "learning_rate": 1.998879326585871e-06, + "loss": 0.2705, + "step": 24609 + }, + { + "epoch": 0.71, + "grad_norm": 1.2511928307264182, + "learning_rate": 1.9985036526893875e-06, + "loss": 0.2839, + "step": 24610 + }, + { + "epoch": 0.71, + "grad_norm": 1.2454899705352775, + "learning_rate": 1.998128005281143e-06, + "loss": 0.2952, + "step": 24611 + }, + { + "epoch": 0.71, + "grad_norm": 1.4919216914587363, + "learning_rate": 1.9977523843644527e-06, + "loss": 0.32, + "step": 24612 + }, + { + "epoch": 0.71, + "grad_norm": 1.5013320882672971, + "learning_rate": 1.99737678994263e-06, + "loss": 0.2807, + "step": 24613 + }, + { + "epoch": 0.71, + "grad_norm": 1.3261172905317535, + "learning_rate": 1.9970012220189927e-06, + "loss": 0.2858, + "step": 24614 + }, + { + "epoch": 0.71, + "grad_norm": 1.7036225655929964, + "learning_rate": 1.996625680596851e-06, + "loss": 0.2712, + "step": 24615 + }, + { + "epoch": 0.71, + "grad_norm": 1.272382749873989, + "learning_rate": 1.996250165679521e-06, + "loss": 0.302, + "step": 24616 + }, + { + "epoch": 0.71, + "grad_norm": 1.2772120744074933, + "learning_rate": 1.9958746772703168e-06, + "loss": 0.2905, + "step": 24617 + }, + { + "epoch": 0.71, + "grad_norm": 1.3311786194712618, + "learning_rate": 1.9954992153725518e-06, + "loss": 0.2754, + "step": 24618 + }, + { + "epoch": 0.71, + "grad_norm": 1.5223767794833991, + "learning_rate": 1.9951237799895396e-06, + "loss": 0.2664, + "step": 24619 + }, + { + "epoch": 0.71, + "grad_norm": 1.3885364525617492, + "learning_rate": 1.994748371124593e-06, + "loss": 0.2663, + "step": 24620 + }, + { + "epoch": 0.71, + "grad_norm": 1.46279231103984, + "learning_rate": 1.994372988781026e-06, + "loss": 0.2993, + "step": 24621 + }, + { + "epoch": 0.71, + "grad_norm": 1.2289749394872043, + "learning_rate": 1.9939976329621517e-06, + "loss": 0.2439, + "step": 24622 + }, + { + "epoch": 0.71, + "grad_norm": 1.3059939462382024, + "learning_rate": 1.9936223036712804e-06, + "loss": 0.2799, + "step": 24623 + }, + { + "epoch": 0.71, + "grad_norm": 1.2444539389513969, + "learning_rate": 1.9932470009117256e-06, + "loss": 0.2691, + "step": 24624 + }, + { + "epoch": 0.71, + "grad_norm": 1.497717772682582, + "learning_rate": 1.9928717246867997e-06, + "loss": 0.2663, + "step": 24625 + }, + { + "epoch": 0.71, + "grad_norm": 1.3832768771417312, + "learning_rate": 1.9924964749998132e-06, + "loss": 0.3069, + "step": 24626 + }, + { + "epoch": 0.71, + "grad_norm": 1.259618414593918, + "learning_rate": 1.9921212518540795e-06, + "loss": 0.2708, + "step": 24627 + }, + { + "epoch": 0.71, + "grad_norm": 1.8589711519352579, + "learning_rate": 1.9917460552529107e-06, + "loss": 0.2809, + "step": 24628 + }, + { + "epoch": 0.71, + "grad_norm": 1.4471628487791965, + "learning_rate": 1.9913708851996146e-06, + "loss": 0.2789, + "step": 24629 + }, + { + "epoch": 0.71, + "grad_norm": 1.4331643671494998, + "learning_rate": 1.9909957416975035e-06, + "loss": 0.2933, + "step": 24630 + }, + { + "epoch": 0.71, + "grad_norm": 1.2942787124329371, + "learning_rate": 1.9906206247498888e-06, + "loss": 0.295, + "step": 24631 + }, + { + "epoch": 0.71, + "grad_norm": 1.3024762642831045, + "learning_rate": 1.9902455343600803e-06, + "loss": 0.2767, + "step": 24632 + }, + { + "epoch": 0.71, + "grad_norm": 1.9320519436404635, + "learning_rate": 1.98987047053139e-06, + "loss": 0.3069, + "step": 24633 + }, + { + "epoch": 0.71, + "grad_norm": 1.2326172549798453, + "learning_rate": 1.9894954332671246e-06, + "loss": 0.2736, + "step": 24634 + }, + { + "epoch": 0.71, + "grad_norm": 1.2866847120037788, + "learning_rate": 1.989120422570595e-06, + "loss": 0.3148, + "step": 24635 + }, + { + "epoch": 0.71, + "grad_norm": 1.7136215651375042, + "learning_rate": 1.9887454384451116e-06, + "loss": 0.2523, + "step": 24636 + }, + { + "epoch": 0.71, + "grad_norm": 1.3394148396722128, + "learning_rate": 1.988370480893983e-06, + "loss": 0.2768, + "step": 24637 + }, + { + "epoch": 0.71, + "grad_norm": 1.367344885116914, + "learning_rate": 1.987995549920518e-06, + "loss": 0.3148, + "step": 24638 + }, + { + "epoch": 0.71, + "grad_norm": 1.1762815672335583, + "learning_rate": 1.987620645528026e-06, + "loss": 0.281, + "step": 24639 + }, + { + "epoch": 0.71, + "grad_norm": 1.3659148427596841, + "learning_rate": 1.987245767719815e-06, + "loss": 0.2982, + "step": 24640 + }, + { + "epoch": 0.71, + "grad_norm": 1.575505208769086, + "learning_rate": 1.9868709164991954e-06, + "loss": 0.285, + "step": 24641 + }, + { + "epoch": 0.71, + "grad_norm": 1.3319512059730814, + "learning_rate": 1.9864960918694716e-06, + "loss": 0.2601, + "step": 24642 + }, + { + "epoch": 0.71, + "grad_norm": 0.9140214617799055, + "learning_rate": 1.986121293833953e-06, + "loss": 0.5698, + "step": 24643 + }, + { + "epoch": 0.71, + "grad_norm": 1.25212680889589, + "learning_rate": 1.9857465223959475e-06, + "loss": 0.2592, + "step": 24644 + }, + { + "epoch": 0.71, + "grad_norm": 1.4100475256204994, + "learning_rate": 1.9853717775587627e-06, + "loss": 0.2745, + "step": 24645 + }, + { + "epoch": 0.71, + "grad_norm": 1.4419733035573261, + "learning_rate": 1.9849970593257054e-06, + "loss": 0.2835, + "step": 24646 + }, + { + "epoch": 0.71, + "grad_norm": 1.629451949805566, + "learning_rate": 1.9846223677000824e-06, + "loss": 0.2813, + "step": 24647 + }, + { + "epoch": 0.71, + "grad_norm": 1.3733469980189081, + "learning_rate": 1.9842477026852e-06, + "loss": 0.2754, + "step": 24648 + }, + { + "epoch": 0.71, + "grad_norm": 1.366282172516107, + "learning_rate": 1.9838730642843658e-06, + "loss": 0.2793, + "step": 24649 + }, + { + "epoch": 0.71, + "grad_norm": 1.2907200698180123, + "learning_rate": 1.9834984525008866e-06, + "loss": 0.2824, + "step": 24650 + }, + { + "epoch": 0.71, + "grad_norm": 3.8367483933411295, + "learning_rate": 1.9831238673380652e-06, + "loss": 0.2816, + "step": 24651 + }, + { + "epoch": 0.72, + "grad_norm": 1.6813330615585846, + "learning_rate": 1.9827493087992093e-06, + "loss": 0.2855, + "step": 24652 + }, + { + "epoch": 0.72, + "grad_norm": 1.4319285190862292, + "learning_rate": 1.9823747768876243e-06, + "loss": 0.2781, + "step": 24653 + }, + { + "epoch": 0.72, + "grad_norm": 1.2113213121027404, + "learning_rate": 1.9820002716066154e-06, + "loss": 0.2773, + "step": 24654 + }, + { + "epoch": 0.72, + "grad_norm": 1.2477210971831372, + "learning_rate": 1.9816257929594873e-06, + "loss": 0.2856, + "step": 24655 + }, + { + "epoch": 0.72, + "grad_norm": 1.2932541949466412, + "learning_rate": 1.981251340949547e-06, + "loss": 0.2821, + "step": 24656 + }, + { + "epoch": 0.72, + "grad_norm": 1.3322136852674216, + "learning_rate": 1.9808769155800954e-06, + "loss": 0.293, + "step": 24657 + }, + { + "epoch": 0.72, + "grad_norm": 1.3591912807523125, + "learning_rate": 1.9805025168544383e-06, + "loss": 0.2832, + "step": 24658 + }, + { + "epoch": 0.72, + "grad_norm": 1.2208514228884104, + "learning_rate": 1.9801281447758804e-06, + "loss": 0.2892, + "step": 24659 + }, + { + "epoch": 0.72, + "grad_norm": 1.349828004387328, + "learning_rate": 1.979753799347727e-06, + "loss": 0.2941, + "step": 24660 + }, + { + "epoch": 0.72, + "grad_norm": 1.321159294889905, + "learning_rate": 1.9793794805732776e-06, + "loss": 0.2747, + "step": 24661 + }, + { + "epoch": 0.72, + "grad_norm": 1.2422523545765016, + "learning_rate": 1.9790051884558378e-06, + "loss": 0.2905, + "step": 24662 + }, + { + "epoch": 0.72, + "grad_norm": 1.6501824324978076, + "learning_rate": 1.9786309229987104e-06, + "loss": 0.2766, + "step": 24663 + }, + { + "epoch": 0.72, + "grad_norm": 1.3479688093103868, + "learning_rate": 1.9782566842051994e-06, + "loss": 0.3035, + "step": 24664 + }, + { + "epoch": 0.72, + "grad_norm": 1.6717260759967183, + "learning_rate": 1.977882472078606e-06, + "loss": 0.2963, + "step": 24665 + }, + { + "epoch": 0.72, + "grad_norm": 1.6895836060518916, + "learning_rate": 1.977508286622234e-06, + "loss": 0.2763, + "step": 24666 + }, + { + "epoch": 0.72, + "grad_norm": 1.2999063523826173, + "learning_rate": 1.977134127839384e-06, + "loss": 0.2777, + "step": 24667 + }, + { + "epoch": 0.72, + "grad_norm": 1.3260534321324309, + "learning_rate": 1.9767599957333595e-06, + "loss": 0.2723, + "step": 24668 + }, + { + "epoch": 0.72, + "grad_norm": 1.3747701969707944, + "learning_rate": 1.9763858903074633e-06, + "loss": 0.3025, + "step": 24669 + }, + { + "epoch": 0.72, + "grad_norm": 1.3326233824671578, + "learning_rate": 1.9760118115649934e-06, + "loss": 0.2744, + "step": 24670 + }, + { + "epoch": 0.72, + "grad_norm": 1.3497505082692878, + "learning_rate": 1.9756377595092524e-06, + "loss": 0.3013, + "step": 24671 + }, + { + "epoch": 0.72, + "grad_norm": 1.2491315944021542, + "learning_rate": 1.975263734143542e-06, + "loss": 0.2662, + "step": 24672 + }, + { + "epoch": 0.72, + "grad_norm": 1.25088622281622, + "learning_rate": 1.9748897354711634e-06, + "loss": 0.2802, + "step": 24673 + }, + { + "epoch": 0.72, + "grad_norm": 1.4681084408327985, + "learning_rate": 1.9745157634954156e-06, + "loss": 0.3091, + "step": 24674 + }, + { + "epoch": 0.72, + "grad_norm": 1.3655311307917344, + "learning_rate": 1.9741418182196e-06, + "loss": 0.2745, + "step": 24675 + }, + { + "epoch": 0.72, + "grad_norm": 1.4002928828271006, + "learning_rate": 1.973767899647016e-06, + "loss": 0.2833, + "step": 24676 + }, + { + "epoch": 0.72, + "grad_norm": 1.1977704567564438, + "learning_rate": 1.973394007780966e-06, + "loss": 0.2828, + "step": 24677 + }, + { + "epoch": 0.72, + "grad_norm": 1.344319680849655, + "learning_rate": 1.9730201426247457e-06, + "loss": 0.2808, + "step": 24678 + }, + { + "epoch": 0.72, + "grad_norm": 1.3215792571159328, + "learning_rate": 1.972646304181656e-06, + "loss": 0.2787, + "step": 24679 + }, + { + "epoch": 0.72, + "grad_norm": 1.3076876169712046, + "learning_rate": 1.9722724924549968e-06, + "loss": 0.2712, + "step": 24680 + }, + { + "epoch": 0.72, + "grad_norm": 1.401878144080537, + "learning_rate": 1.9718987074480662e-06, + "loss": 0.2816, + "step": 24681 + }, + { + "epoch": 0.72, + "grad_norm": 1.3537539986297429, + "learning_rate": 1.9715249491641625e-06, + "loss": 0.2964, + "step": 24682 + }, + { + "epoch": 0.72, + "grad_norm": 1.308973967669609, + "learning_rate": 1.9711512176065855e-06, + "loss": 0.2813, + "step": 24683 + }, + { + "epoch": 0.72, + "grad_norm": 1.2173225321397558, + "learning_rate": 1.9707775127786338e-06, + "loss": 0.2698, + "step": 24684 + }, + { + "epoch": 0.72, + "grad_norm": 1.4528251454389325, + "learning_rate": 1.970403834683603e-06, + "loss": 0.2583, + "step": 24685 + }, + { + "epoch": 0.72, + "grad_norm": 1.2740368901188637, + "learning_rate": 1.9700301833247915e-06, + "loss": 0.2694, + "step": 24686 + }, + { + "epoch": 0.72, + "grad_norm": 1.3845616321568925, + "learning_rate": 1.9696565587054973e-06, + "loss": 0.2928, + "step": 24687 + }, + { + "epoch": 0.72, + "grad_norm": 1.3164036930542906, + "learning_rate": 1.9692829608290195e-06, + "loss": 0.2894, + "step": 24688 + }, + { + "epoch": 0.72, + "grad_norm": 1.3222763128789634, + "learning_rate": 1.968909389698651e-06, + "loss": 0.2977, + "step": 24689 + }, + { + "epoch": 0.72, + "grad_norm": 1.3761918624392722, + "learning_rate": 1.9685358453176913e-06, + "loss": 0.3074, + "step": 24690 + }, + { + "epoch": 0.72, + "grad_norm": 1.2261460918826985, + "learning_rate": 1.9681623276894364e-06, + "loss": 0.2617, + "step": 24691 + }, + { + "epoch": 0.72, + "grad_norm": 1.384775152303766, + "learning_rate": 1.967788836817182e-06, + "loss": 0.3099, + "step": 24692 + }, + { + "epoch": 0.72, + "grad_norm": 1.3573461969380698, + "learning_rate": 1.9674153727042256e-06, + "loss": 0.2851, + "step": 24693 + }, + { + "epoch": 0.72, + "grad_norm": 1.2943014579926142, + "learning_rate": 1.9670419353538617e-06, + "loss": 0.2705, + "step": 24694 + }, + { + "epoch": 0.72, + "grad_norm": 1.213905147105852, + "learning_rate": 1.966668524769386e-06, + "loss": 0.2682, + "step": 24695 + }, + { + "epoch": 0.72, + "grad_norm": 1.6685248206407821, + "learning_rate": 1.9662951409540965e-06, + "loss": 0.2877, + "step": 24696 + }, + { + "epoch": 0.72, + "grad_norm": 1.234508384046244, + "learning_rate": 1.9659217839112842e-06, + "loss": 0.3013, + "step": 24697 + }, + { + "epoch": 0.72, + "grad_norm": 1.3495132150818994, + "learning_rate": 1.965548453644246e-06, + "loss": 0.2958, + "step": 24698 + }, + { + "epoch": 0.72, + "grad_norm": 1.2690400822736747, + "learning_rate": 1.965175150156276e-06, + "loss": 0.2932, + "step": 24699 + }, + { + "epoch": 0.72, + "grad_norm": 0.9883104412736063, + "learning_rate": 1.964801873450669e-06, + "loss": 0.6021, + "step": 24700 + }, + { + "epoch": 0.72, + "grad_norm": 1.849942520391732, + "learning_rate": 1.964428623530719e-06, + "loss": 0.3053, + "step": 24701 + }, + { + "epoch": 0.72, + "grad_norm": 1.2621818010229535, + "learning_rate": 1.9640554003997203e-06, + "loss": 0.2773, + "step": 24702 + }, + { + "epoch": 0.72, + "grad_norm": 1.3343822635701261, + "learning_rate": 1.9636822040609666e-06, + "loss": 0.2604, + "step": 24703 + }, + { + "epoch": 0.72, + "grad_norm": 1.4792107809712143, + "learning_rate": 1.963309034517751e-06, + "loss": 0.2805, + "step": 24704 + }, + { + "epoch": 0.72, + "grad_norm": 1.5218344101588, + "learning_rate": 1.9629358917733683e-06, + "loss": 0.2928, + "step": 24705 + }, + { + "epoch": 0.72, + "grad_norm": 1.3018248126688632, + "learning_rate": 1.962562775831108e-06, + "loss": 0.2849, + "step": 24706 + }, + { + "epoch": 0.72, + "grad_norm": 1.2123945480965674, + "learning_rate": 1.962189686694266e-06, + "loss": 0.2625, + "step": 24707 + }, + { + "epoch": 0.72, + "grad_norm": 1.6588028778290633, + "learning_rate": 1.961816624366133e-06, + "loss": 0.2954, + "step": 24708 + }, + { + "epoch": 0.72, + "grad_norm": 1.642614168499552, + "learning_rate": 1.9614435888500023e-06, + "loss": 0.298, + "step": 24709 + }, + { + "epoch": 0.72, + "grad_norm": 1.4487180793656822, + "learning_rate": 1.961070580149165e-06, + "loss": 0.3016, + "step": 24710 + }, + { + "epoch": 0.72, + "grad_norm": 1.251739473627745, + "learning_rate": 1.960697598266914e-06, + "loss": 0.2611, + "step": 24711 + }, + { + "epoch": 0.72, + "grad_norm": 1.4285226854530615, + "learning_rate": 1.9603246432065424e-06, + "loss": 0.2815, + "step": 24712 + }, + { + "epoch": 0.72, + "grad_norm": 1.2813444828112364, + "learning_rate": 1.959951714971338e-06, + "loss": 0.2748, + "step": 24713 + }, + { + "epoch": 0.72, + "grad_norm": 1.358024810811, + "learning_rate": 1.9595788135645927e-06, + "loss": 0.269, + "step": 24714 + }, + { + "epoch": 0.72, + "grad_norm": 1.3796744826136083, + "learning_rate": 1.9592059389896e-06, + "loss": 0.2928, + "step": 24715 + }, + { + "epoch": 0.72, + "grad_norm": 1.4782986990645843, + "learning_rate": 1.9588330912496474e-06, + "loss": 0.2646, + "step": 24716 + }, + { + "epoch": 0.72, + "grad_norm": 1.3263778085036453, + "learning_rate": 1.958460270348026e-06, + "loss": 0.2735, + "step": 24717 + }, + { + "epoch": 0.72, + "grad_norm": 1.262677022026564, + "learning_rate": 1.9580874762880266e-06, + "loss": 0.2606, + "step": 24718 + }, + { + "epoch": 0.72, + "grad_norm": 2.1803200944076147, + "learning_rate": 1.9577147090729386e-06, + "loss": 0.2865, + "step": 24719 + }, + { + "epoch": 0.72, + "grad_norm": 2.0200008522980517, + "learning_rate": 1.9573419687060524e-06, + "loss": 0.2898, + "step": 24720 + }, + { + "epoch": 0.72, + "grad_norm": 1.2715805952685266, + "learning_rate": 1.956969255190657e-06, + "loss": 0.2821, + "step": 24721 + }, + { + "epoch": 0.72, + "grad_norm": 0.9195158613425968, + "learning_rate": 1.956596568530042e-06, + "loss": 0.5365, + "step": 24722 + }, + { + "epoch": 0.72, + "grad_norm": 1.2425681860647049, + "learning_rate": 1.9562239087274953e-06, + "loss": 0.279, + "step": 24723 + }, + { + "epoch": 0.72, + "grad_norm": 1.3491474528189513, + "learning_rate": 1.955851275786309e-06, + "loss": 0.2778, + "step": 24724 + }, + { + "epoch": 0.72, + "grad_norm": 1.2538156561631115, + "learning_rate": 1.9554786697097668e-06, + "loss": 0.2868, + "step": 24725 + }, + { + "epoch": 0.72, + "grad_norm": 1.3773183381299476, + "learning_rate": 1.9551060905011593e-06, + "loss": 0.2785, + "step": 24726 + }, + { + "epoch": 0.72, + "grad_norm": 1.4822656587718297, + "learning_rate": 1.954733538163774e-06, + "loss": 0.2815, + "step": 24727 + }, + { + "epoch": 0.72, + "grad_norm": 1.0023173198118107, + "learning_rate": 1.954361012700899e-06, + "loss": 0.564, + "step": 24728 + }, + { + "epoch": 0.72, + "grad_norm": 1.2649302942442937, + "learning_rate": 1.9539885141158223e-06, + "loss": 0.283, + "step": 24729 + }, + { + "epoch": 0.72, + "grad_norm": 1.4232708631094553, + "learning_rate": 1.95361604241183e-06, + "loss": 0.2725, + "step": 24730 + }, + { + "epoch": 0.72, + "grad_norm": 1.4146140089261197, + "learning_rate": 1.9532435975922106e-06, + "loss": 0.2758, + "step": 24731 + }, + { + "epoch": 0.72, + "grad_norm": 1.6149588055848731, + "learning_rate": 1.9528711796602517e-06, + "loss": 0.287, + "step": 24732 + }, + { + "epoch": 0.72, + "grad_norm": 1.3133592948374055, + "learning_rate": 1.952498788619237e-06, + "loss": 0.2715, + "step": 24733 + }, + { + "epoch": 0.72, + "grad_norm": 1.793635004332468, + "learning_rate": 1.9521264244724543e-06, + "loss": 0.288, + "step": 24734 + }, + { + "epoch": 0.72, + "grad_norm": 1.3779998921824748, + "learning_rate": 1.9517540872231895e-06, + "loss": 0.2978, + "step": 24735 + }, + { + "epoch": 0.72, + "grad_norm": 1.4717719139320729, + "learning_rate": 1.9513817768747284e-06, + "loss": 0.2864, + "step": 24736 + }, + { + "epoch": 0.72, + "grad_norm": 1.339578839319714, + "learning_rate": 1.9510094934303568e-06, + "loss": 0.2741, + "step": 24737 + }, + { + "epoch": 0.72, + "grad_norm": 1.7661081526839444, + "learning_rate": 1.9506372368933614e-06, + "loss": 0.2809, + "step": 24738 + }, + { + "epoch": 0.72, + "grad_norm": 1.4246505417934197, + "learning_rate": 1.9502650072670254e-06, + "loss": 0.2932, + "step": 24739 + }, + { + "epoch": 0.72, + "grad_norm": 1.1910910811700608, + "learning_rate": 1.9498928045546362e-06, + "loss": 0.2548, + "step": 24740 + }, + { + "epoch": 0.72, + "grad_norm": 1.710248190782546, + "learning_rate": 1.949520628759475e-06, + "loss": 0.2896, + "step": 24741 + }, + { + "epoch": 0.72, + "grad_norm": 1.9078212112623119, + "learning_rate": 1.9491484798848287e-06, + "loss": 0.2965, + "step": 24742 + }, + { + "epoch": 0.72, + "grad_norm": 1.3013839493450705, + "learning_rate": 1.9487763579339826e-06, + "loss": 0.2712, + "step": 24743 + }, + { + "epoch": 0.72, + "grad_norm": 1.8948498686510071, + "learning_rate": 1.948404262910217e-06, + "loss": 0.306, + "step": 24744 + }, + { + "epoch": 0.72, + "grad_norm": 1.2635842131329145, + "learning_rate": 1.9480321948168175e-06, + "loss": 0.2638, + "step": 24745 + }, + { + "epoch": 0.72, + "grad_norm": 1.2717474102086173, + "learning_rate": 1.947660153657068e-06, + "loss": 0.2754, + "step": 24746 + }, + { + "epoch": 0.72, + "grad_norm": 1.2368452959887548, + "learning_rate": 1.9472881394342508e-06, + "loss": 0.2645, + "step": 24747 + }, + { + "epoch": 0.72, + "grad_norm": 1.3441493106263405, + "learning_rate": 1.94691615215165e-06, + "loss": 0.2923, + "step": 24748 + }, + { + "epoch": 0.72, + "grad_norm": 1.4450603461276181, + "learning_rate": 1.9465441918125484e-06, + "loss": 0.2668, + "step": 24749 + }, + { + "epoch": 0.72, + "grad_norm": 1.3050850312322095, + "learning_rate": 1.946172258420228e-06, + "loss": 0.273, + "step": 24750 + }, + { + "epoch": 0.72, + "grad_norm": 1.4044249452043773, + "learning_rate": 1.945800351977973e-06, + "loss": 0.2663, + "step": 24751 + }, + { + "epoch": 0.72, + "grad_norm": 1.3023254508500417, + "learning_rate": 1.945428472489062e-06, + "loss": 0.2797, + "step": 24752 + }, + { + "epoch": 0.72, + "grad_norm": 1.298658039202987, + "learning_rate": 1.945056619956779e-06, + "loss": 0.2876, + "step": 24753 + }, + { + "epoch": 0.72, + "grad_norm": 1.2897990830942259, + "learning_rate": 1.944684794384404e-06, + "loss": 0.294, + "step": 24754 + }, + { + "epoch": 0.72, + "grad_norm": 2.012318316168094, + "learning_rate": 1.944312995775221e-06, + "loss": 0.2796, + "step": 24755 + }, + { + "epoch": 0.72, + "grad_norm": 1.4588346135430545, + "learning_rate": 1.943941224132509e-06, + "loss": 0.2672, + "step": 24756 + }, + { + "epoch": 0.72, + "grad_norm": 1.3081587243236457, + "learning_rate": 1.94356947945955e-06, + "loss": 0.274, + "step": 24757 + }, + { + "epoch": 0.72, + "grad_norm": 1.4139124277457409, + "learning_rate": 1.9431977617596235e-06, + "loss": 0.2789, + "step": 24758 + }, + { + "epoch": 0.72, + "grad_norm": 1.284540077466258, + "learning_rate": 1.9428260710360113e-06, + "loss": 0.2718, + "step": 24759 + }, + { + "epoch": 0.72, + "grad_norm": 1.4116968956347675, + "learning_rate": 1.942454407291995e-06, + "loss": 0.2953, + "step": 24760 + }, + { + "epoch": 0.72, + "grad_norm": 1.3028672056361656, + "learning_rate": 1.9420827705308503e-06, + "loss": 0.2802, + "step": 24761 + }, + { + "epoch": 0.72, + "grad_norm": 1.2618874661865367, + "learning_rate": 1.9417111607558594e-06, + "loss": 0.2832, + "step": 24762 + }, + { + "epoch": 0.72, + "grad_norm": 1.3716172746514803, + "learning_rate": 1.941339577970301e-06, + "loss": 0.2886, + "step": 24763 + }, + { + "epoch": 0.72, + "grad_norm": 1.946776042455671, + "learning_rate": 1.9409680221774546e-06, + "loss": 0.2618, + "step": 24764 + }, + { + "epoch": 0.72, + "grad_norm": 1.2968689033965113, + "learning_rate": 1.9405964933805994e-06, + "loss": 0.2642, + "step": 24765 + }, + { + "epoch": 0.72, + "grad_norm": 1.6913396615449836, + "learning_rate": 1.9402249915830147e-06, + "loss": 0.2827, + "step": 24766 + }, + { + "epoch": 0.72, + "grad_norm": 1.3490603825136778, + "learning_rate": 1.939853516787978e-06, + "loss": 0.2909, + "step": 24767 + }, + { + "epoch": 0.72, + "grad_norm": 0.9817156653445883, + "learning_rate": 1.939482068998769e-06, + "loss": 0.6064, + "step": 24768 + }, + { + "epoch": 0.72, + "grad_norm": 1.3455405800447664, + "learning_rate": 1.939110648218664e-06, + "loss": 0.2765, + "step": 24769 + }, + { + "epoch": 0.72, + "grad_norm": 1.6258462898608714, + "learning_rate": 1.938739254450943e-06, + "loss": 0.2776, + "step": 24770 + }, + { + "epoch": 0.72, + "grad_norm": 1.27779884555093, + "learning_rate": 1.9383678876988797e-06, + "loss": 0.2703, + "step": 24771 + }, + { + "epoch": 0.72, + "grad_norm": 2.838745240470738, + "learning_rate": 1.9379965479657546e-06, + "loss": 0.2804, + "step": 24772 + }, + { + "epoch": 0.72, + "grad_norm": 1.3012449249354838, + "learning_rate": 1.9376252352548435e-06, + "loss": 0.2795, + "step": 24773 + }, + { + "epoch": 0.72, + "grad_norm": 1.5356103418109162, + "learning_rate": 1.9372539495694236e-06, + "loss": 0.2783, + "step": 24774 + }, + { + "epoch": 0.72, + "grad_norm": 1.3511863810970601, + "learning_rate": 1.9368826909127717e-06, + "loss": 0.2666, + "step": 24775 + }, + { + "epoch": 0.72, + "grad_norm": 1.2434703406752252, + "learning_rate": 1.936511459288164e-06, + "loss": 0.2803, + "step": 24776 + }, + { + "epoch": 0.72, + "grad_norm": 1.4707102501592992, + "learning_rate": 1.936140254698877e-06, + "loss": 0.288, + "step": 24777 + }, + { + "epoch": 0.72, + "grad_norm": 1.3768616158596014, + "learning_rate": 1.9357690771481856e-06, + "loss": 0.2893, + "step": 24778 + }, + { + "epoch": 0.72, + "grad_norm": 1.2514931389375357, + "learning_rate": 1.935397926639368e-06, + "loss": 0.2759, + "step": 24779 + }, + { + "epoch": 0.72, + "grad_norm": 1.6410671224618139, + "learning_rate": 1.9350268031756963e-06, + "loss": 0.2579, + "step": 24780 + }, + { + "epoch": 0.72, + "grad_norm": 1.2887182671012507, + "learning_rate": 1.934655706760447e-06, + "loss": 0.3008, + "step": 24781 + }, + { + "epoch": 0.72, + "grad_norm": 1.4371329553841246, + "learning_rate": 1.934284637396895e-06, + "loss": 0.2892, + "step": 24782 + }, + { + "epoch": 0.72, + "grad_norm": 1.296886330293263, + "learning_rate": 1.9339135950883147e-06, + "loss": 0.2831, + "step": 24783 + }, + { + "epoch": 0.72, + "grad_norm": 1.5853579010024699, + "learning_rate": 1.9335425798379813e-06, + "loss": 0.2781, + "step": 24784 + }, + { + "epoch": 0.72, + "grad_norm": 2.03631367095467, + "learning_rate": 1.933171591649169e-06, + "loss": 0.2665, + "step": 24785 + }, + { + "epoch": 0.72, + "grad_norm": 1.2587108793808888, + "learning_rate": 1.9328006305251506e-06, + "loss": 0.2831, + "step": 24786 + }, + { + "epoch": 0.72, + "grad_norm": 1.4104610415795904, + "learning_rate": 1.932429696469203e-06, + "loss": 0.3062, + "step": 24787 + }, + { + "epoch": 0.72, + "grad_norm": 1.385536296511894, + "learning_rate": 1.9320587894845956e-06, + "loss": 0.2932, + "step": 24788 + }, + { + "epoch": 0.72, + "grad_norm": 1.3983341009852794, + "learning_rate": 1.931687909574603e-06, + "loss": 0.3065, + "step": 24789 + }, + { + "epoch": 0.72, + "grad_norm": 1.4873695920779764, + "learning_rate": 1.9313170567424994e-06, + "loss": 0.2755, + "step": 24790 + }, + { + "epoch": 0.72, + "grad_norm": 1.4061938976241404, + "learning_rate": 1.930946230991556e-06, + "loss": 0.2783, + "step": 24791 + }, + { + "epoch": 0.72, + "grad_norm": 1.4986567864808737, + "learning_rate": 1.9305754323250464e-06, + "loss": 0.2709, + "step": 24792 + }, + { + "epoch": 0.72, + "grad_norm": 0.9567183467231207, + "learning_rate": 1.9302046607462434e-06, + "loss": 0.5703, + "step": 24793 + }, + { + "epoch": 0.72, + "grad_norm": 1.3177770308452181, + "learning_rate": 1.9298339162584174e-06, + "loss": 0.2721, + "step": 24794 + }, + { + "epoch": 0.72, + "grad_norm": 1.416493010236265, + "learning_rate": 1.929463198864841e-06, + "loss": 0.3116, + "step": 24795 + }, + { + "epoch": 0.72, + "grad_norm": 1.6760919404359194, + "learning_rate": 1.9290925085687884e-06, + "loss": 0.2743, + "step": 24796 + }, + { + "epoch": 0.72, + "grad_norm": 1.2464038555766404, + "learning_rate": 1.928721845373526e-06, + "loss": 0.27, + "step": 24797 + }, + { + "epoch": 0.72, + "grad_norm": 1.3544517363394382, + "learning_rate": 1.9283512092823296e-06, + "loss": 0.2789, + "step": 24798 + }, + { + "epoch": 0.72, + "grad_norm": 1.2788000906031638, + "learning_rate": 1.9279806002984664e-06, + "loss": 0.2848, + "step": 24799 + }, + { + "epoch": 0.72, + "grad_norm": 1.318502647045381, + "learning_rate": 1.9276100184252085e-06, + "loss": 0.2659, + "step": 24800 + }, + { + "epoch": 0.72, + "grad_norm": 1.2855344606599326, + "learning_rate": 1.927239463665826e-06, + "loss": 0.2909, + "step": 24801 + }, + { + "epoch": 0.72, + "grad_norm": 1.3247005950058552, + "learning_rate": 1.9268689360235887e-06, + "loss": 0.2722, + "step": 24802 + }, + { + "epoch": 0.72, + "grad_norm": 1.469862125410609, + "learning_rate": 1.926498435501768e-06, + "loss": 0.2802, + "step": 24803 + }, + { + "epoch": 0.72, + "grad_norm": 1.219220377276852, + "learning_rate": 1.926127962103632e-06, + "loss": 0.2684, + "step": 24804 + }, + { + "epoch": 0.72, + "grad_norm": 1.9499855518617832, + "learning_rate": 1.9257575158324517e-06, + "loss": 0.2781, + "step": 24805 + }, + { + "epoch": 0.72, + "grad_norm": 1.5600211271424214, + "learning_rate": 1.9253870966914965e-06, + "loss": 0.2732, + "step": 24806 + }, + { + "epoch": 0.72, + "grad_norm": 2.0840829104798737, + "learning_rate": 1.9250167046840327e-06, + "loss": 0.2851, + "step": 24807 + }, + { + "epoch": 0.72, + "grad_norm": 1.8894805519851772, + "learning_rate": 1.924646339813331e-06, + "loss": 0.2742, + "step": 24808 + }, + { + "epoch": 0.72, + "grad_norm": 1.28185463609132, + "learning_rate": 1.924276002082659e-06, + "loss": 0.284, + "step": 24809 + }, + { + "epoch": 0.72, + "grad_norm": 1.2630977158739936, + "learning_rate": 1.9239056914952855e-06, + "loss": 0.2715, + "step": 24810 + }, + { + "epoch": 0.72, + "grad_norm": 1.3894052420402994, + "learning_rate": 1.923535408054478e-06, + "loss": 0.2659, + "step": 24811 + }, + { + "epoch": 0.72, + "grad_norm": 1.281221642211108, + "learning_rate": 1.9231651517635053e-06, + "loss": 0.2731, + "step": 24812 + }, + { + "epoch": 0.72, + "grad_norm": 1.438145928916186, + "learning_rate": 1.922794922625634e-06, + "loss": 0.2786, + "step": 24813 + }, + { + "epoch": 0.72, + "grad_norm": 1.367301981546782, + "learning_rate": 1.9224247206441316e-06, + "loss": 0.2871, + "step": 24814 + }, + { + "epoch": 0.72, + "grad_norm": 1.4551071609022317, + "learning_rate": 1.9220545458222665e-06, + "loss": 0.274, + "step": 24815 + }, + { + "epoch": 0.72, + "grad_norm": 1.3271169132850094, + "learning_rate": 1.9216843981633034e-06, + "loss": 0.2812, + "step": 24816 + }, + { + "epoch": 0.72, + "grad_norm": 1.9360844211875399, + "learning_rate": 1.921314277670509e-06, + "loss": 0.301, + "step": 24817 + }, + { + "epoch": 0.72, + "grad_norm": 1.5328138953311548, + "learning_rate": 1.9209441843471504e-06, + "loss": 0.2701, + "step": 24818 + }, + { + "epoch": 0.72, + "grad_norm": 1.3248644009043302, + "learning_rate": 1.9205741181964937e-06, + "loss": 0.2821, + "step": 24819 + }, + { + "epoch": 0.72, + "grad_norm": 1.1905287373738986, + "learning_rate": 1.920204079221804e-06, + "loss": 0.2674, + "step": 24820 + }, + { + "epoch": 0.72, + "grad_norm": 2.0484534208337033, + "learning_rate": 1.9198340674263484e-06, + "loss": 0.2707, + "step": 24821 + }, + { + "epoch": 0.72, + "grad_norm": 0.9540132589148193, + "learning_rate": 1.9194640828133904e-06, + "loss": 0.5543, + "step": 24822 + }, + { + "epoch": 0.72, + "grad_norm": 1.3771705105970637, + "learning_rate": 1.9190941253861985e-06, + "loss": 0.2963, + "step": 24823 + }, + { + "epoch": 0.72, + "grad_norm": 1.3528885586200425, + "learning_rate": 1.9187241951480327e-06, + "loss": 0.2793, + "step": 24824 + }, + { + "epoch": 0.72, + "grad_norm": 1.2800404040881772, + "learning_rate": 1.9183542921021627e-06, + "loss": 0.2809, + "step": 24825 + }, + { + "epoch": 0.72, + "grad_norm": 1.2988039022753894, + "learning_rate": 1.9179844162518474e-06, + "loss": 0.2815, + "step": 24826 + }, + { + "epoch": 0.72, + "grad_norm": 1.3350280313786944, + "learning_rate": 1.917614567600354e-06, + "loss": 0.2845, + "step": 24827 + }, + { + "epoch": 0.72, + "grad_norm": 3.226259786629503, + "learning_rate": 1.9172447461509476e-06, + "loss": 0.2703, + "step": 24828 + }, + { + "epoch": 0.72, + "grad_norm": 1.4014461031602967, + "learning_rate": 1.916874951906889e-06, + "loss": 0.3159, + "step": 24829 + }, + { + "epoch": 0.72, + "grad_norm": 1.6488014884117774, + "learning_rate": 1.916505184871444e-06, + "loss": 0.2872, + "step": 24830 + }, + { + "epoch": 0.72, + "grad_norm": 1.2860446434902328, + "learning_rate": 1.916135445047874e-06, + "loss": 0.2868, + "step": 24831 + }, + { + "epoch": 0.72, + "grad_norm": 1.5601421379031712, + "learning_rate": 1.9157657324394442e-06, + "loss": 0.2864, + "step": 24832 + }, + { + "epoch": 0.72, + "grad_norm": 1.2683445312788566, + "learning_rate": 1.9153960470494156e-06, + "loss": 0.295, + "step": 24833 + }, + { + "epoch": 0.72, + "grad_norm": 1.405162786321312, + "learning_rate": 1.9150263888810522e-06, + "loss": 0.2771, + "step": 24834 + }, + { + "epoch": 0.72, + "grad_norm": 1.3951143320988522, + "learning_rate": 1.914656757937614e-06, + "loss": 0.2911, + "step": 24835 + }, + { + "epoch": 0.72, + "grad_norm": 1.4378380491568858, + "learning_rate": 1.914287154222364e-06, + "loss": 0.2762, + "step": 24836 + }, + { + "epoch": 0.72, + "grad_norm": 1.2998086691590887, + "learning_rate": 1.9139175777385637e-06, + "loss": 0.2845, + "step": 24837 + }, + { + "epoch": 0.72, + "grad_norm": 1.6570259992699083, + "learning_rate": 1.9135480284894755e-06, + "loss": 0.2896, + "step": 24838 + }, + { + "epoch": 0.72, + "grad_norm": 1.7921849364862776, + "learning_rate": 1.9131785064783596e-06, + "loss": 0.2755, + "step": 24839 + }, + { + "epoch": 0.72, + "grad_norm": 1.4031862450986603, + "learning_rate": 1.9128090117084786e-06, + "loss": 0.263, + "step": 24840 + }, + { + "epoch": 0.72, + "grad_norm": 1.3307008379910357, + "learning_rate": 1.9124395441830917e-06, + "loss": 0.2956, + "step": 24841 + }, + { + "epoch": 0.72, + "grad_norm": 1.3177731779681474, + "learning_rate": 1.9120701039054616e-06, + "loss": 0.2802, + "step": 24842 + }, + { + "epoch": 0.72, + "grad_norm": 1.276455144078788, + "learning_rate": 1.911700690878846e-06, + "loss": 0.2615, + "step": 24843 + }, + { + "epoch": 0.72, + "grad_norm": 1.2255136224844083, + "learning_rate": 1.9113313051065057e-06, + "loss": 0.2672, + "step": 24844 + }, + { + "epoch": 0.72, + "grad_norm": 1.4187929040105607, + "learning_rate": 1.9109619465917006e-06, + "loss": 0.2958, + "step": 24845 + }, + { + "epoch": 0.72, + "grad_norm": 1.3071773036960126, + "learning_rate": 1.9105926153376913e-06, + "loss": 0.271, + "step": 24846 + }, + { + "epoch": 0.72, + "grad_norm": 1.3844203566311506, + "learning_rate": 1.9102233113477357e-06, + "loss": 0.2946, + "step": 24847 + }, + { + "epoch": 0.72, + "grad_norm": 1.3468266960793545, + "learning_rate": 1.9098540346250943e-06, + "loss": 0.2835, + "step": 24848 + }, + { + "epoch": 0.72, + "grad_norm": 1.435582779776105, + "learning_rate": 1.9094847851730245e-06, + "loss": 0.2975, + "step": 24849 + }, + { + "epoch": 0.72, + "grad_norm": 1.875978916085102, + "learning_rate": 1.909115562994787e-06, + "loss": 0.2804, + "step": 24850 + }, + { + "epoch": 0.72, + "grad_norm": 1.346663479002196, + "learning_rate": 1.9087463680936396e-06, + "loss": 0.2968, + "step": 24851 + }, + { + "epoch": 0.72, + "grad_norm": 1.4200031042462355, + "learning_rate": 1.90837720047284e-06, + "loss": 0.2644, + "step": 24852 + }, + { + "epoch": 0.72, + "grad_norm": 1.325689782461566, + "learning_rate": 1.908008060135645e-06, + "loss": 0.2721, + "step": 24853 + }, + { + "epoch": 0.72, + "grad_norm": 1.2877233634248628, + "learning_rate": 1.907638947085312e-06, + "loss": 0.2921, + "step": 24854 + }, + { + "epoch": 0.72, + "grad_norm": 1.3234115950025556, + "learning_rate": 1.9072698613251001e-06, + "loss": 0.2846, + "step": 24855 + }, + { + "epoch": 0.72, + "grad_norm": 1.26057493083711, + "learning_rate": 1.906900802858266e-06, + "loss": 0.29, + "step": 24856 + }, + { + "epoch": 0.72, + "grad_norm": 1.4092657226418097, + "learning_rate": 1.9065317716880665e-06, + "loss": 0.2873, + "step": 24857 + }, + { + "epoch": 0.72, + "grad_norm": 1.3255341412148345, + "learning_rate": 1.9061627678177586e-06, + "loss": 0.2658, + "step": 24858 + }, + { + "epoch": 0.72, + "grad_norm": 1.6781601144658853, + "learning_rate": 1.9057937912505986e-06, + "loss": 0.3014, + "step": 24859 + }, + { + "epoch": 0.72, + "grad_norm": 1.4319150992004708, + "learning_rate": 1.9054248419898425e-06, + "loss": 0.2683, + "step": 24860 + }, + { + "epoch": 0.72, + "grad_norm": 1.2842752960720616, + "learning_rate": 1.9050559200387485e-06, + "loss": 0.283, + "step": 24861 + }, + { + "epoch": 0.72, + "grad_norm": 1.2949769609370845, + "learning_rate": 1.9046870254005684e-06, + "loss": 0.2753, + "step": 24862 + }, + { + "epoch": 0.72, + "grad_norm": 2.69047178159792, + "learning_rate": 1.9043181580785597e-06, + "loss": 0.2755, + "step": 24863 + }, + { + "epoch": 0.72, + "grad_norm": 1.270178861701313, + "learning_rate": 1.9039493180759777e-06, + "loss": 0.2747, + "step": 24864 + }, + { + "epoch": 0.72, + "grad_norm": 0.9350346669911, + "learning_rate": 1.903580505396077e-06, + "loss": 0.5871, + "step": 24865 + }, + { + "epoch": 0.72, + "grad_norm": 1.5003039180693594, + "learning_rate": 1.903211720042113e-06, + "loss": 0.2704, + "step": 24866 + }, + { + "epoch": 0.72, + "grad_norm": 1.9375671070396119, + "learning_rate": 1.90284296201734e-06, + "loss": 0.28, + "step": 24867 + }, + { + "epoch": 0.72, + "grad_norm": 1.2170681891607962, + "learning_rate": 1.9024742313250123e-06, + "loss": 0.2662, + "step": 24868 + }, + { + "epoch": 0.72, + "grad_norm": 1.318815307341359, + "learning_rate": 1.9021055279683838e-06, + "loss": 0.3031, + "step": 24869 + }, + { + "epoch": 0.72, + "grad_norm": 1.4091338251193575, + "learning_rate": 1.9017368519507097e-06, + "loss": 0.3023, + "step": 24870 + }, + { + "epoch": 0.72, + "grad_norm": 1.333288925901285, + "learning_rate": 1.9013682032752407e-06, + "loss": 0.3025, + "step": 24871 + }, + { + "epoch": 0.72, + "grad_norm": 1.6483130125696368, + "learning_rate": 1.9009995819452315e-06, + "loss": 0.2736, + "step": 24872 + }, + { + "epoch": 0.72, + "grad_norm": 1.371825615554427, + "learning_rate": 1.9006309879639357e-06, + "loss": 0.2666, + "step": 24873 + }, + { + "epoch": 0.72, + "grad_norm": 1.3974436932963912, + "learning_rate": 1.900262421334606e-06, + "loss": 0.2927, + "step": 24874 + }, + { + "epoch": 0.72, + "grad_norm": 1.2667591996982224, + "learning_rate": 1.8998938820604945e-06, + "loss": 0.2864, + "step": 24875 + }, + { + "epoch": 0.72, + "grad_norm": 1.3795247061986102, + "learning_rate": 1.899525370144854e-06, + "loss": 0.2516, + "step": 24876 + }, + { + "epoch": 0.72, + "grad_norm": 1.2565011061121834, + "learning_rate": 1.8991568855909365e-06, + "loss": 0.2871, + "step": 24877 + }, + { + "epoch": 0.72, + "grad_norm": 1.4872572824939105, + "learning_rate": 1.8987884284019954e-06, + "loss": 0.2666, + "step": 24878 + }, + { + "epoch": 0.72, + "grad_norm": 1.4596801928524237, + "learning_rate": 1.898419998581279e-06, + "loss": 0.2699, + "step": 24879 + }, + { + "epoch": 0.72, + "grad_norm": 1.2623000094699008, + "learning_rate": 1.8980515961320427e-06, + "loss": 0.2758, + "step": 24880 + }, + { + "epoch": 0.72, + "grad_norm": 1.382134549485469, + "learning_rate": 1.8976832210575335e-06, + "loss": 0.2717, + "step": 24881 + }, + { + "epoch": 0.72, + "grad_norm": 1.4579311815788722, + "learning_rate": 1.8973148733610043e-06, + "loss": 0.3003, + "step": 24882 + }, + { + "epoch": 0.72, + "grad_norm": 1.4579690698618415, + "learning_rate": 1.8969465530457059e-06, + "loss": 0.2664, + "step": 24883 + }, + { + "epoch": 0.72, + "grad_norm": 2.5034404815399123, + "learning_rate": 1.8965782601148885e-06, + "loss": 0.277, + "step": 24884 + }, + { + "epoch": 0.72, + "grad_norm": 1.2411656702585532, + "learning_rate": 1.896209994571802e-06, + "loss": 0.2794, + "step": 24885 + }, + { + "epoch": 0.72, + "grad_norm": 1.349777692290011, + "learning_rate": 1.8958417564196974e-06, + "loss": 0.2608, + "step": 24886 + }, + { + "epoch": 0.72, + "grad_norm": 1.2867269778715729, + "learning_rate": 1.8954735456618234e-06, + "loss": 0.3272, + "step": 24887 + }, + { + "epoch": 0.72, + "grad_norm": 1.2604885407482012, + "learning_rate": 1.8951053623014315e-06, + "loss": 0.2932, + "step": 24888 + }, + { + "epoch": 0.72, + "grad_norm": 1.2604764799608033, + "learning_rate": 1.8947372063417673e-06, + "loss": 0.2786, + "step": 24889 + }, + { + "epoch": 0.72, + "grad_norm": 1.4127294082084971, + "learning_rate": 1.8943690777860823e-06, + "loss": 0.2916, + "step": 24890 + }, + { + "epoch": 0.72, + "grad_norm": 1.2868688511462698, + "learning_rate": 1.8940009766376243e-06, + "loss": 0.2974, + "step": 24891 + }, + { + "epoch": 0.72, + "grad_norm": 1.8231598158023872, + "learning_rate": 1.893632902899642e-06, + "loss": 0.2885, + "step": 24892 + }, + { + "epoch": 0.72, + "grad_norm": 0.9947606631878442, + "learning_rate": 1.8932648565753841e-06, + "loss": 0.6348, + "step": 24893 + }, + { + "epoch": 0.72, + "grad_norm": 1.253352627695686, + "learning_rate": 1.8928968376680978e-06, + "loss": 0.2889, + "step": 24894 + }, + { + "epoch": 0.72, + "grad_norm": 1.3338993082140622, + "learning_rate": 1.892528846181032e-06, + "loss": 0.2964, + "step": 24895 + }, + { + "epoch": 0.72, + "grad_norm": 1.489712773896378, + "learning_rate": 1.8921608821174331e-06, + "loss": 0.3391, + "step": 24896 + }, + { + "epoch": 0.72, + "grad_norm": 1.215747445470959, + "learning_rate": 1.8917929454805512e-06, + "loss": 0.2788, + "step": 24897 + }, + { + "epoch": 0.72, + "grad_norm": 1.3202412402805836, + "learning_rate": 1.8914250362736287e-06, + "loss": 0.2908, + "step": 24898 + }, + { + "epoch": 0.72, + "grad_norm": 1.4190799096395665, + "learning_rate": 1.891057154499915e-06, + "loss": 0.2906, + "step": 24899 + }, + { + "epoch": 0.72, + "grad_norm": 1.2875030889670676, + "learning_rate": 1.8906893001626565e-06, + "loss": 0.2797, + "step": 24900 + }, + { + "epoch": 0.72, + "grad_norm": 1.31999722003175, + "learning_rate": 1.8903214732650993e-06, + "loss": 0.2739, + "step": 24901 + }, + { + "epoch": 0.72, + "grad_norm": 1.9722308462993634, + "learning_rate": 1.8899536738104895e-06, + "loss": 0.3185, + "step": 24902 + }, + { + "epoch": 0.72, + "grad_norm": 1.398740376279256, + "learning_rate": 1.889585901802073e-06, + "loss": 0.2932, + "step": 24903 + }, + { + "epoch": 0.72, + "grad_norm": 1.2048561981621477, + "learning_rate": 1.8892181572430957e-06, + "loss": 0.2848, + "step": 24904 + }, + { + "epoch": 0.72, + "grad_norm": 1.4552602968342223, + "learning_rate": 1.8888504401368024e-06, + "loss": 0.3049, + "step": 24905 + }, + { + "epoch": 0.72, + "grad_norm": 1.2154706632532348, + "learning_rate": 1.8884827504864405e-06, + "loss": 0.2904, + "step": 24906 + }, + { + "epoch": 0.72, + "grad_norm": 0.9569375589225357, + "learning_rate": 1.888115088295251e-06, + "loss": 0.5472, + "step": 24907 + }, + { + "epoch": 0.72, + "grad_norm": 1.2334569651373488, + "learning_rate": 1.8877474535664802e-06, + "loss": 0.2757, + "step": 24908 + }, + { + "epoch": 0.72, + "grad_norm": 1.9691114407317196, + "learning_rate": 1.8873798463033742e-06, + "loss": 0.2851, + "step": 24909 + }, + { + "epoch": 0.72, + "grad_norm": 1.2322226458340553, + "learning_rate": 1.8870122665091745e-06, + "loss": 0.2668, + "step": 24910 + }, + { + "epoch": 0.72, + "grad_norm": 1.2465731856154438, + "learning_rate": 1.8866447141871257e-06, + "loss": 0.2817, + "step": 24911 + }, + { + "epoch": 0.72, + "grad_norm": 1.8396150283708492, + "learning_rate": 1.8862771893404719e-06, + "loss": 0.2821, + "step": 24912 + }, + { + "epoch": 0.72, + "grad_norm": 1.2741300150592634, + "learning_rate": 1.8859096919724562e-06, + "loss": 0.2609, + "step": 24913 + }, + { + "epoch": 0.72, + "grad_norm": 1.3422028056787445, + "learning_rate": 1.8855422220863222e-06, + "loss": 0.2921, + "step": 24914 + }, + { + "epoch": 0.72, + "grad_norm": 1.4909804650090193, + "learning_rate": 1.8851747796853126e-06, + "loss": 0.302, + "step": 24915 + }, + { + "epoch": 0.72, + "grad_norm": 1.4537724445123736, + "learning_rate": 1.8848073647726723e-06, + "loss": 0.3032, + "step": 24916 + }, + { + "epoch": 0.72, + "grad_norm": 1.2875971736519838, + "learning_rate": 1.8844399773516391e-06, + "loss": 0.281, + "step": 24917 + }, + { + "epoch": 0.72, + "grad_norm": 1.3096445363498568, + "learning_rate": 1.884072617425458e-06, + "loss": 0.2884, + "step": 24918 + }, + { + "epoch": 0.72, + "grad_norm": 1.3032268349159077, + "learning_rate": 1.8837052849973709e-06, + "loss": 0.2777, + "step": 24919 + }, + { + "epoch": 0.72, + "grad_norm": 1.5244619899809195, + "learning_rate": 1.8833379800706186e-06, + "loss": 0.2805, + "step": 24920 + }, + { + "epoch": 0.72, + "grad_norm": 1.3577821237466114, + "learning_rate": 1.8829707026484434e-06, + "loss": 0.2636, + "step": 24921 + }, + { + "epoch": 0.72, + "grad_norm": 1.5274224768104225, + "learning_rate": 1.882603452734087e-06, + "loss": 0.2949, + "step": 24922 + }, + { + "epoch": 0.72, + "grad_norm": 1.2873947433749442, + "learning_rate": 1.882236230330789e-06, + "loss": 0.2763, + "step": 24923 + }, + { + "epoch": 0.72, + "grad_norm": 1.295562166055535, + "learning_rate": 1.8818690354417928e-06, + "loss": 0.2833, + "step": 24924 + }, + { + "epoch": 0.72, + "grad_norm": 1.313595030630339, + "learning_rate": 1.881501868070335e-06, + "loss": 0.2885, + "step": 24925 + }, + { + "epoch": 0.72, + "grad_norm": 1.430752345183849, + "learning_rate": 1.881134728219658e-06, + "loss": 0.2746, + "step": 24926 + }, + { + "epoch": 0.72, + "grad_norm": 1.2484781906748135, + "learning_rate": 1.8807676158930016e-06, + "loss": 0.2747, + "step": 24927 + }, + { + "epoch": 0.72, + "grad_norm": 1.499927359978309, + "learning_rate": 1.8804005310936058e-06, + "loss": 0.2693, + "step": 24928 + }, + { + "epoch": 0.72, + "grad_norm": 1.287620040771691, + "learning_rate": 1.8800334738247094e-06, + "loss": 0.2978, + "step": 24929 + }, + { + "epoch": 0.72, + "grad_norm": 1.9413005769976102, + "learning_rate": 1.879666444089553e-06, + "loss": 0.293, + "step": 24930 + }, + { + "epoch": 0.72, + "grad_norm": 1.6321524807641448, + "learning_rate": 1.879299441891374e-06, + "loss": 0.2592, + "step": 24931 + }, + { + "epoch": 0.72, + "grad_norm": 1.473219773061389, + "learning_rate": 1.8789324672334125e-06, + "loss": 0.2959, + "step": 24932 + }, + { + "epoch": 0.72, + "grad_norm": 1.8673805822019187, + "learning_rate": 1.8785655201189086e-06, + "loss": 0.2924, + "step": 24933 + }, + { + "epoch": 0.72, + "grad_norm": 1.361162521700264, + "learning_rate": 1.8781986005510967e-06, + "loss": 0.2868, + "step": 24934 + }, + { + "epoch": 0.72, + "grad_norm": 1.357496832427408, + "learning_rate": 1.8778317085332164e-06, + "loss": 0.2714, + "step": 24935 + }, + { + "epoch": 0.72, + "grad_norm": 1.3582710646332283, + "learning_rate": 1.877464844068506e-06, + "loss": 0.2826, + "step": 24936 + }, + { + "epoch": 0.72, + "grad_norm": 1.288484853266976, + "learning_rate": 1.8770980071602051e-06, + "loss": 0.2888, + "step": 24937 + }, + { + "epoch": 0.72, + "grad_norm": 1.4077788570740182, + "learning_rate": 1.876731197811547e-06, + "loss": 0.3005, + "step": 24938 + }, + { + "epoch": 0.72, + "grad_norm": 1.2967194830669457, + "learning_rate": 1.8763644160257704e-06, + "loss": 0.2705, + "step": 24939 + }, + { + "epoch": 0.72, + "grad_norm": 1.5162373217677798, + "learning_rate": 1.8759976618061126e-06, + "loss": 0.298, + "step": 24940 + }, + { + "epoch": 0.72, + "grad_norm": 0.965318076049014, + "learning_rate": 1.8756309351558104e-06, + "loss": 0.5809, + "step": 24941 + }, + { + "epoch": 0.72, + "grad_norm": 1.334874704947084, + "learning_rate": 1.8752642360780998e-06, + "loss": 0.2668, + "step": 24942 + }, + { + "epoch": 0.72, + "grad_norm": 1.2144515028184375, + "learning_rate": 1.8748975645762186e-06, + "loss": 0.2758, + "step": 24943 + }, + { + "epoch": 0.72, + "grad_norm": 0.9627508210207634, + "learning_rate": 1.8745309206533997e-06, + "loss": 0.5795, + "step": 24944 + }, + { + "epoch": 0.72, + "grad_norm": 1.288737200151492, + "learning_rate": 1.8741643043128792e-06, + "loss": 0.2779, + "step": 24945 + }, + { + "epoch": 0.72, + "grad_norm": 1.2663990683748518, + "learning_rate": 1.873797715557894e-06, + "loss": 0.2807, + "step": 24946 + }, + { + "epoch": 0.72, + "grad_norm": 1.4209699469771007, + "learning_rate": 1.873431154391679e-06, + "loss": 0.2635, + "step": 24947 + }, + { + "epoch": 0.72, + "grad_norm": 1.4400779227820228, + "learning_rate": 1.873064620817468e-06, + "loss": 0.2616, + "step": 24948 + }, + { + "epoch": 0.72, + "grad_norm": 1.6147143917952276, + "learning_rate": 1.872698114838497e-06, + "loss": 0.3058, + "step": 24949 + }, + { + "epoch": 0.72, + "grad_norm": 1.5171779466120543, + "learning_rate": 1.8723316364579997e-06, + "loss": 0.2745, + "step": 24950 + }, + { + "epoch": 0.72, + "grad_norm": 1.3177939668977632, + "learning_rate": 1.8719651856792104e-06, + "loss": 0.2712, + "step": 24951 + }, + { + "epoch": 0.72, + "grad_norm": 1.3675677262491506, + "learning_rate": 1.8715987625053644e-06, + "loss": 0.2847, + "step": 24952 + }, + { + "epoch": 0.72, + "grad_norm": 1.2987946724786008, + "learning_rate": 1.8712323669396925e-06, + "loss": 0.249, + "step": 24953 + }, + { + "epoch": 0.72, + "grad_norm": 1.255897643953996, + "learning_rate": 1.87086599898543e-06, + "loss": 0.2612, + "step": 24954 + }, + { + "epoch": 0.72, + "grad_norm": 1.277564182381071, + "learning_rate": 1.870499658645809e-06, + "loss": 0.2704, + "step": 24955 + }, + { + "epoch": 0.72, + "grad_norm": 1.3671998697610495, + "learning_rate": 1.8701333459240639e-06, + "loss": 0.2887, + "step": 24956 + }, + { + "epoch": 0.72, + "grad_norm": 1.2165045217992414, + "learning_rate": 1.8697670608234264e-06, + "loss": 0.2861, + "step": 24957 + }, + { + "epoch": 0.72, + "grad_norm": 1.2120939606779297, + "learning_rate": 1.8694008033471295e-06, + "loss": 0.2744, + "step": 24958 + }, + { + "epoch": 0.72, + "grad_norm": 1.3010963463056298, + "learning_rate": 1.8690345734984056e-06, + "loss": 0.2763, + "step": 24959 + }, + { + "epoch": 0.72, + "grad_norm": 1.369236343650927, + "learning_rate": 1.8686683712804877e-06, + "loss": 0.274, + "step": 24960 + }, + { + "epoch": 0.72, + "grad_norm": 1.295107679944435, + "learning_rate": 1.8683021966966042e-06, + "loss": 0.2709, + "step": 24961 + }, + { + "epoch": 0.72, + "grad_norm": 0.9458763141635926, + "learning_rate": 1.8679360497499888e-06, + "loss": 0.5844, + "step": 24962 + }, + { + "epoch": 0.72, + "grad_norm": 1.6455145812309127, + "learning_rate": 1.867569930443872e-06, + "loss": 0.2724, + "step": 24963 + }, + { + "epoch": 0.72, + "grad_norm": 1.2560081220785526, + "learning_rate": 1.8672038387814862e-06, + "loss": 0.2965, + "step": 24964 + }, + { + "epoch": 0.72, + "grad_norm": 1.343243356540131, + "learning_rate": 1.8668377747660626e-06, + "loss": 0.2906, + "step": 24965 + }, + { + "epoch": 0.72, + "grad_norm": 3.995124428111298, + "learning_rate": 1.8664717384008285e-06, + "loss": 0.2925, + "step": 24966 + }, + { + "epoch": 0.72, + "grad_norm": 1.2390398609239643, + "learning_rate": 1.8661057296890161e-06, + "loss": 0.2709, + "step": 24967 + }, + { + "epoch": 0.72, + "grad_norm": 1.3724979481932162, + "learning_rate": 1.8657397486338557e-06, + "loss": 0.2691, + "step": 24968 + }, + { + "epoch": 0.72, + "grad_norm": 1.4005720500032546, + "learning_rate": 1.8653737952385764e-06, + "loss": 0.2614, + "step": 24969 + }, + { + "epoch": 0.72, + "grad_norm": 1.241937955306468, + "learning_rate": 1.8650078695064084e-06, + "loss": 0.2764, + "step": 24970 + }, + { + "epoch": 0.72, + "grad_norm": 1.2766685387845755, + "learning_rate": 1.8646419714405823e-06, + "loss": 0.2938, + "step": 24971 + }, + { + "epoch": 0.72, + "grad_norm": 1.432968317701335, + "learning_rate": 1.8642761010443239e-06, + "loss": 0.2764, + "step": 24972 + }, + { + "epoch": 0.72, + "grad_norm": 3.1567962839660835, + "learning_rate": 1.8639102583208641e-06, + "loss": 0.3089, + "step": 24973 + }, + { + "epoch": 0.72, + "grad_norm": 0.970055182540937, + "learning_rate": 1.8635444432734307e-06, + "loss": 0.5904, + "step": 24974 + }, + { + "epoch": 0.72, + "grad_norm": 1.3067287459577617, + "learning_rate": 1.8631786559052523e-06, + "loss": 0.2574, + "step": 24975 + }, + { + "epoch": 0.72, + "grad_norm": 1.6067419771228497, + "learning_rate": 1.8628128962195568e-06, + "loss": 0.2977, + "step": 24976 + }, + { + "epoch": 0.72, + "grad_norm": 1.3088832125046421, + "learning_rate": 1.862447164219573e-06, + "loss": 0.2868, + "step": 24977 + }, + { + "epoch": 0.72, + "grad_norm": 1.2779410147089811, + "learning_rate": 1.8620814599085274e-06, + "loss": 0.2751, + "step": 24978 + }, + { + "epoch": 0.72, + "grad_norm": 1.344994459544317, + "learning_rate": 1.8617157832896493e-06, + "loss": 0.2845, + "step": 24979 + }, + { + "epoch": 0.72, + "grad_norm": 1.3252653174282225, + "learning_rate": 1.8613501343661626e-06, + "loss": 0.2721, + "step": 24980 + }, + { + "epoch": 0.72, + "grad_norm": 1.6709632388705626, + "learning_rate": 1.8609845131412958e-06, + "loss": 0.3138, + "step": 24981 + }, + { + "epoch": 0.72, + "grad_norm": 1.2358416874293114, + "learning_rate": 1.8606189196182755e-06, + "loss": 0.272, + "step": 24982 + }, + { + "epoch": 0.72, + "grad_norm": 1.5549369645805298, + "learning_rate": 1.860253353800328e-06, + "loss": 0.2783, + "step": 24983 + }, + { + "epoch": 0.72, + "grad_norm": 1.5368585006496036, + "learning_rate": 1.8598878156906797e-06, + "loss": 0.3048, + "step": 24984 + }, + { + "epoch": 0.72, + "grad_norm": 1.3713931092535896, + "learning_rate": 1.8595223052925566e-06, + "loss": 0.2895, + "step": 24985 + }, + { + "epoch": 0.72, + "grad_norm": 1.2776689176144571, + "learning_rate": 1.8591568226091833e-06, + "loss": 0.2616, + "step": 24986 + }, + { + "epoch": 0.72, + "grad_norm": 2.7124029876817266, + "learning_rate": 1.8587913676437864e-06, + "loss": 0.293, + "step": 24987 + }, + { + "epoch": 0.72, + "grad_norm": 1.4082449667254777, + "learning_rate": 1.858425940399592e-06, + "loss": 0.2628, + "step": 24988 + }, + { + "epoch": 0.72, + "grad_norm": 1.561617837729542, + "learning_rate": 1.8580605408798214e-06, + "loss": 0.3046, + "step": 24989 + }, + { + "epoch": 0.72, + "grad_norm": 1.322594717489361, + "learning_rate": 1.8576951690877015e-06, + "loss": 0.265, + "step": 24990 + }, + { + "epoch": 0.72, + "grad_norm": 1.286364029523856, + "learning_rate": 1.8573298250264571e-06, + "loss": 0.2792, + "step": 24991 + }, + { + "epoch": 0.72, + "grad_norm": 1.512436366916522, + "learning_rate": 1.8569645086993116e-06, + "loss": 0.2742, + "step": 24992 + }, + { + "epoch": 0.72, + "grad_norm": 1.2411581445230349, + "learning_rate": 1.8565992201094911e-06, + "loss": 0.2819, + "step": 24993 + }, + { + "epoch": 0.72, + "grad_norm": 2.0448630364762055, + "learning_rate": 1.8562339592602153e-06, + "loss": 0.2886, + "step": 24994 + }, + { + "epoch": 0.72, + "grad_norm": 1.2836978187366541, + "learning_rate": 1.8558687261547099e-06, + "loss": 0.2834, + "step": 24995 + }, + { + "epoch": 0.73, + "grad_norm": 1.2411713007458702, + "learning_rate": 1.855503520796198e-06, + "loss": 0.2894, + "step": 24996 + }, + { + "epoch": 0.73, + "grad_norm": 1.2574638146126818, + "learning_rate": 1.8551383431879023e-06, + "loss": 0.2827, + "step": 24997 + }, + { + "epoch": 0.73, + "grad_norm": 3.0202765405713814, + "learning_rate": 1.854773193333047e-06, + "loss": 0.2738, + "step": 24998 + }, + { + "epoch": 0.73, + "grad_norm": 1.6724979361226708, + "learning_rate": 1.8544080712348517e-06, + "loss": 0.2894, + "step": 24999 + }, + { + "epoch": 0.73, + "grad_norm": 1.316183262984611, + "learning_rate": 1.8540429768965401e-06, + "loss": 0.2582, + "step": 25000 + }, + { + "epoch": 0.73, + "grad_norm": 1.387448474146151, + "learning_rate": 1.8536779103213336e-06, + "loss": 0.2935, + "step": 25001 + }, + { + "epoch": 0.73, + "grad_norm": 1.212148683220443, + "learning_rate": 1.8533128715124555e-06, + "loss": 0.2742, + "step": 25002 + }, + { + "epoch": 0.73, + "grad_norm": 1.2149024193717906, + "learning_rate": 1.8529478604731256e-06, + "loss": 0.2772, + "step": 25003 + }, + { + "epoch": 0.73, + "grad_norm": 2.0155490402458334, + "learning_rate": 1.8525828772065658e-06, + "loss": 0.2832, + "step": 25004 + }, + { + "epoch": 0.73, + "grad_norm": 1.4796312631296678, + "learning_rate": 1.8522179217159969e-06, + "loss": 0.288, + "step": 25005 + }, + { + "epoch": 0.73, + "grad_norm": 1.379159721086646, + "learning_rate": 1.8518529940046398e-06, + "loss": 0.2662, + "step": 25006 + }, + { + "epoch": 0.73, + "grad_norm": 1.2925273995098783, + "learning_rate": 1.851488094075717e-06, + "loss": 0.2949, + "step": 25007 + }, + { + "epoch": 0.73, + "grad_norm": 1.707663432389763, + "learning_rate": 1.8511232219324444e-06, + "loss": 0.3289, + "step": 25008 + }, + { + "epoch": 0.73, + "grad_norm": 1.4788946588676661, + "learning_rate": 1.8507583775780447e-06, + "loss": 0.2616, + "step": 25009 + }, + { + "epoch": 0.73, + "grad_norm": 3.8355264243704403, + "learning_rate": 1.850393561015737e-06, + "loss": 0.292, + "step": 25010 + }, + { + "epoch": 0.73, + "grad_norm": 1.2638821095539672, + "learning_rate": 1.8500287722487414e-06, + "loss": 0.2904, + "step": 25011 + }, + { + "epoch": 0.73, + "grad_norm": 1.2460668532274533, + "learning_rate": 1.8496640112802767e-06, + "loss": 0.2715, + "step": 25012 + }, + { + "epoch": 0.73, + "grad_norm": 1.2874343616512922, + "learning_rate": 1.8492992781135617e-06, + "loss": 0.2849, + "step": 25013 + }, + { + "epoch": 0.73, + "grad_norm": 1.4475279717505298, + "learning_rate": 1.848934572751816e-06, + "loss": 0.2654, + "step": 25014 + }, + { + "epoch": 0.73, + "grad_norm": 1.427814574124187, + "learning_rate": 1.848569895198259e-06, + "loss": 0.2744, + "step": 25015 + }, + { + "epoch": 0.73, + "grad_norm": 1.3009730456106783, + "learning_rate": 1.848205245456106e-06, + "loss": 0.2917, + "step": 25016 + }, + { + "epoch": 0.73, + "grad_norm": 1.2725297794039345, + "learning_rate": 1.8478406235285773e-06, + "loss": 0.3055, + "step": 25017 + }, + { + "epoch": 0.73, + "grad_norm": 1.3810920754077873, + "learning_rate": 1.8474760294188892e-06, + "loss": 0.2912, + "step": 25018 + }, + { + "epoch": 0.73, + "grad_norm": 1.3501761473245477, + "learning_rate": 1.8471114631302606e-06, + "loss": 0.2734, + "step": 25019 + }, + { + "epoch": 0.73, + "grad_norm": 1.57358261711917, + "learning_rate": 1.846746924665908e-06, + "loss": 0.2668, + "step": 25020 + }, + { + "epoch": 0.73, + "grad_norm": 1.2880953078510207, + "learning_rate": 1.8463824140290509e-06, + "loss": 0.289, + "step": 25021 + }, + { + "epoch": 0.73, + "grad_norm": 1.3436089692014668, + "learning_rate": 1.8460179312229021e-06, + "loss": 0.2781, + "step": 25022 + }, + { + "epoch": 0.73, + "grad_norm": 1.6702994668496505, + "learning_rate": 1.84565347625068e-06, + "loss": 0.2941, + "step": 25023 + }, + { + "epoch": 0.73, + "grad_norm": 1.3759660160109675, + "learning_rate": 1.8452890491156006e-06, + "loss": 0.2696, + "step": 25024 + }, + { + "epoch": 0.73, + "grad_norm": 1.7099622637017666, + "learning_rate": 1.844924649820881e-06, + "loss": 0.271, + "step": 25025 + }, + { + "epoch": 0.73, + "grad_norm": 1.251996728628344, + "learning_rate": 1.8445602783697375e-06, + "loss": 0.2692, + "step": 25026 + }, + { + "epoch": 0.73, + "grad_norm": 0.935368176309834, + "learning_rate": 1.8441959347653831e-06, + "loss": 0.5415, + "step": 25027 + }, + { + "epoch": 0.73, + "grad_norm": 1.3928794998137402, + "learning_rate": 1.8438316190110346e-06, + "loss": 0.2912, + "step": 25028 + }, + { + "epoch": 0.73, + "grad_norm": 1.3619516403662524, + "learning_rate": 1.8434673311099072e-06, + "loss": 0.2738, + "step": 25029 + }, + { + "epoch": 0.73, + "grad_norm": 1.2970883026827602, + "learning_rate": 1.8431030710652154e-06, + "loss": 0.3195, + "step": 25030 + }, + { + "epoch": 0.73, + "grad_norm": 1.386949135096833, + "learning_rate": 1.8427388388801742e-06, + "loss": 0.2716, + "step": 25031 + }, + { + "epoch": 0.73, + "grad_norm": 1.286686734740873, + "learning_rate": 1.8423746345579974e-06, + "loss": 0.2685, + "step": 25032 + }, + { + "epoch": 0.73, + "grad_norm": 1.2947900236998924, + "learning_rate": 1.8420104581019e-06, + "loss": 0.2865, + "step": 25033 + }, + { + "epoch": 0.73, + "grad_norm": 1.5656953885108489, + "learning_rate": 1.8416463095150967e-06, + "loss": 0.2875, + "step": 25034 + }, + { + "epoch": 0.73, + "grad_norm": 1.46385280556, + "learning_rate": 1.8412821888007981e-06, + "loss": 0.2656, + "step": 25035 + }, + { + "epoch": 0.73, + "grad_norm": 1.6714281377443825, + "learning_rate": 1.8409180959622192e-06, + "loss": 0.2844, + "step": 25036 + }, + { + "epoch": 0.73, + "grad_norm": 1.7905981984512132, + "learning_rate": 1.8405540310025733e-06, + "loss": 0.2794, + "step": 25037 + }, + { + "epoch": 0.73, + "grad_norm": 1.3024473729860513, + "learning_rate": 1.8401899939250733e-06, + "loss": 0.2998, + "step": 25038 + }, + { + "epoch": 0.73, + "grad_norm": 1.4389220789895882, + "learning_rate": 1.8398259847329314e-06, + "loss": 0.3025, + "step": 25039 + }, + { + "epoch": 0.73, + "grad_norm": 1.5063962708350096, + "learning_rate": 1.8394620034293604e-06, + "loss": 0.2669, + "step": 25040 + }, + { + "epoch": 0.73, + "grad_norm": 1.5812057542074565, + "learning_rate": 1.839098050017572e-06, + "loss": 0.2894, + "step": 25041 + }, + { + "epoch": 0.73, + "grad_norm": 1.3607731190347776, + "learning_rate": 1.8387341245007784e-06, + "loss": 0.2873, + "step": 25042 + }, + { + "epoch": 0.73, + "grad_norm": 1.4698086443947758, + "learning_rate": 1.838370226882193e-06, + "loss": 0.2712, + "step": 25043 + }, + { + "epoch": 0.73, + "grad_norm": 1.498964535096974, + "learning_rate": 1.8380063571650237e-06, + "loss": 0.2762, + "step": 25044 + }, + { + "epoch": 0.73, + "grad_norm": 1.2876124918690304, + "learning_rate": 1.8376425153524836e-06, + "loss": 0.2709, + "step": 25045 + }, + { + "epoch": 0.73, + "grad_norm": 1.204703994189978, + "learning_rate": 1.8372787014477828e-06, + "loss": 0.2668, + "step": 25046 + }, + { + "epoch": 0.73, + "grad_norm": 1.2369976885343, + "learning_rate": 1.8369149154541333e-06, + "loss": 0.2675, + "step": 25047 + }, + { + "epoch": 0.73, + "grad_norm": 1.3335086706581674, + "learning_rate": 1.8365511573747442e-06, + "loss": 0.2625, + "step": 25048 + }, + { + "epoch": 0.73, + "grad_norm": 1.3582604154528013, + "learning_rate": 1.8361874272128284e-06, + "loss": 0.2814, + "step": 25049 + }, + { + "epoch": 0.73, + "grad_norm": 1.4832899410787699, + "learning_rate": 1.8358237249715916e-06, + "loss": 0.2856, + "step": 25050 + }, + { + "epoch": 0.73, + "grad_norm": 1.4474962862884884, + "learning_rate": 1.8354600506542453e-06, + "loss": 0.3198, + "step": 25051 + }, + { + "epoch": 0.73, + "grad_norm": 0.9652564851137466, + "learning_rate": 1.8350964042639995e-06, + "loss": 0.6216, + "step": 25052 + }, + { + "epoch": 0.73, + "grad_norm": 1.2750646660068559, + "learning_rate": 1.8347327858040649e-06, + "loss": 0.2791, + "step": 25053 + }, + { + "epoch": 0.73, + "grad_norm": 1.4865395923646192, + "learning_rate": 1.8343691952776465e-06, + "loss": 0.2766, + "step": 25054 + }, + { + "epoch": 0.73, + "grad_norm": 1.2137567266010927, + "learning_rate": 1.8340056326879551e-06, + "loss": 0.2648, + "step": 25055 + }, + { + "epoch": 0.73, + "grad_norm": 1.291918710261285, + "learning_rate": 1.8336420980381996e-06, + "loss": 0.2681, + "step": 25056 + }, + { + "epoch": 0.73, + "grad_norm": 1.327933971254959, + "learning_rate": 1.833278591331587e-06, + "loss": 0.2903, + "step": 25057 + }, + { + "epoch": 0.73, + "grad_norm": 1.3056711097982627, + "learning_rate": 1.8329151125713267e-06, + "loss": 0.2614, + "step": 25058 + }, + { + "epoch": 0.73, + "grad_norm": 1.237389481975691, + "learning_rate": 1.8325516617606253e-06, + "loss": 0.2842, + "step": 25059 + }, + { + "epoch": 0.73, + "grad_norm": 1.2084650758967144, + "learning_rate": 1.8321882389026908e-06, + "loss": 0.2831, + "step": 25060 + }, + { + "epoch": 0.73, + "grad_norm": 1.2133178234564235, + "learning_rate": 1.8318248440007296e-06, + "loss": 0.2674, + "step": 25061 + }, + { + "epoch": 0.73, + "grad_norm": 1.6414634589769432, + "learning_rate": 1.831461477057952e-06, + "loss": 0.3269, + "step": 25062 + }, + { + "epoch": 0.73, + "grad_norm": 1.3471904628508469, + "learning_rate": 1.8310981380775594e-06, + "loss": 0.278, + "step": 25063 + }, + { + "epoch": 0.73, + "grad_norm": 1.3645996548037522, + "learning_rate": 1.8307348270627612e-06, + "loss": 0.2886, + "step": 25064 + }, + { + "epoch": 0.73, + "grad_norm": 1.4919562845710812, + "learning_rate": 1.8303715440167636e-06, + "loss": 0.2919, + "step": 25065 + }, + { + "epoch": 0.73, + "grad_norm": 1.6636073817922217, + "learning_rate": 1.8300082889427723e-06, + "loss": 0.2805, + "step": 25066 + }, + { + "epoch": 0.73, + "grad_norm": 1.3118732039520258, + "learning_rate": 1.8296450618439926e-06, + "loss": 0.2733, + "step": 25067 + }, + { + "epoch": 0.73, + "grad_norm": 1.4173365320716147, + "learning_rate": 1.8292818627236303e-06, + "loss": 0.2895, + "step": 25068 + }, + { + "epoch": 0.73, + "grad_norm": 1.243764091848194, + "learning_rate": 1.8289186915848912e-06, + "loss": 0.2843, + "step": 25069 + }, + { + "epoch": 0.73, + "grad_norm": 1.3129243229552303, + "learning_rate": 1.8285555484309814e-06, + "loss": 0.2834, + "step": 25070 + }, + { + "epoch": 0.73, + "grad_norm": 1.3696711182354209, + "learning_rate": 1.8281924332651024e-06, + "loss": 0.2723, + "step": 25071 + }, + { + "epoch": 0.73, + "grad_norm": 1.2797963124767204, + "learning_rate": 1.82782934609046e-06, + "loss": 0.2667, + "step": 25072 + }, + { + "epoch": 0.73, + "grad_norm": 1.3786867724274567, + "learning_rate": 1.8274662869102595e-06, + "loss": 0.2905, + "step": 25073 + }, + { + "epoch": 0.73, + "grad_norm": 1.8630397274654396, + "learning_rate": 1.8271032557277035e-06, + "loss": 0.2749, + "step": 25074 + }, + { + "epoch": 0.73, + "grad_norm": 1.3822890515324453, + "learning_rate": 1.8267402525459965e-06, + "loss": 0.298, + "step": 25075 + }, + { + "epoch": 0.73, + "grad_norm": 1.239652308634355, + "learning_rate": 1.8263772773683425e-06, + "loss": 0.2767, + "step": 25076 + }, + { + "epoch": 0.73, + "grad_norm": 1.462694585994034, + "learning_rate": 1.8260143301979455e-06, + "loss": 0.3001, + "step": 25077 + }, + { + "epoch": 0.73, + "grad_norm": 1.5883762982140057, + "learning_rate": 1.8256514110380057e-06, + "loss": 0.2773, + "step": 25078 + }, + { + "epoch": 0.73, + "grad_norm": 1.2633074939070255, + "learning_rate": 1.8252885198917276e-06, + "loss": 0.2756, + "step": 25079 + }, + { + "epoch": 0.73, + "grad_norm": 1.2893481831686053, + "learning_rate": 1.8249256567623131e-06, + "loss": 0.2803, + "step": 25080 + }, + { + "epoch": 0.73, + "grad_norm": 1.5779158806503741, + "learning_rate": 1.824562821652967e-06, + "loss": 0.3225, + "step": 25081 + }, + { + "epoch": 0.73, + "grad_norm": 1.5638761947791584, + "learning_rate": 1.8242000145668875e-06, + "loss": 0.293, + "step": 25082 + }, + { + "epoch": 0.73, + "grad_norm": 1.2972439393945143, + "learning_rate": 1.8238372355072782e-06, + "loss": 0.291, + "step": 25083 + }, + { + "epoch": 0.73, + "grad_norm": 3.9512233368323235, + "learning_rate": 1.82347448447734e-06, + "loss": 0.2722, + "step": 25084 + }, + { + "epoch": 0.73, + "grad_norm": 1.2821671373632113, + "learning_rate": 1.823111761480275e-06, + "loss": 0.3223, + "step": 25085 + }, + { + "epoch": 0.73, + "grad_norm": 1.4712202578387803, + "learning_rate": 1.8227490665192838e-06, + "loss": 0.2736, + "step": 25086 + }, + { + "epoch": 0.73, + "grad_norm": 1.2566675251431707, + "learning_rate": 1.8223863995975678e-06, + "loss": 0.2748, + "step": 25087 + }, + { + "epoch": 0.73, + "grad_norm": 1.1823028695967903, + "learning_rate": 1.8220237607183266e-06, + "loss": 0.2642, + "step": 25088 + }, + { + "epoch": 0.73, + "grad_norm": 1.322504729044164, + "learning_rate": 1.821661149884763e-06, + "loss": 0.2755, + "step": 25089 + }, + { + "epoch": 0.73, + "grad_norm": 1.3693643073606643, + "learning_rate": 1.8212985671000732e-06, + "loss": 0.3059, + "step": 25090 + }, + { + "epoch": 0.73, + "grad_norm": 1.3173285236615717, + "learning_rate": 1.8209360123674585e-06, + "loss": 0.2627, + "step": 25091 + }, + { + "epoch": 0.73, + "grad_norm": 1.306236644188076, + "learning_rate": 1.820573485690119e-06, + "loss": 0.2616, + "step": 25092 + }, + { + "epoch": 0.73, + "grad_norm": 1.570887822230736, + "learning_rate": 1.8202109870712542e-06, + "loss": 0.2654, + "step": 25093 + }, + { + "epoch": 0.73, + "grad_norm": 1.294808385859915, + "learning_rate": 1.8198485165140623e-06, + "loss": 0.2874, + "step": 25094 + }, + { + "epoch": 0.73, + "grad_norm": 1.2637070223050677, + "learning_rate": 1.8194860740217423e-06, + "loss": 0.2719, + "step": 25095 + }, + { + "epoch": 0.73, + "grad_norm": 1.4231490643581293, + "learning_rate": 1.819123659597493e-06, + "loss": 0.2981, + "step": 25096 + }, + { + "epoch": 0.73, + "grad_norm": 1.2877924026449916, + "learning_rate": 1.818761273244513e-06, + "loss": 0.2912, + "step": 25097 + }, + { + "epoch": 0.73, + "grad_norm": 1.5340805600725744, + "learning_rate": 1.818398914966002e-06, + "loss": 0.3156, + "step": 25098 + }, + { + "epoch": 0.73, + "grad_norm": 4.339630854994888, + "learning_rate": 1.8180365847651538e-06, + "loss": 0.2766, + "step": 25099 + }, + { + "epoch": 0.73, + "grad_norm": 1.6379144853935854, + "learning_rate": 1.8176742826451677e-06, + "loss": 0.2791, + "step": 25100 + }, + { + "epoch": 0.73, + "grad_norm": 2.228160191307944, + "learning_rate": 1.8173120086092417e-06, + "loss": 0.268, + "step": 25101 + }, + { + "epoch": 0.73, + "grad_norm": 1.3429476236293736, + "learning_rate": 1.8169497626605726e-06, + "loss": 0.2761, + "step": 25102 + }, + { + "epoch": 0.73, + "grad_norm": 1.2885567451849804, + "learning_rate": 1.8165875448023573e-06, + "loss": 0.2816, + "step": 25103 + }, + { + "epoch": 0.73, + "grad_norm": 1.3400377274603397, + "learning_rate": 1.8162253550377917e-06, + "loss": 0.2944, + "step": 25104 + }, + { + "epoch": 0.73, + "grad_norm": 0.9789521076243157, + "learning_rate": 1.8158631933700749e-06, + "loss": 0.5458, + "step": 25105 + }, + { + "epoch": 0.73, + "grad_norm": 1.5578720526426573, + "learning_rate": 1.8155010598023987e-06, + "loss": 0.2876, + "step": 25106 + }, + { + "epoch": 0.73, + "grad_norm": 1.2737217979875295, + "learning_rate": 1.8151389543379611e-06, + "loss": 0.2617, + "step": 25107 + }, + { + "epoch": 0.73, + "grad_norm": 1.3579642925744098, + "learning_rate": 1.8147768769799595e-06, + "loss": 0.2705, + "step": 25108 + }, + { + "epoch": 0.73, + "grad_norm": 1.332748839788704, + "learning_rate": 1.8144148277315854e-06, + "loss": 0.2735, + "step": 25109 + }, + { + "epoch": 0.73, + "grad_norm": 1.3810243369047754, + "learning_rate": 1.8140528065960355e-06, + "loss": 0.2664, + "step": 25110 + }, + { + "epoch": 0.73, + "grad_norm": 1.5010690582310355, + "learning_rate": 1.8136908135765052e-06, + "loss": 0.2624, + "step": 25111 + }, + { + "epoch": 0.73, + "grad_norm": 1.3419563116154618, + "learning_rate": 1.813328848676189e-06, + "loss": 0.2789, + "step": 25112 + }, + { + "epoch": 0.73, + "grad_norm": 1.824986443366424, + "learning_rate": 1.8129669118982807e-06, + "loss": 0.3132, + "step": 25113 + }, + { + "epoch": 0.73, + "grad_norm": 0.9497928871060826, + "learning_rate": 1.8126050032459746e-06, + "loss": 0.5988, + "step": 25114 + }, + { + "epoch": 0.73, + "grad_norm": 1.4338948775977973, + "learning_rate": 1.8122431227224652e-06, + "loss": 0.277, + "step": 25115 + }, + { + "epoch": 0.73, + "grad_norm": 3.626938105642605, + "learning_rate": 1.8118812703309457e-06, + "loss": 0.2852, + "step": 25116 + }, + { + "epoch": 0.73, + "grad_norm": 1.328504490535427, + "learning_rate": 1.8115194460746104e-06, + "loss": 0.2763, + "step": 25117 + }, + { + "epoch": 0.73, + "grad_norm": 1.3071375396895573, + "learning_rate": 1.8111576499566501e-06, + "loss": 0.2834, + "step": 25118 + }, + { + "epoch": 0.73, + "grad_norm": 1.5668115514372323, + "learning_rate": 1.810795881980259e-06, + "loss": 0.2832, + "step": 25119 + }, + { + "epoch": 0.73, + "grad_norm": 1.2994138649564484, + "learning_rate": 1.8104341421486293e-06, + "loss": 0.2771, + "step": 25120 + }, + { + "epoch": 0.73, + "grad_norm": 1.4909522157748685, + "learning_rate": 1.8100724304649542e-06, + "loss": 0.2853, + "step": 25121 + }, + { + "epoch": 0.73, + "grad_norm": 1.5436175492773827, + "learning_rate": 1.809710746932425e-06, + "loss": 0.268, + "step": 25122 + }, + { + "epoch": 0.73, + "grad_norm": 1.3193772040989304, + "learning_rate": 1.8093490915542334e-06, + "loss": 0.2735, + "step": 25123 + }, + { + "epoch": 0.73, + "grad_norm": 1.8709976542531237, + "learning_rate": 1.808987464333572e-06, + "loss": 0.3017, + "step": 25124 + }, + { + "epoch": 0.73, + "grad_norm": 1.481917295362247, + "learning_rate": 1.8086258652736332e-06, + "loss": 0.3178, + "step": 25125 + }, + { + "epoch": 0.73, + "grad_norm": 1.6848857888811564, + "learning_rate": 1.8082642943776047e-06, + "loss": 0.2805, + "step": 25126 + }, + { + "epoch": 0.73, + "grad_norm": 1.2633118264861423, + "learning_rate": 1.807902751648679e-06, + "loss": 0.2991, + "step": 25127 + }, + { + "epoch": 0.73, + "grad_norm": 1.6036714935638543, + "learning_rate": 1.8075412370900475e-06, + "loss": 0.3154, + "step": 25128 + }, + { + "epoch": 0.73, + "grad_norm": 1.3290394217391304, + "learning_rate": 1.8071797507049e-06, + "loss": 0.2997, + "step": 25129 + }, + { + "epoch": 0.73, + "grad_norm": 1.3932469835573955, + "learning_rate": 1.8068182924964262e-06, + "loss": 0.2863, + "step": 25130 + }, + { + "epoch": 0.73, + "grad_norm": 1.4418226188625483, + "learning_rate": 1.8064568624678163e-06, + "loss": 0.2897, + "step": 25131 + }, + { + "epoch": 0.73, + "grad_norm": 1.4439337921208317, + "learning_rate": 1.806095460622261e-06, + "loss": 0.2869, + "step": 25132 + }, + { + "epoch": 0.73, + "grad_norm": 1.4140047592416156, + "learning_rate": 1.805734086962949e-06, + "loss": 0.2877, + "step": 25133 + }, + { + "epoch": 0.73, + "grad_norm": 2.2805967033654926, + "learning_rate": 1.8053727414930683e-06, + "loss": 0.2852, + "step": 25134 + }, + { + "epoch": 0.73, + "grad_norm": 1.7929308751797386, + "learning_rate": 1.8050114242158083e-06, + "loss": 0.3109, + "step": 25135 + }, + { + "epoch": 0.73, + "grad_norm": 1.279419027799878, + "learning_rate": 1.8046501351343599e-06, + "loss": 0.2698, + "step": 25136 + }, + { + "epoch": 0.73, + "grad_norm": 1.2306235393990963, + "learning_rate": 1.8042888742519077e-06, + "loss": 0.2816, + "step": 25137 + }, + { + "epoch": 0.73, + "grad_norm": 1.5883978827097085, + "learning_rate": 1.8039276415716411e-06, + "loss": 0.2635, + "step": 25138 + }, + { + "epoch": 0.73, + "grad_norm": 1.2375070730799052, + "learning_rate": 1.8035664370967493e-06, + "loss": 0.267, + "step": 25139 + }, + { + "epoch": 0.73, + "grad_norm": 1.2921587322020749, + "learning_rate": 1.803205260830419e-06, + "loss": 0.2655, + "step": 25140 + }, + { + "epoch": 0.73, + "grad_norm": 1.343726221095622, + "learning_rate": 1.8028441127758373e-06, + "loss": 0.2786, + "step": 25141 + }, + { + "epoch": 0.73, + "grad_norm": 1.254574555953447, + "learning_rate": 1.802482992936192e-06, + "loss": 0.2667, + "step": 25142 + }, + { + "epoch": 0.73, + "grad_norm": 1.3683906139876791, + "learning_rate": 1.8021219013146696e-06, + "loss": 0.2665, + "step": 25143 + }, + { + "epoch": 0.73, + "grad_norm": 1.271072895440581, + "learning_rate": 1.801760837914459e-06, + "loss": 0.2824, + "step": 25144 + }, + { + "epoch": 0.73, + "grad_norm": 1.4718397230045188, + "learning_rate": 1.8013998027387425e-06, + "loss": 0.2866, + "step": 25145 + }, + { + "epoch": 0.73, + "grad_norm": 1.9175291710540738, + "learning_rate": 1.8010387957907082e-06, + "loss": 0.276, + "step": 25146 + }, + { + "epoch": 0.73, + "grad_norm": 1.5482809034048068, + "learning_rate": 1.8006778170735422e-06, + "loss": 0.2649, + "step": 25147 + }, + { + "epoch": 0.73, + "grad_norm": 1.3751183800871931, + "learning_rate": 1.8003168665904297e-06, + "loss": 0.2892, + "step": 25148 + }, + { + "epoch": 0.73, + "grad_norm": 1.3750962057190774, + "learning_rate": 1.7999559443445569e-06, + "loss": 0.2679, + "step": 25149 + }, + { + "epoch": 0.73, + "grad_norm": 1.3436683977202681, + "learning_rate": 1.7995950503391079e-06, + "loss": 0.2783, + "step": 25150 + }, + { + "epoch": 0.73, + "grad_norm": 1.2798734215506264, + "learning_rate": 1.7992341845772682e-06, + "loss": 0.2863, + "step": 25151 + }, + { + "epoch": 0.73, + "grad_norm": 1.275243907396234, + "learning_rate": 1.7988733470622222e-06, + "loss": 0.279, + "step": 25152 + }, + { + "epoch": 0.73, + "grad_norm": 1.4369053574291353, + "learning_rate": 1.7985125377971563e-06, + "loss": 0.2945, + "step": 25153 + }, + { + "epoch": 0.73, + "grad_norm": 1.3233427564032334, + "learning_rate": 1.7981517567852507e-06, + "loss": 0.2904, + "step": 25154 + }, + { + "epoch": 0.73, + "grad_norm": 1.2770195603883188, + "learning_rate": 1.797791004029692e-06, + "loss": 0.2849, + "step": 25155 + }, + { + "epoch": 0.73, + "grad_norm": 1.3626212029837497, + "learning_rate": 1.797430279533663e-06, + "loss": 0.2722, + "step": 25156 + }, + { + "epoch": 0.73, + "grad_norm": 1.8325163320455262, + "learning_rate": 1.797069583300347e-06, + "loss": 0.313, + "step": 25157 + }, + { + "epoch": 0.73, + "grad_norm": 1.3676884741494053, + "learning_rate": 1.7967089153329275e-06, + "loss": 0.2814, + "step": 25158 + }, + { + "epoch": 0.73, + "grad_norm": 1.2197953473752443, + "learning_rate": 1.796348275634588e-06, + "loss": 0.2761, + "step": 25159 + }, + { + "epoch": 0.73, + "grad_norm": 1.376221867918255, + "learning_rate": 1.79598766420851e-06, + "loss": 0.2792, + "step": 25160 + }, + { + "epoch": 0.73, + "grad_norm": 1.3594694910195033, + "learning_rate": 1.795627081057878e-06, + "loss": 0.2906, + "step": 25161 + }, + { + "epoch": 0.73, + "grad_norm": 1.3014004887325816, + "learning_rate": 1.795266526185871e-06, + "loss": 0.2769, + "step": 25162 + }, + { + "epoch": 0.73, + "grad_norm": 1.3305450631439357, + "learning_rate": 1.794905999595674e-06, + "loss": 0.2992, + "step": 25163 + }, + { + "epoch": 0.73, + "grad_norm": 1.2561051811469996, + "learning_rate": 1.7945455012904655e-06, + "loss": 0.2759, + "step": 25164 + }, + { + "epoch": 0.73, + "grad_norm": 0.9838232537224828, + "learning_rate": 1.7941850312734287e-06, + "loss": 0.6084, + "step": 25165 + }, + { + "epoch": 0.73, + "grad_norm": 1.355519552596093, + "learning_rate": 1.793824589547744e-06, + "loss": 0.262, + "step": 25166 + }, + { + "epoch": 0.73, + "grad_norm": 1.4150495253648256, + "learning_rate": 1.7934641761165938e-06, + "loss": 0.2662, + "step": 25167 + }, + { + "epoch": 0.73, + "grad_norm": 1.2667505150734932, + "learning_rate": 1.7931037909831573e-06, + "loss": 0.2619, + "step": 25168 + }, + { + "epoch": 0.73, + "grad_norm": 1.389559376676607, + "learning_rate": 1.792743434150615e-06, + "loss": 0.293, + "step": 25169 + }, + { + "epoch": 0.73, + "grad_norm": 1.2577797127404242, + "learning_rate": 1.792383105622148e-06, + "loss": 0.2869, + "step": 25170 + }, + { + "epoch": 0.73, + "grad_norm": 1.4107657884221576, + "learning_rate": 1.7920228054009358e-06, + "loss": 0.2762, + "step": 25171 + }, + { + "epoch": 0.73, + "grad_norm": 1.2734377328009416, + "learning_rate": 1.7916625334901594e-06, + "loss": 0.3049, + "step": 25172 + }, + { + "epoch": 0.73, + "grad_norm": 1.2873189475483018, + "learning_rate": 1.791302289892995e-06, + "loss": 0.277, + "step": 25173 + }, + { + "epoch": 0.73, + "grad_norm": 1.9578360088820375, + "learning_rate": 1.7909420746126232e-06, + "loss": 0.2698, + "step": 25174 + }, + { + "epoch": 0.73, + "grad_norm": 1.5097369758196098, + "learning_rate": 1.7905818876522242e-06, + "loss": 0.2726, + "step": 25175 + }, + { + "epoch": 0.73, + "grad_norm": 1.592197175376899, + "learning_rate": 1.7902217290149749e-06, + "loss": 0.3042, + "step": 25176 + }, + { + "epoch": 0.73, + "grad_norm": 1.3837825781894737, + "learning_rate": 1.789861598704054e-06, + "loss": 0.2714, + "step": 25177 + }, + { + "epoch": 0.73, + "grad_norm": 1.5907578758001777, + "learning_rate": 1.789501496722641e-06, + "loss": 0.2882, + "step": 25178 + }, + { + "epoch": 0.73, + "grad_norm": 1.4716559332057175, + "learning_rate": 1.7891414230739123e-06, + "loss": 0.2826, + "step": 25179 + }, + { + "epoch": 0.73, + "grad_norm": 1.2846812736859883, + "learning_rate": 1.788781377761048e-06, + "loss": 0.2807, + "step": 25180 + }, + { + "epoch": 0.73, + "grad_norm": 1.293378947585154, + "learning_rate": 1.788421360787222e-06, + "loss": 0.2591, + "step": 25181 + }, + { + "epoch": 0.73, + "grad_norm": 1.3939732521925983, + "learning_rate": 1.7880613721556134e-06, + "loss": 0.2832, + "step": 25182 + }, + { + "epoch": 0.73, + "grad_norm": 1.373970476500958, + "learning_rate": 1.7877014118693986e-06, + "loss": 0.3379, + "step": 25183 + }, + { + "epoch": 0.73, + "grad_norm": 1.5388108213297091, + "learning_rate": 1.7873414799317545e-06, + "loss": 0.2751, + "step": 25184 + }, + { + "epoch": 0.73, + "grad_norm": 1.9917201366931843, + "learning_rate": 1.7869815763458576e-06, + "loss": 0.2916, + "step": 25185 + }, + { + "epoch": 0.73, + "grad_norm": 1.3212303867002588, + "learning_rate": 1.7866217011148835e-06, + "loss": 0.2735, + "step": 25186 + }, + { + "epoch": 0.73, + "grad_norm": 1.3847974922430442, + "learning_rate": 1.786261854242009e-06, + "loss": 0.3031, + "step": 25187 + }, + { + "epoch": 0.73, + "grad_norm": 1.306790145190154, + "learning_rate": 1.785902035730409e-06, + "loss": 0.2766, + "step": 25188 + }, + { + "epoch": 0.73, + "grad_norm": 1.3400862822497968, + "learning_rate": 1.785542245583261e-06, + "loss": 0.285, + "step": 25189 + }, + { + "epoch": 0.73, + "grad_norm": 1.4414034403960294, + "learning_rate": 1.7851824838037385e-06, + "loss": 0.2951, + "step": 25190 + }, + { + "epoch": 0.73, + "grad_norm": 1.545371056590212, + "learning_rate": 1.7848227503950138e-06, + "loss": 0.3101, + "step": 25191 + }, + { + "epoch": 0.73, + "grad_norm": 1.6898876999514707, + "learning_rate": 1.7844630453602646e-06, + "loss": 0.2734, + "step": 25192 + }, + { + "epoch": 0.73, + "grad_norm": 1.705466913710647, + "learning_rate": 1.7841033687026648e-06, + "loss": 0.289, + "step": 25193 + }, + { + "epoch": 0.73, + "grad_norm": 1.372409986433954, + "learning_rate": 1.7837437204253882e-06, + "loss": 0.283, + "step": 25194 + }, + { + "epoch": 0.73, + "grad_norm": 1.3717253040981852, + "learning_rate": 1.7833841005316094e-06, + "loss": 0.289, + "step": 25195 + }, + { + "epoch": 0.73, + "grad_norm": 1.3770944589746363, + "learning_rate": 1.7830245090245013e-06, + "loss": 0.2589, + "step": 25196 + }, + { + "epoch": 0.73, + "grad_norm": 1.2614291917818226, + "learning_rate": 1.7826649459072376e-06, + "loss": 0.2818, + "step": 25197 + }, + { + "epoch": 0.73, + "grad_norm": 1.2673778200890595, + "learning_rate": 1.7823054111829912e-06, + "loss": 0.2868, + "step": 25198 + }, + { + "epoch": 0.73, + "grad_norm": 1.2459943923815857, + "learning_rate": 1.781945904854937e-06, + "loss": 0.2786, + "step": 25199 + }, + { + "epoch": 0.73, + "grad_norm": 1.4772873353456348, + "learning_rate": 1.7815864269262444e-06, + "loss": 0.2784, + "step": 25200 + }, + { + "epoch": 0.73, + "grad_norm": 1.5176088731117843, + "learning_rate": 1.7812269774000872e-06, + "loss": 0.2918, + "step": 25201 + }, + { + "epoch": 0.73, + "grad_norm": 1.7330910671056707, + "learning_rate": 1.780867556279638e-06, + "loss": 0.2686, + "step": 25202 + }, + { + "epoch": 0.73, + "grad_norm": 1.0085095581007295, + "learning_rate": 1.7805081635680677e-06, + "loss": 0.601, + "step": 25203 + }, + { + "epoch": 0.73, + "grad_norm": 1.4109267934423684, + "learning_rate": 1.780148799268549e-06, + "loss": 0.3034, + "step": 25204 + }, + { + "epoch": 0.73, + "grad_norm": 1.3860134339440953, + "learning_rate": 1.7797894633842528e-06, + "loss": 0.2886, + "step": 25205 + }, + { + "epoch": 0.73, + "grad_norm": 1.5769348209714826, + "learning_rate": 1.7794301559183496e-06, + "loss": 0.2774, + "step": 25206 + }, + { + "epoch": 0.73, + "grad_norm": 1.655184671177166, + "learning_rate": 1.7790708768740116e-06, + "loss": 0.2852, + "step": 25207 + }, + { + "epoch": 0.73, + "grad_norm": 1.041712705030098, + "learning_rate": 1.7787116262544108e-06, + "loss": 0.5656, + "step": 25208 + }, + { + "epoch": 0.73, + "grad_norm": 1.2317584297942712, + "learning_rate": 1.7783524040627136e-06, + "loss": 0.2601, + "step": 25209 + }, + { + "epoch": 0.73, + "grad_norm": 1.9545589979083293, + "learning_rate": 1.7779932103020925e-06, + "loss": 0.2956, + "step": 25210 + }, + { + "epoch": 0.73, + "grad_norm": 1.2925046507354199, + "learning_rate": 1.7776340449757167e-06, + "loss": 0.2677, + "step": 25211 + }, + { + "epoch": 0.73, + "grad_norm": 1.639042237594346, + "learning_rate": 1.7772749080867562e-06, + "loss": 0.3113, + "step": 25212 + }, + { + "epoch": 0.73, + "grad_norm": 1.3281305803991887, + "learning_rate": 1.7769157996383807e-06, + "loss": 0.2698, + "step": 25213 + }, + { + "epoch": 0.73, + "grad_norm": 1.5696633076279487, + "learning_rate": 1.776556719633759e-06, + "loss": 0.2633, + "step": 25214 + }, + { + "epoch": 0.73, + "grad_norm": 2.092548235641696, + "learning_rate": 1.7761976680760596e-06, + "loss": 0.2584, + "step": 25215 + }, + { + "epoch": 0.73, + "grad_norm": 2.0448367428150274, + "learning_rate": 1.7758386449684534e-06, + "loss": 0.2939, + "step": 25216 + }, + { + "epoch": 0.73, + "grad_norm": 1.353811613752057, + "learning_rate": 1.7754796503141053e-06, + "loss": 0.271, + "step": 25217 + }, + { + "epoch": 0.73, + "grad_norm": 1.3972601042147124, + "learning_rate": 1.7751206841161871e-06, + "loss": 0.3336, + "step": 25218 + }, + { + "epoch": 0.73, + "grad_norm": 1.4797183100816451, + "learning_rate": 1.7747617463778627e-06, + "loss": 0.2643, + "step": 25219 + }, + { + "epoch": 0.73, + "grad_norm": 1.5388367828995104, + "learning_rate": 1.7744028371023014e-06, + "loss": 0.2659, + "step": 25220 + }, + { + "epoch": 0.73, + "grad_norm": 1.3043153679804245, + "learning_rate": 1.774043956292671e-06, + "loss": 0.2656, + "step": 25221 + }, + { + "epoch": 0.73, + "grad_norm": 1.2268008630705132, + "learning_rate": 1.7736851039521386e-06, + "loss": 0.279, + "step": 25222 + }, + { + "epoch": 0.73, + "grad_norm": 1.3353575557732515, + "learning_rate": 1.7733262800838707e-06, + "loss": 0.2654, + "step": 25223 + }, + { + "epoch": 0.73, + "grad_norm": 1.4094868545222965, + "learning_rate": 1.7729674846910345e-06, + "loss": 0.3128, + "step": 25224 + }, + { + "epoch": 0.73, + "grad_norm": 1.4063156229723615, + "learning_rate": 1.7726087177767959e-06, + "loss": 0.2691, + "step": 25225 + }, + { + "epoch": 0.73, + "grad_norm": 1.6343540819623406, + "learning_rate": 1.7722499793443226e-06, + "loss": 0.2966, + "step": 25226 + }, + { + "epoch": 0.73, + "grad_norm": 1.397253050642576, + "learning_rate": 1.7718912693967777e-06, + "loss": 0.2868, + "step": 25227 + }, + { + "epoch": 0.73, + "grad_norm": 1.3973476655248882, + "learning_rate": 1.771532587937328e-06, + "loss": 0.2868, + "step": 25228 + }, + { + "epoch": 0.73, + "grad_norm": 1.281602094822467, + "learning_rate": 1.7711739349691386e-06, + "loss": 0.3025, + "step": 25229 + }, + { + "epoch": 0.73, + "grad_norm": 1.4651296393021587, + "learning_rate": 1.7708153104953757e-06, + "loss": 0.2837, + "step": 25230 + }, + { + "epoch": 0.73, + "grad_norm": 1.5782110940130818, + "learning_rate": 1.7704567145192036e-06, + "loss": 0.2898, + "step": 25231 + }, + { + "epoch": 0.73, + "grad_norm": 1.4452412333491713, + "learning_rate": 1.770098147043786e-06, + "loss": 0.28, + "step": 25232 + }, + { + "epoch": 0.73, + "grad_norm": 1.8882190355558304, + "learning_rate": 1.769739608072289e-06, + "loss": 0.277, + "step": 25233 + }, + { + "epoch": 0.73, + "grad_norm": 1.441721470868897, + "learning_rate": 1.7693810976078752e-06, + "loss": 0.2968, + "step": 25234 + }, + { + "epoch": 0.73, + "grad_norm": 1.3923385171542433, + "learning_rate": 1.7690226156537111e-06, + "loss": 0.2916, + "step": 25235 + }, + { + "epoch": 0.73, + "grad_norm": 1.1888976323334337, + "learning_rate": 1.7686641622129563e-06, + "loss": 0.2762, + "step": 25236 + }, + { + "epoch": 0.73, + "grad_norm": 1.9831106172743829, + "learning_rate": 1.7683057372887762e-06, + "loss": 0.2761, + "step": 25237 + }, + { + "epoch": 0.73, + "grad_norm": 1.6914353629954104, + "learning_rate": 1.767947340884334e-06, + "loss": 0.3021, + "step": 25238 + }, + { + "epoch": 0.73, + "grad_norm": 1.3720408953292171, + "learning_rate": 1.7675889730027924e-06, + "loss": 0.2891, + "step": 25239 + }, + { + "epoch": 0.73, + "grad_norm": 1.370738563362021, + "learning_rate": 1.7672306336473139e-06, + "loss": 0.2709, + "step": 25240 + }, + { + "epoch": 0.73, + "grad_norm": 1.2022168444541024, + "learning_rate": 1.7668723228210605e-06, + "loss": 0.2691, + "step": 25241 + }, + { + "epoch": 0.73, + "grad_norm": 1.309581503736131, + "learning_rate": 1.7665140405271953e-06, + "loss": 0.2823, + "step": 25242 + }, + { + "epoch": 0.73, + "grad_norm": 0.8956174300844051, + "learning_rate": 1.766155786768879e-06, + "loss": 0.5393, + "step": 25243 + }, + { + "epoch": 0.73, + "grad_norm": 1.2345174209398555, + "learning_rate": 1.7657975615492757e-06, + "loss": 0.2748, + "step": 25244 + }, + { + "epoch": 0.73, + "grad_norm": 1.1986432384054542, + "learning_rate": 1.7654393648715435e-06, + "loss": 0.2743, + "step": 25245 + }, + { + "epoch": 0.73, + "grad_norm": 1.248011501009083, + "learning_rate": 1.765081196738846e-06, + "loss": 0.2816, + "step": 25246 + }, + { + "epoch": 0.73, + "grad_norm": 1.488091879040128, + "learning_rate": 1.7647230571543412e-06, + "loss": 0.2743, + "step": 25247 + }, + { + "epoch": 0.73, + "grad_norm": 0.9127792118873911, + "learning_rate": 1.7643649461211915e-06, + "loss": 0.5957, + "step": 25248 + }, + { + "epoch": 0.73, + "grad_norm": 1.3701400007188251, + "learning_rate": 1.7640068636425566e-06, + "loss": 0.2706, + "step": 25249 + }, + { + "epoch": 0.73, + "grad_norm": 1.3876328192895235, + "learning_rate": 1.7636488097215976e-06, + "loss": 0.2687, + "step": 25250 + }, + { + "epoch": 0.73, + "grad_norm": 1.3110481911582774, + "learning_rate": 1.7632907843614734e-06, + "loss": 0.2752, + "step": 25251 + }, + { + "epoch": 0.73, + "grad_norm": 1.2799689783705257, + "learning_rate": 1.762932787565344e-06, + "loss": 0.2548, + "step": 25252 + }, + { + "epoch": 0.73, + "grad_norm": 1.3118325785945364, + "learning_rate": 1.7625748193363683e-06, + "loss": 0.2791, + "step": 25253 + }, + { + "epoch": 0.73, + "grad_norm": 1.4950842729306095, + "learning_rate": 1.7622168796777083e-06, + "loss": 0.2698, + "step": 25254 + }, + { + "epoch": 0.73, + "grad_norm": 1.2254901934596203, + "learning_rate": 1.761858968592518e-06, + "loss": 0.2656, + "step": 25255 + }, + { + "epoch": 0.73, + "grad_norm": 0.9174341593101291, + "learning_rate": 1.7615010860839582e-06, + "loss": 0.5453, + "step": 25256 + }, + { + "epoch": 0.73, + "grad_norm": 1.3982177684467847, + "learning_rate": 1.7611432321551875e-06, + "loss": 0.2876, + "step": 25257 + }, + { + "epoch": 0.73, + "grad_norm": 1.2146890488867876, + "learning_rate": 1.7607854068093632e-06, + "loss": 0.2725, + "step": 25258 + }, + { + "epoch": 0.73, + "grad_norm": 1.3165641621227786, + "learning_rate": 1.7604276100496443e-06, + "loss": 0.268, + "step": 25259 + }, + { + "epoch": 0.73, + "grad_norm": 1.3281374330135596, + "learning_rate": 1.7600698418791872e-06, + "loss": 0.2902, + "step": 25260 + }, + { + "epoch": 0.73, + "grad_norm": 1.249936406807943, + "learning_rate": 1.7597121023011498e-06, + "loss": 0.2885, + "step": 25261 + }, + { + "epoch": 0.73, + "grad_norm": 1.3055763162561083, + "learning_rate": 1.7593543913186912e-06, + "loss": 0.2658, + "step": 25262 + }, + { + "epoch": 0.73, + "grad_norm": 1.2722080320916143, + "learning_rate": 1.7589967089349636e-06, + "loss": 0.279, + "step": 25263 + }, + { + "epoch": 0.73, + "grad_norm": 1.3136355127613435, + "learning_rate": 1.7586390551531263e-06, + "loss": 0.2686, + "step": 25264 + }, + { + "epoch": 0.73, + "grad_norm": 1.259830556689212, + "learning_rate": 1.7582814299763351e-06, + "loss": 0.28, + "step": 25265 + }, + { + "epoch": 0.73, + "grad_norm": 1.5442389536783712, + "learning_rate": 1.757923833407747e-06, + "loss": 0.2852, + "step": 25266 + }, + { + "epoch": 0.73, + "grad_norm": 1.2805129174489576, + "learning_rate": 1.7575662654505161e-06, + "loss": 0.2627, + "step": 25267 + }, + { + "epoch": 0.73, + "grad_norm": 1.4633763110837839, + "learning_rate": 1.7572087261077991e-06, + "loss": 0.3008, + "step": 25268 + }, + { + "epoch": 0.73, + "grad_norm": 1.2997647062763766, + "learning_rate": 1.7568512153827516e-06, + "loss": 0.2838, + "step": 25269 + }, + { + "epoch": 0.73, + "grad_norm": 1.588411496026635, + "learning_rate": 1.7564937332785276e-06, + "loss": 0.3027, + "step": 25270 + }, + { + "epoch": 0.73, + "grad_norm": 1.3720206913548278, + "learning_rate": 1.7561362797982845e-06, + "loss": 0.269, + "step": 25271 + }, + { + "epoch": 0.73, + "grad_norm": 1.373775395153502, + "learning_rate": 1.7557788549451727e-06, + "loss": 0.2983, + "step": 25272 + }, + { + "epoch": 0.73, + "grad_norm": 1.48909169030384, + "learning_rate": 1.7554214587223483e-06, + "loss": 0.2682, + "step": 25273 + }, + { + "epoch": 0.73, + "grad_norm": 1.2007222914989586, + "learning_rate": 1.7550640911329675e-06, + "loss": 0.2676, + "step": 25274 + }, + { + "epoch": 0.73, + "grad_norm": 1.627055590119946, + "learning_rate": 1.7547067521801803e-06, + "loss": 0.29, + "step": 25275 + }, + { + "epoch": 0.73, + "grad_norm": 1.281058244074254, + "learning_rate": 1.754349441867142e-06, + "loss": 0.2727, + "step": 25276 + }, + { + "epoch": 0.73, + "grad_norm": 1.3692727463085175, + "learning_rate": 1.753992160197006e-06, + "loss": 0.2834, + "step": 25277 + }, + { + "epoch": 0.73, + "grad_norm": 1.2161255615384448, + "learning_rate": 1.7536349071729253e-06, + "loss": 0.276, + "step": 25278 + }, + { + "epoch": 0.73, + "grad_norm": 1.2962424328981663, + "learning_rate": 1.7532776827980524e-06, + "loss": 0.2901, + "step": 25279 + }, + { + "epoch": 0.73, + "grad_norm": 1.4024318205492834, + "learning_rate": 1.7529204870755402e-06, + "loss": 0.2843, + "step": 25280 + }, + { + "epoch": 0.73, + "grad_norm": 1.4384978782009603, + "learning_rate": 1.7525633200085418e-06, + "loss": 0.2692, + "step": 25281 + }, + { + "epoch": 0.73, + "grad_norm": 1.3400112222121037, + "learning_rate": 1.7522061816002067e-06, + "loss": 0.2766, + "step": 25282 + }, + { + "epoch": 0.73, + "grad_norm": 1.330768627734205, + "learning_rate": 1.7518490718536884e-06, + "loss": 0.2753, + "step": 25283 + }, + { + "epoch": 0.73, + "grad_norm": 1.2689434834682913, + "learning_rate": 1.751491990772138e-06, + "loss": 0.2844, + "step": 25284 + }, + { + "epoch": 0.73, + "grad_norm": 2.728009619887355, + "learning_rate": 1.7511349383587068e-06, + "loss": 0.2863, + "step": 25285 + }, + { + "epoch": 0.73, + "grad_norm": 1.3708661409739837, + "learning_rate": 1.7507779146165455e-06, + "loss": 0.2718, + "step": 25286 + }, + { + "epoch": 0.73, + "grad_norm": 1.2748779441453382, + "learning_rate": 1.7504209195488053e-06, + "loss": 0.2773, + "step": 25287 + }, + { + "epoch": 0.73, + "grad_norm": 1.9743243955442087, + "learning_rate": 1.7500639531586365e-06, + "loss": 0.2787, + "step": 25288 + }, + { + "epoch": 0.73, + "grad_norm": 3.04000679459097, + "learning_rate": 1.7497070154491897e-06, + "loss": 0.307, + "step": 25289 + }, + { + "epoch": 0.73, + "grad_norm": 1.1754862655702332, + "learning_rate": 1.7493501064236158e-06, + "loss": 0.2753, + "step": 25290 + }, + { + "epoch": 0.73, + "grad_norm": 1.2039325662178852, + "learning_rate": 1.7489932260850617e-06, + "loss": 0.2528, + "step": 25291 + }, + { + "epoch": 0.73, + "grad_norm": 1.4100267003488254, + "learning_rate": 1.7486363744366786e-06, + "loss": 0.2835, + "step": 25292 + }, + { + "epoch": 0.73, + "grad_norm": 2.122617154116135, + "learning_rate": 1.7482795514816154e-06, + "loss": 0.2797, + "step": 25293 + }, + { + "epoch": 0.73, + "grad_norm": 1.3285811002213848, + "learning_rate": 1.7479227572230211e-06, + "loss": 0.2817, + "step": 25294 + }, + { + "epoch": 0.73, + "grad_norm": 1.5079426101440048, + "learning_rate": 1.7475659916640448e-06, + "loss": 0.2796, + "step": 25295 + }, + { + "epoch": 0.73, + "grad_norm": 1.5566061047528308, + "learning_rate": 1.7472092548078345e-06, + "loss": 0.3139, + "step": 25296 + }, + { + "epoch": 0.73, + "grad_norm": 1.3573345859283268, + "learning_rate": 1.7468525466575392e-06, + "loss": 0.2633, + "step": 25297 + }, + { + "epoch": 0.73, + "grad_norm": 1.2278290413442328, + "learning_rate": 1.7464958672163073e-06, + "loss": 0.3095, + "step": 25298 + }, + { + "epoch": 0.73, + "grad_norm": 1.2789634211823229, + "learning_rate": 1.7461392164872841e-06, + "loss": 0.2752, + "step": 25299 + }, + { + "epoch": 0.73, + "grad_norm": 1.6323383112183645, + "learning_rate": 1.7457825944736185e-06, + "loss": 0.3372, + "step": 25300 + }, + { + "epoch": 0.73, + "grad_norm": 1.413921507626464, + "learning_rate": 1.7454260011784574e-06, + "loss": 0.2925, + "step": 25301 + }, + { + "epoch": 0.73, + "grad_norm": 1.5280198140672492, + "learning_rate": 1.7450694366049491e-06, + "loss": 0.3182, + "step": 25302 + }, + { + "epoch": 0.73, + "grad_norm": 1.381437865409879, + "learning_rate": 1.744712900756238e-06, + "loss": 0.2809, + "step": 25303 + }, + { + "epoch": 0.73, + "grad_norm": 1.2315630455406508, + "learning_rate": 1.7443563936354717e-06, + "loss": 0.273, + "step": 25304 + }, + { + "epoch": 0.73, + "grad_norm": 1.234103092727736, + "learning_rate": 1.7439999152457964e-06, + "loss": 0.2721, + "step": 25305 + }, + { + "epoch": 0.73, + "grad_norm": 1.2775553451315724, + "learning_rate": 1.7436434655903578e-06, + "loss": 0.2928, + "step": 25306 + }, + { + "epoch": 0.73, + "grad_norm": 1.3918298002807565, + "learning_rate": 1.7432870446723022e-06, + "loss": 0.2756, + "step": 25307 + }, + { + "epoch": 0.73, + "grad_norm": 1.2386265348919587, + "learning_rate": 1.7429306524947736e-06, + "loss": 0.2837, + "step": 25308 + }, + { + "epoch": 0.73, + "grad_norm": 1.1863435735157983, + "learning_rate": 1.7425742890609204e-06, + "loss": 0.2574, + "step": 25309 + }, + { + "epoch": 0.73, + "grad_norm": 1.8829362171871131, + "learning_rate": 1.7422179543738837e-06, + "loss": 0.2808, + "step": 25310 + }, + { + "epoch": 0.73, + "grad_norm": 1.2938377409221955, + "learning_rate": 1.7418616484368094e-06, + "loss": 0.2785, + "step": 25311 + }, + { + "epoch": 0.73, + "grad_norm": 1.3575640444387036, + "learning_rate": 1.7415053712528424e-06, + "loss": 0.3221, + "step": 25312 + }, + { + "epoch": 0.73, + "grad_norm": 1.5463743404720744, + "learning_rate": 1.7411491228251264e-06, + "loss": 0.2998, + "step": 25313 + }, + { + "epoch": 0.73, + "grad_norm": 1.3795758786009271, + "learning_rate": 1.740792903156806e-06, + "loss": 0.2903, + "step": 25314 + }, + { + "epoch": 0.73, + "grad_norm": 1.3588146885956909, + "learning_rate": 1.740436712251024e-06, + "loss": 0.2689, + "step": 25315 + }, + { + "epoch": 0.73, + "grad_norm": 1.3813451066846039, + "learning_rate": 1.740080550110924e-06, + "loss": 0.2684, + "step": 25316 + }, + { + "epoch": 0.73, + "grad_norm": 1.3318469415966259, + "learning_rate": 1.7397244167396515e-06, + "loss": 0.2742, + "step": 25317 + }, + { + "epoch": 0.73, + "grad_norm": 1.3245947835720557, + "learning_rate": 1.7393683121403454e-06, + "loss": 0.2801, + "step": 25318 + }, + { + "epoch": 0.73, + "grad_norm": 1.2338560478945624, + "learning_rate": 1.7390122363161499e-06, + "loss": 0.2683, + "step": 25319 + }, + { + "epoch": 0.73, + "grad_norm": 2.708407643134297, + "learning_rate": 1.7386561892702075e-06, + "loss": 0.2924, + "step": 25320 + }, + { + "epoch": 0.73, + "grad_norm": 1.4056270918570108, + "learning_rate": 1.7383001710056608e-06, + "loss": 0.2851, + "step": 25321 + }, + { + "epoch": 0.73, + "grad_norm": 1.393667051135013, + "learning_rate": 1.7379441815256508e-06, + "loss": 0.281, + "step": 25322 + }, + { + "epoch": 0.73, + "grad_norm": 1.2586881707732522, + "learning_rate": 1.73758822083332e-06, + "loss": 0.2736, + "step": 25323 + }, + { + "epoch": 0.73, + "grad_norm": 1.2490214736583722, + "learning_rate": 1.7372322889318088e-06, + "loss": 0.2755, + "step": 25324 + }, + { + "epoch": 0.73, + "grad_norm": 1.3816239368332985, + "learning_rate": 1.7368763858242592e-06, + "loss": 0.2758, + "step": 25325 + }, + { + "epoch": 0.73, + "grad_norm": 1.5766821119807468, + "learning_rate": 1.7365205115138134e-06, + "loss": 0.2887, + "step": 25326 + }, + { + "epoch": 0.73, + "grad_norm": 1.521135001028738, + "learning_rate": 1.7361646660036091e-06, + "loss": 0.2997, + "step": 25327 + }, + { + "epoch": 0.73, + "grad_norm": 1.461536513475106, + "learning_rate": 1.7358088492967872e-06, + "loss": 0.2648, + "step": 25328 + }, + { + "epoch": 0.73, + "grad_norm": 1.2821414978783392, + "learning_rate": 1.7354530613964881e-06, + "loss": 0.2723, + "step": 25329 + }, + { + "epoch": 0.73, + "grad_norm": 1.5202509481481867, + "learning_rate": 1.7350973023058544e-06, + "loss": 0.2886, + "step": 25330 + }, + { + "epoch": 0.73, + "grad_norm": 1.3284119660388285, + "learning_rate": 1.7347415720280215e-06, + "loss": 0.2693, + "step": 25331 + }, + { + "epoch": 0.73, + "grad_norm": 1.689388154229199, + "learning_rate": 1.7343858705661298e-06, + "loss": 0.3184, + "step": 25332 + }, + { + "epoch": 0.73, + "grad_norm": 1.2939219772219233, + "learning_rate": 1.7340301979233193e-06, + "loss": 0.3091, + "step": 25333 + }, + { + "epoch": 0.73, + "grad_norm": 0.9976093618855955, + "learning_rate": 1.7336745541027288e-06, + "loss": 0.5866, + "step": 25334 + }, + { + "epoch": 0.73, + "grad_norm": 1.2026091792838098, + "learning_rate": 1.733318939107496e-06, + "loss": 0.2725, + "step": 25335 + }, + { + "epoch": 0.73, + "grad_norm": 1.475725030304591, + "learning_rate": 1.7329633529407624e-06, + "loss": 0.3233, + "step": 25336 + }, + { + "epoch": 0.73, + "grad_norm": 1.4448633519216556, + "learning_rate": 1.7326077956056613e-06, + "loss": 0.2951, + "step": 25337 + }, + { + "epoch": 0.73, + "grad_norm": 1.8176950873005804, + "learning_rate": 1.7322522671053327e-06, + "loss": 0.299, + "step": 25338 + }, + { + "epoch": 0.73, + "grad_norm": 1.2397331547623671, + "learning_rate": 1.7318967674429139e-06, + "loss": 0.2676, + "step": 25339 + }, + { + "epoch": 0.73, + "grad_norm": 1.3198684141670682, + "learning_rate": 1.7315412966215423e-06, + "loss": 0.2628, + "step": 25340 + }, + { + "epoch": 0.74, + "grad_norm": 1.2643894898710109, + "learning_rate": 1.7311858546443555e-06, + "loss": 0.2852, + "step": 25341 + }, + { + "epoch": 0.74, + "grad_norm": 1.4205029582255415, + "learning_rate": 1.7308304415144895e-06, + "loss": 0.2755, + "step": 25342 + }, + { + "epoch": 0.74, + "grad_norm": 1.2902648303495883, + "learning_rate": 1.730475057235081e-06, + "loss": 0.2803, + "step": 25343 + }, + { + "epoch": 0.74, + "grad_norm": 2.0650984155127112, + "learning_rate": 1.7301197018092665e-06, + "loss": 0.2902, + "step": 25344 + }, + { + "epoch": 0.74, + "grad_norm": 1.2276508803180675, + "learning_rate": 1.7297643752401832e-06, + "loss": 0.2652, + "step": 25345 + }, + { + "epoch": 0.74, + "grad_norm": 1.3157312925776852, + "learning_rate": 1.7294090775309646e-06, + "loss": 0.2632, + "step": 25346 + }, + { + "epoch": 0.74, + "grad_norm": 1.3093628374863908, + "learning_rate": 1.729053808684747e-06, + "loss": 0.2747, + "step": 25347 + }, + { + "epoch": 0.74, + "grad_norm": 1.2803335119041954, + "learning_rate": 1.7286985687046653e-06, + "loss": 0.2695, + "step": 25348 + }, + { + "epoch": 0.74, + "grad_norm": 1.3881264893932337, + "learning_rate": 1.7283433575938557e-06, + "loss": 0.2711, + "step": 25349 + }, + { + "epoch": 0.74, + "grad_norm": 1.2837584804665385, + "learning_rate": 1.7279881753554517e-06, + "loss": 0.2714, + "step": 25350 + }, + { + "epoch": 0.74, + "grad_norm": 1.358637966066632, + "learning_rate": 1.727633021992589e-06, + "loss": 0.2785, + "step": 25351 + }, + { + "epoch": 0.74, + "grad_norm": 1.3508042859413805, + "learning_rate": 1.7272778975084003e-06, + "loss": 0.2825, + "step": 25352 + }, + { + "epoch": 0.74, + "grad_norm": 1.3719428892920524, + "learning_rate": 1.726922801906023e-06, + "loss": 0.2997, + "step": 25353 + }, + { + "epoch": 0.74, + "grad_norm": 1.5863300079440326, + "learning_rate": 1.7265677351885862e-06, + "loss": 0.3018, + "step": 25354 + }, + { + "epoch": 0.74, + "grad_norm": 1.8550082367666563, + "learning_rate": 1.7262126973592253e-06, + "loss": 0.2946, + "step": 25355 + }, + { + "epoch": 0.74, + "grad_norm": 1.4482369052573592, + "learning_rate": 1.7258576884210742e-06, + "loss": 0.2807, + "step": 25356 + }, + { + "epoch": 0.74, + "grad_norm": 1.3406138421029188, + "learning_rate": 1.7255027083772646e-06, + "loss": 0.2903, + "step": 25357 + }, + { + "epoch": 0.74, + "grad_norm": 0.9507065376776748, + "learning_rate": 1.7251477572309322e-06, + "loss": 0.5938, + "step": 25358 + }, + { + "epoch": 0.74, + "grad_norm": 1.378123822747524, + "learning_rate": 1.7247928349852057e-06, + "loss": 0.2739, + "step": 25359 + }, + { + "epoch": 0.74, + "grad_norm": 1.3426538036734053, + "learning_rate": 1.7244379416432178e-06, + "loss": 0.2942, + "step": 25360 + }, + { + "epoch": 0.74, + "grad_norm": 1.5567790186658796, + "learning_rate": 1.7240830772081019e-06, + "loss": 0.2605, + "step": 25361 + }, + { + "epoch": 0.74, + "grad_norm": 1.2218234147278186, + "learning_rate": 1.7237282416829893e-06, + "loss": 0.2758, + "step": 25362 + }, + { + "epoch": 0.74, + "grad_norm": 1.4125527211803528, + "learning_rate": 1.7233734350710112e-06, + "loss": 0.2843, + "step": 25363 + }, + { + "epoch": 0.74, + "grad_norm": 1.6273396764708352, + "learning_rate": 1.7230186573753005e-06, + "loss": 0.2801, + "step": 25364 + }, + { + "epoch": 0.74, + "grad_norm": 1.3273342252907574, + "learning_rate": 1.7226639085989848e-06, + "loss": 0.2866, + "step": 25365 + }, + { + "epoch": 0.74, + "grad_norm": 1.3892322533875716, + "learning_rate": 1.722309188745197e-06, + "loss": 0.2777, + "step": 25366 + }, + { + "epoch": 0.74, + "grad_norm": 1.3456729403643775, + "learning_rate": 1.7219544978170666e-06, + "loss": 0.3006, + "step": 25367 + }, + { + "epoch": 0.74, + "grad_norm": 1.3088239637747299, + "learning_rate": 1.7215998358177238e-06, + "loss": 0.2791, + "step": 25368 + }, + { + "epoch": 0.74, + "grad_norm": 1.4026743926378415, + "learning_rate": 1.721245202750299e-06, + "loss": 0.2722, + "step": 25369 + }, + { + "epoch": 0.74, + "grad_norm": 1.5512245901826014, + "learning_rate": 1.7208905986179219e-06, + "loss": 0.2987, + "step": 25370 + }, + { + "epoch": 0.74, + "grad_norm": 1.41078668811717, + "learning_rate": 1.7205360234237217e-06, + "loss": 0.2773, + "step": 25371 + }, + { + "epoch": 0.74, + "grad_norm": 2.8622590533234535, + "learning_rate": 1.720181477170829e-06, + "loss": 0.3105, + "step": 25372 + }, + { + "epoch": 0.74, + "grad_norm": 1.517783700224373, + "learning_rate": 1.7198269598623696e-06, + "loss": 0.2743, + "step": 25373 + }, + { + "epoch": 0.74, + "grad_norm": 1.464654535930355, + "learning_rate": 1.7194724715014731e-06, + "loss": 0.2674, + "step": 25374 + }, + { + "epoch": 0.74, + "grad_norm": 1.3168034439741392, + "learning_rate": 1.7191180120912687e-06, + "loss": 0.2762, + "step": 25375 + }, + { + "epoch": 0.74, + "grad_norm": 1.2651852038928821, + "learning_rate": 1.7187635816348847e-06, + "loss": 0.2959, + "step": 25376 + }, + { + "epoch": 0.74, + "grad_norm": 1.4574939376616476, + "learning_rate": 1.718409180135448e-06, + "loss": 0.2868, + "step": 25377 + }, + { + "epoch": 0.74, + "grad_norm": 0.9567219145804434, + "learning_rate": 1.7180548075960867e-06, + "loss": 0.5635, + "step": 25378 + }, + { + "epoch": 0.74, + "grad_norm": 1.2188779371085847, + "learning_rate": 1.7177004640199284e-06, + "loss": 0.2821, + "step": 25379 + }, + { + "epoch": 0.74, + "grad_norm": 1.2896355618600714, + "learning_rate": 1.7173461494101002e-06, + "loss": 0.2684, + "step": 25380 + }, + { + "epoch": 0.74, + "grad_norm": 1.339504725489613, + "learning_rate": 1.7169918637697296e-06, + "loss": 0.2854, + "step": 25381 + }, + { + "epoch": 0.74, + "grad_norm": 1.4335959354490364, + "learning_rate": 1.7166376071019408e-06, + "loss": 0.2795, + "step": 25382 + }, + { + "epoch": 0.74, + "grad_norm": 1.299786532285333, + "learning_rate": 1.7162833794098616e-06, + "loss": 0.2599, + "step": 25383 + }, + { + "epoch": 0.74, + "grad_norm": 1.4079670899522163, + "learning_rate": 1.7159291806966182e-06, + "loss": 0.2626, + "step": 25384 + }, + { + "epoch": 0.74, + "grad_norm": 1.3673942950653248, + "learning_rate": 1.7155750109653358e-06, + "loss": 0.2715, + "step": 25385 + }, + { + "epoch": 0.74, + "grad_norm": 1.9046381309842866, + "learning_rate": 1.7152208702191425e-06, + "loss": 0.2754, + "step": 25386 + }, + { + "epoch": 0.74, + "grad_norm": 1.250897451691736, + "learning_rate": 1.7148667584611595e-06, + "loss": 0.2729, + "step": 25387 + }, + { + "epoch": 0.74, + "grad_norm": 1.2980379938033542, + "learning_rate": 1.7145126756945134e-06, + "loss": 0.2921, + "step": 25388 + }, + { + "epoch": 0.74, + "grad_norm": 8.49001651206095, + "learning_rate": 1.71415862192233e-06, + "loss": 0.2825, + "step": 25389 + }, + { + "epoch": 0.74, + "grad_norm": 1.2367254214206904, + "learning_rate": 1.713804597147733e-06, + "loss": 0.2824, + "step": 25390 + }, + { + "epoch": 0.74, + "grad_norm": 1.2946328223752843, + "learning_rate": 1.7134506013738488e-06, + "loss": 0.2811, + "step": 25391 + }, + { + "epoch": 0.74, + "grad_norm": 1.3288463259038537, + "learning_rate": 1.7130966346037975e-06, + "loss": 0.2643, + "step": 25392 + }, + { + "epoch": 0.74, + "grad_norm": 1.243292663869785, + "learning_rate": 1.7127426968407052e-06, + "loss": 0.2716, + "step": 25393 + }, + { + "epoch": 0.74, + "grad_norm": 1.3510795892213676, + "learning_rate": 1.712388788087695e-06, + "loss": 0.2783, + "step": 25394 + }, + { + "epoch": 0.74, + "grad_norm": 0.9973427452183422, + "learning_rate": 1.7120349083478899e-06, + "loss": 0.5991, + "step": 25395 + }, + { + "epoch": 0.74, + "grad_norm": 1.2751949665524265, + "learning_rate": 1.711681057624413e-06, + "loss": 0.2893, + "step": 25396 + }, + { + "epoch": 0.74, + "grad_norm": 1.3050218663623532, + "learning_rate": 1.7113272359203882e-06, + "loss": 0.2715, + "step": 25397 + }, + { + "epoch": 0.74, + "grad_norm": 1.9465756499861664, + "learning_rate": 1.7109734432389363e-06, + "loss": 0.28, + "step": 25398 + }, + { + "epoch": 0.74, + "grad_norm": 1.3154350824990038, + "learning_rate": 1.7106196795831804e-06, + "loss": 0.2765, + "step": 25399 + }, + { + "epoch": 0.74, + "grad_norm": 1.3674711178291485, + "learning_rate": 1.7102659449562437e-06, + "loss": 0.2803, + "step": 25400 + }, + { + "epoch": 0.74, + "grad_norm": 1.3533092315983446, + "learning_rate": 1.7099122393612455e-06, + "loss": 0.2709, + "step": 25401 + }, + { + "epoch": 0.74, + "grad_norm": 1.446360032451615, + "learning_rate": 1.709558562801308e-06, + "loss": 0.2762, + "step": 25402 + }, + { + "epoch": 0.74, + "grad_norm": 1.5761965689660034, + "learning_rate": 1.7092049152795527e-06, + "loss": 0.2589, + "step": 25403 + }, + { + "epoch": 0.74, + "grad_norm": 1.2782081269573597, + "learning_rate": 1.7088512967991e-06, + "loss": 0.2815, + "step": 25404 + }, + { + "epoch": 0.74, + "grad_norm": 1.3107256797987483, + "learning_rate": 1.7084977073630714e-06, + "loss": 0.2754, + "step": 25405 + }, + { + "epoch": 0.74, + "grad_norm": 2.0331838277166057, + "learning_rate": 1.7081441469745875e-06, + "loss": 0.2836, + "step": 25406 + }, + { + "epoch": 0.74, + "grad_norm": 1.5140542882977863, + "learning_rate": 1.707790615636768e-06, + "loss": 0.2521, + "step": 25407 + }, + { + "epoch": 0.74, + "grad_norm": 1.2568304184916623, + "learning_rate": 1.707437113352734e-06, + "loss": 0.2781, + "step": 25408 + }, + { + "epoch": 0.74, + "grad_norm": 1.397116246031744, + "learning_rate": 1.7070836401256024e-06, + "loss": 0.2893, + "step": 25409 + }, + { + "epoch": 0.74, + "grad_norm": 1.2234541782512371, + "learning_rate": 1.7067301959584947e-06, + "loss": 0.2839, + "step": 25410 + }, + { + "epoch": 0.74, + "grad_norm": 1.5572949989487446, + "learning_rate": 1.706376780854529e-06, + "loss": 0.2945, + "step": 25411 + }, + { + "epoch": 0.74, + "grad_norm": 1.2622835632919784, + "learning_rate": 1.7060233948168254e-06, + "loss": 0.2691, + "step": 25412 + }, + { + "epoch": 0.74, + "grad_norm": 0.9460499673282086, + "learning_rate": 1.7056700378485014e-06, + "loss": 0.6045, + "step": 25413 + }, + { + "epoch": 0.74, + "grad_norm": 1.4585802881617105, + "learning_rate": 1.7053167099526774e-06, + "loss": 0.2966, + "step": 25414 + }, + { + "epoch": 0.74, + "grad_norm": 1.298491489735858, + "learning_rate": 1.7049634111324687e-06, + "loss": 0.2782, + "step": 25415 + }, + { + "epoch": 0.74, + "grad_norm": 1.328906046887445, + "learning_rate": 1.7046101413909938e-06, + "loss": 0.2748, + "step": 25416 + }, + { + "epoch": 0.74, + "grad_norm": 1.2100186601659757, + "learning_rate": 1.7042569007313713e-06, + "loss": 0.2719, + "step": 25417 + }, + { + "epoch": 0.74, + "grad_norm": 1.8303502301103913, + "learning_rate": 1.7039036891567185e-06, + "loss": 0.2758, + "step": 25418 + }, + { + "epoch": 0.74, + "grad_norm": 1.2276285798738464, + "learning_rate": 1.7035505066701542e-06, + "loss": 0.2747, + "step": 25419 + }, + { + "epoch": 0.74, + "grad_norm": 1.639028931744602, + "learning_rate": 1.703197353274791e-06, + "loss": 0.2677, + "step": 25420 + }, + { + "epoch": 0.74, + "grad_norm": 1.2948885775394903, + "learning_rate": 1.7028442289737473e-06, + "loss": 0.2799, + "step": 25421 + }, + { + "epoch": 0.74, + "grad_norm": 1.2222438746794018, + "learning_rate": 1.7024911337701405e-06, + "loss": 0.2788, + "step": 25422 + }, + { + "epoch": 0.74, + "grad_norm": 1.2517320825437912, + "learning_rate": 1.7021380676670862e-06, + "loss": 0.2824, + "step": 25423 + }, + { + "epoch": 0.74, + "grad_norm": 1.3754011523246479, + "learning_rate": 1.7017850306676998e-06, + "loss": 0.2937, + "step": 25424 + }, + { + "epoch": 0.74, + "grad_norm": 2.8943560935259733, + "learning_rate": 1.7014320227750968e-06, + "loss": 0.2712, + "step": 25425 + }, + { + "epoch": 0.74, + "grad_norm": 1.4945856397017574, + "learning_rate": 1.701079043992393e-06, + "loss": 0.2854, + "step": 25426 + }, + { + "epoch": 0.74, + "grad_norm": 1.4747281786796242, + "learning_rate": 1.700726094322705e-06, + "loss": 0.3116, + "step": 25427 + }, + { + "epoch": 0.74, + "grad_norm": 1.3768632395795457, + "learning_rate": 1.7003731737691443e-06, + "loss": 0.296, + "step": 25428 + }, + { + "epoch": 0.74, + "grad_norm": 1.9685482053776828, + "learning_rate": 1.7000202823348265e-06, + "loss": 0.2936, + "step": 25429 + }, + { + "epoch": 0.74, + "grad_norm": 1.8694548126362356, + "learning_rate": 1.6996674200228663e-06, + "loss": 0.2805, + "step": 25430 + }, + { + "epoch": 0.74, + "grad_norm": 1.2544835746832008, + "learning_rate": 1.699314586836378e-06, + "loss": 0.2888, + "step": 25431 + }, + { + "epoch": 0.74, + "grad_norm": 1.2762663307584, + "learning_rate": 1.698961782778475e-06, + "loss": 0.2998, + "step": 25432 + }, + { + "epoch": 0.74, + "grad_norm": 1.5727299002727906, + "learning_rate": 1.698609007852271e-06, + "loss": 0.273, + "step": 25433 + }, + { + "epoch": 0.74, + "grad_norm": 1.429356326596663, + "learning_rate": 1.6982562620608789e-06, + "loss": 0.2852, + "step": 25434 + }, + { + "epoch": 0.74, + "grad_norm": 1.2773508389196768, + "learning_rate": 1.6979035454074122e-06, + "loss": 0.2701, + "step": 25435 + }, + { + "epoch": 0.74, + "grad_norm": 1.3456140491103799, + "learning_rate": 1.6975508578949852e-06, + "loss": 0.2872, + "step": 25436 + }, + { + "epoch": 0.74, + "grad_norm": 1.3485986272543293, + "learning_rate": 1.6971981995267062e-06, + "loss": 0.2799, + "step": 25437 + }, + { + "epoch": 0.74, + "grad_norm": 1.2242905194113385, + "learning_rate": 1.69684557030569e-06, + "loss": 0.2582, + "step": 25438 + }, + { + "epoch": 0.74, + "grad_norm": 1.4620889821455632, + "learning_rate": 1.6964929702350486e-06, + "loss": 0.2853, + "step": 25439 + }, + { + "epoch": 0.74, + "grad_norm": 1.384115041678673, + "learning_rate": 1.6961403993178932e-06, + "loss": 0.2894, + "step": 25440 + }, + { + "epoch": 0.74, + "grad_norm": 0.9527157649128791, + "learning_rate": 1.695787857557335e-06, + "loss": 0.6061, + "step": 25441 + }, + { + "epoch": 0.74, + "grad_norm": 1.2657197218416647, + "learning_rate": 1.6954353449564863e-06, + "loss": 0.2592, + "step": 25442 + }, + { + "epoch": 0.74, + "grad_norm": 1.3564102747964193, + "learning_rate": 1.6950828615184583e-06, + "loss": 0.2967, + "step": 25443 + }, + { + "epoch": 0.74, + "grad_norm": 1.3041908672580027, + "learning_rate": 1.6947304072463599e-06, + "loss": 0.3032, + "step": 25444 + }, + { + "epoch": 0.74, + "grad_norm": 1.5643462714515148, + "learning_rate": 1.6943779821433015e-06, + "loss": 0.2719, + "step": 25445 + }, + { + "epoch": 0.74, + "grad_norm": 1.3359605926004228, + "learning_rate": 1.6940255862123961e-06, + "loss": 0.2796, + "step": 25446 + }, + { + "epoch": 0.74, + "grad_norm": 1.5090147008140258, + "learning_rate": 1.6936732194567502e-06, + "loss": 0.2726, + "step": 25447 + }, + { + "epoch": 0.74, + "grad_norm": 1.2626520232720377, + "learning_rate": 1.6933208818794745e-06, + "loss": 0.2784, + "step": 25448 + }, + { + "epoch": 0.74, + "grad_norm": 1.4735896978880725, + "learning_rate": 1.6929685734836786e-06, + "loss": 0.2795, + "step": 25449 + }, + { + "epoch": 0.74, + "grad_norm": 1.3384480838677244, + "learning_rate": 1.6926162942724716e-06, + "loss": 0.2807, + "step": 25450 + }, + { + "epoch": 0.74, + "grad_norm": 1.5081311131352377, + "learning_rate": 1.692264044248963e-06, + "loss": 0.3216, + "step": 25451 + }, + { + "epoch": 0.74, + "grad_norm": 1.349169743448452, + "learning_rate": 1.6919118234162606e-06, + "loss": 0.2618, + "step": 25452 + }, + { + "epoch": 0.74, + "grad_norm": 1.4767668660051252, + "learning_rate": 1.691559631777473e-06, + "loss": 0.3155, + "step": 25453 + }, + { + "epoch": 0.74, + "grad_norm": 1.3657632355440315, + "learning_rate": 1.6912074693357083e-06, + "loss": 0.28, + "step": 25454 + }, + { + "epoch": 0.74, + "grad_norm": 1.5630996938979402, + "learning_rate": 1.6908553360940765e-06, + "loss": 0.2942, + "step": 25455 + }, + { + "epoch": 0.74, + "grad_norm": 1.435480623082333, + "learning_rate": 1.6905032320556814e-06, + "loss": 0.2711, + "step": 25456 + }, + { + "epoch": 0.74, + "grad_norm": 1.1829651872499005, + "learning_rate": 1.6901511572236312e-06, + "loss": 0.2676, + "step": 25457 + }, + { + "epoch": 0.74, + "grad_norm": 1.3780147278700359, + "learning_rate": 1.6897991116010342e-06, + "loss": 0.2856, + "step": 25458 + }, + { + "epoch": 0.74, + "grad_norm": 1.3189076951294683, + "learning_rate": 1.689447095190997e-06, + "loss": 0.297, + "step": 25459 + }, + { + "epoch": 0.74, + "grad_norm": 1.2492019087192034, + "learning_rate": 1.6890951079966255e-06, + "loss": 0.2738, + "step": 25460 + }, + { + "epoch": 0.74, + "grad_norm": 1.0145491723500106, + "learning_rate": 1.6887431500210272e-06, + "loss": 0.5491, + "step": 25461 + }, + { + "epoch": 0.74, + "grad_norm": 1.5076948470536053, + "learning_rate": 1.6883912212673065e-06, + "loss": 0.2753, + "step": 25462 + }, + { + "epoch": 0.74, + "grad_norm": 1.3269539957269325, + "learning_rate": 1.688039321738572e-06, + "loss": 0.2871, + "step": 25463 + }, + { + "epoch": 0.74, + "grad_norm": 1.4797991322780941, + "learning_rate": 1.6876874514379254e-06, + "loss": 0.2893, + "step": 25464 + }, + { + "epoch": 0.74, + "grad_norm": 1.2655884703295286, + "learning_rate": 1.687335610368474e-06, + "loss": 0.2758, + "step": 25465 + }, + { + "epoch": 0.74, + "grad_norm": 1.3545778972694207, + "learning_rate": 1.6869837985333226e-06, + "loss": 0.3089, + "step": 25466 + }, + { + "epoch": 0.74, + "grad_norm": 1.3329763809939363, + "learning_rate": 1.6866320159355753e-06, + "loss": 0.2663, + "step": 25467 + }, + { + "epoch": 0.74, + "grad_norm": 1.2808220419126892, + "learning_rate": 1.686280262578338e-06, + "loss": 0.2784, + "step": 25468 + }, + { + "epoch": 0.74, + "grad_norm": 1.2723479658849044, + "learning_rate": 1.685928538464714e-06, + "loss": 0.273, + "step": 25469 + }, + { + "epoch": 0.74, + "grad_norm": 1.7222756849281335, + "learning_rate": 1.685576843597807e-06, + "loss": 0.299, + "step": 25470 + }, + { + "epoch": 0.74, + "grad_norm": 1.3344758540511907, + "learning_rate": 1.6852251779807233e-06, + "loss": 0.2657, + "step": 25471 + }, + { + "epoch": 0.74, + "grad_norm": 1.3107458653639557, + "learning_rate": 1.6848735416165623e-06, + "loss": 0.2811, + "step": 25472 + }, + { + "epoch": 0.74, + "grad_norm": 2.0494207221486005, + "learning_rate": 1.684521934508429e-06, + "loss": 0.3011, + "step": 25473 + }, + { + "epoch": 0.74, + "grad_norm": 1.6769805740874741, + "learning_rate": 1.684170356659428e-06, + "loss": 0.2787, + "step": 25474 + }, + { + "epoch": 0.74, + "grad_norm": 1.4823544429187663, + "learning_rate": 1.6838188080726586e-06, + "loss": 0.2828, + "step": 25475 + }, + { + "epoch": 0.74, + "grad_norm": 1.215774866413357, + "learning_rate": 1.6834672887512255e-06, + "loss": 0.2751, + "step": 25476 + }, + { + "epoch": 0.74, + "grad_norm": 1.3840141240808197, + "learning_rate": 1.68311579869823e-06, + "loss": 0.2769, + "step": 25477 + }, + { + "epoch": 0.74, + "grad_norm": 1.3697498978964973, + "learning_rate": 1.6827643379167746e-06, + "loss": 0.2871, + "step": 25478 + }, + { + "epoch": 0.74, + "grad_norm": 1.3089288964944346, + "learning_rate": 1.682412906409961e-06, + "loss": 0.2625, + "step": 25479 + }, + { + "epoch": 0.74, + "grad_norm": 0.9149102830292336, + "learning_rate": 1.6820615041808897e-06, + "loss": 0.6308, + "step": 25480 + }, + { + "epoch": 0.74, + "grad_norm": 1.4061188092112558, + "learning_rate": 1.6817101312326623e-06, + "loss": 0.3014, + "step": 25481 + }, + { + "epoch": 0.74, + "grad_norm": 1.3541327412524566, + "learning_rate": 1.6813587875683818e-06, + "loss": 0.2678, + "step": 25482 + }, + { + "epoch": 0.74, + "grad_norm": 1.3685666046909388, + "learning_rate": 1.6810074731911452e-06, + "loss": 0.2882, + "step": 25483 + }, + { + "epoch": 0.74, + "grad_norm": 1.4398199765239257, + "learning_rate": 1.6806561881040546e-06, + "loss": 0.3049, + "step": 25484 + }, + { + "epoch": 0.74, + "grad_norm": 1.295282178363147, + "learning_rate": 1.68030493231021e-06, + "loss": 0.2849, + "step": 25485 + }, + { + "epoch": 0.74, + "grad_norm": 1.4128485228798644, + "learning_rate": 1.6799537058127109e-06, + "loss": 0.3016, + "step": 25486 + }, + { + "epoch": 0.74, + "grad_norm": 1.3524752198514178, + "learning_rate": 1.6796025086146573e-06, + "loss": 0.2757, + "step": 25487 + }, + { + "epoch": 0.74, + "grad_norm": 1.460315616526719, + "learning_rate": 1.679251340719148e-06, + "loss": 0.2663, + "step": 25488 + }, + { + "epoch": 0.74, + "grad_norm": 1.6530093751423396, + "learning_rate": 1.6789002021292833e-06, + "loss": 0.2694, + "step": 25489 + }, + { + "epoch": 0.74, + "grad_norm": 1.5248265482166963, + "learning_rate": 1.6785490928481602e-06, + "loss": 0.3166, + "step": 25490 + }, + { + "epoch": 0.74, + "grad_norm": 1.3400681316211254, + "learning_rate": 1.6781980128788806e-06, + "loss": 0.2795, + "step": 25491 + }, + { + "epoch": 0.74, + "grad_norm": 1.233048064955568, + "learning_rate": 1.6778469622245386e-06, + "loss": 0.265, + "step": 25492 + }, + { + "epoch": 0.74, + "grad_norm": 1.4491825540776513, + "learning_rate": 1.6774959408882336e-06, + "loss": 0.2914, + "step": 25493 + }, + { + "epoch": 0.74, + "grad_norm": 1.3989541382990531, + "learning_rate": 1.677144948873064e-06, + "loss": 0.2792, + "step": 25494 + }, + { + "epoch": 0.74, + "grad_norm": 1.3674431595035836, + "learning_rate": 1.6767939861821275e-06, + "loss": 0.2907, + "step": 25495 + }, + { + "epoch": 0.74, + "grad_norm": 1.376375891044377, + "learning_rate": 1.6764430528185204e-06, + "loss": 0.2675, + "step": 25496 + }, + { + "epoch": 0.74, + "grad_norm": 1.363468869437989, + "learning_rate": 1.6760921487853404e-06, + "loss": 0.2857, + "step": 25497 + }, + { + "epoch": 0.74, + "grad_norm": 1.4473928245270278, + "learning_rate": 1.6757412740856843e-06, + "loss": 0.2896, + "step": 25498 + }, + { + "epoch": 0.74, + "grad_norm": 0.9468039658723537, + "learning_rate": 1.6753904287226496e-06, + "loss": 0.5645, + "step": 25499 + }, + { + "epoch": 0.74, + "grad_norm": 1.3116972386104955, + "learning_rate": 1.6750396126993296e-06, + "loss": 0.2861, + "step": 25500 + }, + { + "epoch": 0.74, + "grad_norm": 1.2799794184660567, + "learning_rate": 1.6746888260188238e-06, + "loss": 0.2547, + "step": 25501 + }, + { + "epoch": 0.74, + "grad_norm": 1.228234751796137, + "learning_rate": 1.6743380686842241e-06, + "loss": 0.2588, + "step": 25502 + }, + { + "epoch": 0.74, + "grad_norm": 0.9493502043320542, + "learning_rate": 1.673987340698628e-06, + "loss": 0.5465, + "step": 25503 + }, + { + "epoch": 0.74, + "grad_norm": 1.2978650253269317, + "learning_rate": 1.6736366420651306e-06, + "loss": 0.2726, + "step": 25504 + }, + { + "epoch": 0.74, + "grad_norm": 1.865569348065688, + "learning_rate": 1.6732859727868268e-06, + "loss": 0.2542, + "step": 25505 + }, + { + "epoch": 0.74, + "grad_norm": 1.3393272471759219, + "learning_rate": 1.6729353328668107e-06, + "loss": 0.2678, + "step": 25506 + }, + { + "epoch": 0.74, + "grad_norm": 1.2485142288652966, + "learning_rate": 1.6725847223081776e-06, + "loss": 0.2758, + "step": 25507 + }, + { + "epoch": 0.74, + "grad_norm": 1.5184865131536616, + "learning_rate": 1.6722341411140214e-06, + "loss": 0.2697, + "step": 25508 + }, + { + "epoch": 0.74, + "grad_norm": 1.3590800281751236, + "learning_rate": 1.6718835892874359e-06, + "loss": 0.2869, + "step": 25509 + }, + { + "epoch": 0.74, + "grad_norm": 1.520205623566743, + "learning_rate": 1.6715330668315154e-06, + "loss": 0.2715, + "step": 25510 + }, + { + "epoch": 0.74, + "grad_norm": 1.4683316558548125, + "learning_rate": 1.6711825737493514e-06, + "loss": 0.2738, + "step": 25511 + }, + { + "epoch": 0.74, + "grad_norm": 1.29654187055147, + "learning_rate": 1.6708321100440383e-06, + "loss": 0.2823, + "step": 25512 + }, + { + "epoch": 0.74, + "grad_norm": 1.3154836660587517, + "learning_rate": 1.6704816757186688e-06, + "loss": 0.2912, + "step": 25513 + }, + { + "epoch": 0.74, + "grad_norm": 0.9405559417426856, + "learning_rate": 1.6701312707763352e-06, + "loss": 0.5643, + "step": 25514 + }, + { + "epoch": 0.74, + "grad_norm": 1.4565016954526822, + "learning_rate": 1.66978089522013e-06, + "loss": 0.2978, + "step": 25515 + }, + { + "epoch": 0.74, + "grad_norm": 1.289594896014922, + "learning_rate": 1.669430549053146e-06, + "loss": 0.2741, + "step": 25516 + }, + { + "epoch": 0.74, + "grad_norm": 1.4040075609360578, + "learning_rate": 1.6690802322784732e-06, + "loss": 0.2897, + "step": 25517 + }, + { + "epoch": 0.74, + "grad_norm": 1.5381058566852477, + "learning_rate": 1.6687299448992068e-06, + "loss": 0.294, + "step": 25518 + }, + { + "epoch": 0.74, + "grad_norm": 1.4553790064395338, + "learning_rate": 1.6683796869184338e-06, + "loss": 0.2701, + "step": 25519 + }, + { + "epoch": 0.74, + "grad_norm": 1.4089434187612362, + "learning_rate": 1.668029458339247e-06, + "loss": 0.2686, + "step": 25520 + }, + { + "epoch": 0.74, + "grad_norm": 1.2800844259488224, + "learning_rate": 1.6676792591647373e-06, + "loss": 0.2693, + "step": 25521 + }, + { + "epoch": 0.74, + "grad_norm": 1.557299963391527, + "learning_rate": 1.6673290893979948e-06, + "loss": 0.2993, + "step": 25522 + }, + { + "epoch": 0.74, + "grad_norm": 1.354056632667376, + "learning_rate": 1.66697894904211e-06, + "loss": 0.258, + "step": 25523 + }, + { + "epoch": 0.74, + "grad_norm": 1.2089196362334684, + "learning_rate": 1.6666288381001734e-06, + "loss": 0.2725, + "step": 25524 + }, + { + "epoch": 0.74, + "grad_norm": 1.2685926635036062, + "learning_rate": 1.6662787565752741e-06, + "loss": 0.275, + "step": 25525 + }, + { + "epoch": 0.74, + "grad_norm": 1.37782569685547, + "learning_rate": 1.665928704470502e-06, + "loss": 0.262, + "step": 25526 + }, + { + "epoch": 0.74, + "grad_norm": 1.2551125143654736, + "learning_rate": 1.6655786817889469e-06, + "loss": 0.2702, + "step": 25527 + }, + { + "epoch": 0.74, + "grad_norm": 1.287307573244776, + "learning_rate": 1.665228688533696e-06, + "loss": 0.2723, + "step": 25528 + }, + { + "epoch": 0.74, + "grad_norm": 1.6980965203591103, + "learning_rate": 1.6648787247078403e-06, + "loss": 0.2674, + "step": 25529 + }, + { + "epoch": 0.74, + "grad_norm": 1.6706852824132556, + "learning_rate": 1.6645287903144652e-06, + "loss": 0.2641, + "step": 25530 + }, + { + "epoch": 0.74, + "grad_norm": 1.3678866215999286, + "learning_rate": 1.6641788853566608e-06, + "loss": 0.2774, + "step": 25531 + }, + { + "epoch": 0.74, + "grad_norm": 1.3093479666270655, + "learning_rate": 1.6638290098375144e-06, + "loss": 0.2822, + "step": 25532 + }, + { + "epoch": 0.74, + "grad_norm": 1.3991617289382685, + "learning_rate": 1.663479163760114e-06, + "loss": 0.3029, + "step": 25533 + }, + { + "epoch": 0.74, + "grad_norm": 1.3155409921781263, + "learning_rate": 1.6631293471275473e-06, + "loss": 0.2708, + "step": 25534 + }, + { + "epoch": 0.74, + "grad_norm": 2.214950283252951, + "learning_rate": 1.6627795599429008e-06, + "loss": 0.3105, + "step": 25535 + }, + { + "epoch": 0.74, + "grad_norm": 1.3083561411102687, + "learning_rate": 1.662429802209262e-06, + "loss": 0.2835, + "step": 25536 + }, + { + "epoch": 0.74, + "grad_norm": 1.4583913356994855, + "learning_rate": 1.6620800739297188e-06, + "loss": 0.2726, + "step": 25537 + }, + { + "epoch": 0.74, + "grad_norm": 1.5625666063031507, + "learning_rate": 1.6617303751073543e-06, + "loss": 0.3188, + "step": 25538 + }, + { + "epoch": 0.74, + "grad_norm": 1.2497782641329762, + "learning_rate": 1.6613807057452563e-06, + "loss": 0.2724, + "step": 25539 + }, + { + "epoch": 0.74, + "grad_norm": 1.4065904298947876, + "learning_rate": 1.6610310658465101e-06, + "loss": 0.278, + "step": 25540 + }, + { + "epoch": 0.74, + "grad_norm": 1.3657187729045648, + "learning_rate": 1.6606814554142025e-06, + "loss": 0.3004, + "step": 25541 + }, + { + "epoch": 0.74, + "grad_norm": 1.6086915725998248, + "learning_rate": 1.660331874451418e-06, + "loss": 0.2517, + "step": 25542 + }, + { + "epoch": 0.74, + "grad_norm": 1.2738782176196894, + "learning_rate": 1.659982322961241e-06, + "loss": 0.2724, + "step": 25543 + }, + { + "epoch": 0.74, + "grad_norm": 1.446581388480283, + "learning_rate": 1.6596328009467577e-06, + "loss": 0.2911, + "step": 25544 + }, + { + "epoch": 0.74, + "grad_norm": 1.4128402739190251, + "learning_rate": 1.6592833084110521e-06, + "loss": 0.2734, + "step": 25545 + }, + { + "epoch": 0.74, + "grad_norm": 1.2838891070321667, + "learning_rate": 1.65893384535721e-06, + "loss": 0.2651, + "step": 25546 + }, + { + "epoch": 0.74, + "grad_norm": 1.3224888873359935, + "learning_rate": 1.6585844117883122e-06, + "loss": 0.2653, + "step": 25547 + }, + { + "epoch": 0.74, + "grad_norm": 1.5565371888294144, + "learning_rate": 1.6582350077074433e-06, + "loss": 0.3241, + "step": 25548 + }, + { + "epoch": 0.74, + "grad_norm": 1.3832412053047594, + "learning_rate": 1.6578856331176884e-06, + "loss": 0.271, + "step": 25549 + }, + { + "epoch": 0.74, + "grad_norm": 1.2583209262214008, + "learning_rate": 1.657536288022129e-06, + "loss": 0.2644, + "step": 25550 + }, + { + "epoch": 0.74, + "grad_norm": 1.5958688687874905, + "learning_rate": 1.65718697242385e-06, + "loss": 0.2821, + "step": 25551 + }, + { + "epoch": 0.74, + "grad_norm": 1.565467377348886, + "learning_rate": 1.6568376863259322e-06, + "loss": 0.2737, + "step": 25552 + }, + { + "epoch": 0.74, + "grad_norm": 1.308259604362412, + "learning_rate": 1.6564884297314593e-06, + "loss": 0.2926, + "step": 25553 + }, + { + "epoch": 0.74, + "grad_norm": 1.631061450506496, + "learning_rate": 1.6561392026435147e-06, + "loss": 0.269, + "step": 25554 + }, + { + "epoch": 0.74, + "grad_norm": 1.5315808183097108, + "learning_rate": 1.6557900050651766e-06, + "loss": 0.2741, + "step": 25555 + }, + { + "epoch": 0.74, + "grad_norm": 2.053649189405569, + "learning_rate": 1.6554408369995312e-06, + "loss": 0.2547, + "step": 25556 + }, + { + "epoch": 0.74, + "grad_norm": 1.346955969803412, + "learning_rate": 1.655091698449655e-06, + "loss": 0.3061, + "step": 25557 + }, + { + "epoch": 0.74, + "grad_norm": 0.9460972783797816, + "learning_rate": 1.6547425894186325e-06, + "loss": 0.5335, + "step": 25558 + }, + { + "epoch": 0.74, + "grad_norm": 1.481919253386785, + "learning_rate": 1.6543935099095438e-06, + "loss": 0.2729, + "step": 25559 + }, + { + "epoch": 0.74, + "grad_norm": 1.2630026702346802, + "learning_rate": 1.654044459925469e-06, + "loss": 0.2654, + "step": 25560 + }, + { + "epoch": 0.74, + "grad_norm": 1.3843833465754634, + "learning_rate": 1.6536954394694888e-06, + "loss": 0.2672, + "step": 25561 + }, + { + "epoch": 0.74, + "grad_norm": 1.4760300769747112, + "learning_rate": 1.6533464485446837e-06, + "loss": 0.2765, + "step": 25562 + }, + { + "epoch": 0.74, + "grad_norm": 1.336514126794728, + "learning_rate": 1.6529974871541337e-06, + "loss": 0.2837, + "step": 25563 + }, + { + "epoch": 0.74, + "grad_norm": 1.3446143811416862, + "learning_rate": 1.6526485553009175e-06, + "loss": 0.2883, + "step": 25564 + }, + { + "epoch": 0.74, + "grad_norm": 1.442608177323263, + "learning_rate": 1.652299652988117e-06, + "loss": 0.2905, + "step": 25565 + }, + { + "epoch": 0.74, + "grad_norm": 1.2533476882188717, + "learning_rate": 1.6519507802188072e-06, + "loss": 0.2604, + "step": 25566 + }, + { + "epoch": 0.74, + "grad_norm": 1.3329474872535096, + "learning_rate": 1.651601936996069e-06, + "loss": 0.2956, + "step": 25567 + }, + { + "epoch": 0.74, + "grad_norm": 1.4085806976310473, + "learning_rate": 1.6512531233229811e-06, + "loss": 0.2643, + "step": 25568 + }, + { + "epoch": 0.74, + "grad_norm": 1.1895275918031842, + "learning_rate": 1.6509043392026214e-06, + "loss": 0.2782, + "step": 25569 + }, + { + "epoch": 0.74, + "grad_norm": 1.3252857741516024, + "learning_rate": 1.6505555846380677e-06, + "loss": 0.2719, + "step": 25570 + }, + { + "epoch": 0.74, + "grad_norm": 1.9845690762108428, + "learning_rate": 1.6502068596323983e-06, + "loss": 0.3104, + "step": 25571 + }, + { + "epoch": 0.74, + "grad_norm": 1.3446500928077947, + "learning_rate": 1.6498581641886907e-06, + "loss": 0.2849, + "step": 25572 + }, + { + "epoch": 0.74, + "grad_norm": 1.2695183870881388, + "learning_rate": 1.6495094983100236e-06, + "loss": 0.2589, + "step": 25573 + }, + { + "epoch": 0.74, + "grad_norm": 1.3849804003599817, + "learning_rate": 1.649160861999471e-06, + "loss": 0.2759, + "step": 25574 + }, + { + "epoch": 0.74, + "grad_norm": 1.3146549801408465, + "learning_rate": 1.6488122552601106e-06, + "loss": 0.2763, + "step": 25575 + }, + { + "epoch": 0.74, + "grad_norm": 1.3830087228454229, + "learning_rate": 1.6484636780950192e-06, + "loss": 0.2816, + "step": 25576 + }, + { + "epoch": 0.74, + "grad_norm": 1.2341679332950386, + "learning_rate": 1.648115130507273e-06, + "loss": 0.2624, + "step": 25577 + }, + { + "epoch": 0.74, + "grad_norm": 1.308594019682777, + "learning_rate": 1.6477666124999476e-06, + "loss": 0.253, + "step": 25578 + }, + { + "epoch": 0.74, + "grad_norm": 1.2341082679581012, + "learning_rate": 1.6474181240761195e-06, + "loss": 0.2645, + "step": 25579 + }, + { + "epoch": 0.74, + "grad_norm": 1.3119676918190832, + "learning_rate": 1.6470696652388635e-06, + "loss": 0.2553, + "step": 25580 + }, + { + "epoch": 0.74, + "grad_norm": 1.5355937367113313, + "learning_rate": 1.6467212359912554e-06, + "loss": 0.2827, + "step": 25581 + }, + { + "epoch": 0.74, + "grad_norm": 1.4584478318197844, + "learning_rate": 1.6463728363363707e-06, + "loss": 0.27, + "step": 25582 + }, + { + "epoch": 0.74, + "grad_norm": 1.5119485342857788, + "learning_rate": 1.6460244662772828e-06, + "loss": 0.2843, + "step": 25583 + }, + { + "epoch": 0.74, + "grad_norm": 1.2861089130798022, + "learning_rate": 1.645676125817065e-06, + "loss": 0.273, + "step": 25584 + }, + { + "epoch": 0.74, + "grad_norm": 1.5261196825743304, + "learning_rate": 1.6453278149587926e-06, + "loss": 0.2679, + "step": 25585 + }, + { + "epoch": 0.74, + "grad_norm": 1.3953089412174544, + "learning_rate": 1.6449795337055392e-06, + "loss": 0.2735, + "step": 25586 + }, + { + "epoch": 0.74, + "grad_norm": 1.4176682541157044, + "learning_rate": 1.6446312820603783e-06, + "loss": 0.2757, + "step": 25587 + }, + { + "epoch": 0.74, + "grad_norm": 1.3348698305246471, + "learning_rate": 1.6442830600263843e-06, + "loss": 0.252, + "step": 25588 + }, + { + "epoch": 0.74, + "grad_norm": 1.3703366376600412, + "learning_rate": 1.6439348676066291e-06, + "loss": 0.2643, + "step": 25589 + }, + { + "epoch": 0.74, + "grad_norm": 1.2993467163711347, + "learning_rate": 1.6435867048041859e-06, + "loss": 0.2927, + "step": 25590 + }, + { + "epoch": 0.74, + "grad_norm": 1.454027194184791, + "learning_rate": 1.6432385716221271e-06, + "loss": 0.2899, + "step": 25591 + }, + { + "epoch": 0.74, + "grad_norm": 1.3526399393202568, + "learning_rate": 1.6428904680635272e-06, + "loss": 0.2926, + "step": 25592 + }, + { + "epoch": 0.74, + "grad_norm": 1.2429781519780092, + "learning_rate": 1.6425423941314544e-06, + "loss": 0.2658, + "step": 25593 + }, + { + "epoch": 0.74, + "grad_norm": 1.2904505288968129, + "learning_rate": 1.6421943498289821e-06, + "loss": 0.2915, + "step": 25594 + }, + { + "epoch": 0.74, + "grad_norm": 1.686205863873711, + "learning_rate": 1.6418463351591818e-06, + "loss": 0.2911, + "step": 25595 + }, + { + "epoch": 0.74, + "grad_norm": 1.3757625607601978, + "learning_rate": 1.6414983501251252e-06, + "loss": 0.2928, + "step": 25596 + }, + { + "epoch": 0.74, + "grad_norm": 1.2746788256558708, + "learning_rate": 1.6411503947298829e-06, + "loss": 0.2672, + "step": 25597 + }, + { + "epoch": 0.74, + "grad_norm": 1.3121487808507055, + "learning_rate": 1.640802468976525e-06, + "loss": 0.2871, + "step": 25598 + }, + { + "epoch": 0.74, + "grad_norm": 1.3223376641695694, + "learning_rate": 1.6404545728681232e-06, + "loss": 0.2755, + "step": 25599 + }, + { + "epoch": 0.74, + "grad_norm": 1.4926916937086903, + "learning_rate": 1.640106706407747e-06, + "loss": 0.2974, + "step": 25600 + }, + { + "epoch": 0.74, + "grad_norm": 1.1889166329764909, + "learning_rate": 1.6397588695984679e-06, + "loss": 0.3168, + "step": 25601 + }, + { + "epoch": 0.74, + "grad_norm": 1.2605661528744507, + "learning_rate": 1.6394110624433523e-06, + "loss": 0.2889, + "step": 25602 + }, + { + "epoch": 0.74, + "grad_norm": 1.3574343323639744, + "learning_rate": 1.6390632849454718e-06, + "loss": 0.3101, + "step": 25603 + }, + { + "epoch": 0.74, + "grad_norm": 1.6801548457739282, + "learning_rate": 1.638715537107895e-06, + "loss": 0.2787, + "step": 25604 + }, + { + "epoch": 0.74, + "grad_norm": 1.2460494400992168, + "learning_rate": 1.6383678189336906e-06, + "loss": 0.2674, + "step": 25605 + }, + { + "epoch": 0.74, + "grad_norm": 1.2862244472265996, + "learning_rate": 1.6380201304259275e-06, + "loss": 0.2718, + "step": 25606 + }, + { + "epoch": 0.74, + "grad_norm": 1.1977999280903135, + "learning_rate": 1.637672471587674e-06, + "loss": 0.2719, + "step": 25607 + }, + { + "epoch": 0.74, + "grad_norm": 1.323774984336872, + "learning_rate": 1.6373248424219985e-06, + "loss": 0.2975, + "step": 25608 + }, + { + "epoch": 0.74, + "grad_norm": 1.3623415259323963, + "learning_rate": 1.6369772429319696e-06, + "loss": 0.2593, + "step": 25609 + }, + { + "epoch": 0.74, + "grad_norm": 1.6839374650720567, + "learning_rate": 1.6366296731206527e-06, + "loss": 0.2965, + "step": 25610 + }, + { + "epoch": 0.74, + "grad_norm": 1.269661413636795, + "learning_rate": 1.6362821329911182e-06, + "loss": 0.2839, + "step": 25611 + }, + { + "epoch": 0.74, + "grad_norm": 1.9801477463879786, + "learning_rate": 1.6359346225464289e-06, + "loss": 0.2695, + "step": 25612 + }, + { + "epoch": 0.74, + "grad_norm": 11.5382242334798, + "learning_rate": 1.6355871417896546e-06, + "loss": 0.2692, + "step": 25613 + }, + { + "epoch": 0.74, + "grad_norm": 1.3695200015303468, + "learning_rate": 1.6352396907238604e-06, + "loss": 0.2828, + "step": 25614 + }, + { + "epoch": 0.74, + "grad_norm": 1.5684112330457483, + "learning_rate": 1.6348922693521135e-06, + "loss": 0.2878, + "step": 25615 + }, + { + "epoch": 0.74, + "grad_norm": 1.412600993106965, + "learning_rate": 1.6345448776774796e-06, + "loss": 0.2846, + "step": 25616 + }, + { + "epoch": 0.74, + "grad_norm": 1.3054850763393189, + "learning_rate": 1.6341975157030248e-06, + "loss": 0.2675, + "step": 25617 + }, + { + "epoch": 0.74, + "grad_norm": 1.2996358040304894, + "learning_rate": 1.6338501834318138e-06, + "loss": 0.2734, + "step": 25618 + }, + { + "epoch": 0.74, + "grad_norm": 1.5006917107032842, + "learning_rate": 1.633502880866914e-06, + "loss": 0.2838, + "step": 25619 + }, + { + "epoch": 0.74, + "grad_norm": 1.4036676946298585, + "learning_rate": 1.6331556080113875e-06, + "loss": 0.2809, + "step": 25620 + }, + { + "epoch": 0.74, + "grad_norm": 1.2150472032960353, + "learning_rate": 1.6328083648682997e-06, + "loss": 0.264, + "step": 25621 + }, + { + "epoch": 0.74, + "grad_norm": 1.7885676746607257, + "learning_rate": 1.632461151440715e-06, + "loss": 0.2892, + "step": 25622 + }, + { + "epoch": 0.74, + "grad_norm": 1.3801843928151456, + "learning_rate": 1.6321139677316988e-06, + "loss": 0.3176, + "step": 25623 + }, + { + "epoch": 0.74, + "grad_norm": 1.326579042567775, + "learning_rate": 1.6317668137443139e-06, + "loss": 0.2769, + "step": 25624 + }, + { + "epoch": 0.74, + "grad_norm": 1.59659941069545, + "learning_rate": 1.6314196894816241e-06, + "loss": 0.279, + "step": 25625 + }, + { + "epoch": 0.74, + "grad_norm": 1.3510548293339621, + "learning_rate": 1.6310725949466932e-06, + "loss": 0.2861, + "step": 25626 + }, + { + "epoch": 0.74, + "grad_norm": 1.8731580552053453, + "learning_rate": 1.6307255301425834e-06, + "loss": 0.2871, + "step": 25627 + }, + { + "epoch": 0.74, + "grad_norm": 1.2098454964367946, + "learning_rate": 1.6303784950723607e-06, + "loss": 0.276, + "step": 25628 + }, + { + "epoch": 0.74, + "grad_norm": 1.2527004108009394, + "learning_rate": 1.6300314897390829e-06, + "loss": 0.266, + "step": 25629 + }, + { + "epoch": 0.74, + "grad_norm": 1.2795741464171546, + "learning_rate": 1.6296845141458146e-06, + "loss": 0.2915, + "step": 25630 + }, + { + "epoch": 0.74, + "grad_norm": 1.3000803877920555, + "learning_rate": 1.629337568295618e-06, + "loss": 0.3, + "step": 25631 + }, + { + "epoch": 0.74, + "grad_norm": 1.2297258055997942, + "learning_rate": 1.628990652191555e-06, + "loss": 0.2692, + "step": 25632 + }, + { + "epoch": 0.74, + "grad_norm": 1.491383143994962, + "learning_rate": 1.6286437658366867e-06, + "loss": 0.2732, + "step": 25633 + }, + { + "epoch": 0.74, + "grad_norm": 1.2421554090801854, + "learning_rate": 1.6282969092340744e-06, + "loss": 0.2716, + "step": 25634 + }, + { + "epoch": 0.74, + "grad_norm": 1.6727304880509835, + "learning_rate": 1.6279500823867795e-06, + "loss": 0.2777, + "step": 25635 + }, + { + "epoch": 0.74, + "grad_norm": 2.7498126483318197, + "learning_rate": 1.6276032852978624e-06, + "loss": 0.2502, + "step": 25636 + }, + { + "epoch": 0.74, + "grad_norm": 1.539469655871081, + "learning_rate": 1.6272565179703853e-06, + "loss": 0.2842, + "step": 25637 + }, + { + "epoch": 0.74, + "grad_norm": 1.3454308747743489, + "learning_rate": 1.6269097804074052e-06, + "loss": 0.2714, + "step": 25638 + }, + { + "epoch": 0.74, + "grad_norm": 1.4848715104393178, + "learning_rate": 1.6265630726119852e-06, + "loss": 0.2626, + "step": 25639 + }, + { + "epoch": 0.74, + "grad_norm": 1.2559704917777244, + "learning_rate": 1.6262163945871823e-06, + "loss": 0.2749, + "step": 25640 + }, + { + "epoch": 0.74, + "grad_norm": 1.245124398137923, + "learning_rate": 1.6258697463360568e-06, + "loss": 0.2566, + "step": 25641 + }, + { + "epoch": 0.74, + "grad_norm": 1.4260144230329486, + "learning_rate": 1.6255231278616685e-06, + "loss": 0.273, + "step": 25642 + }, + { + "epoch": 0.74, + "grad_norm": 1.2996501251448316, + "learning_rate": 1.625176539167076e-06, + "loss": 0.2817, + "step": 25643 + }, + { + "epoch": 0.74, + "grad_norm": 1.703885433031722, + "learning_rate": 1.6248299802553375e-06, + "loss": 0.2985, + "step": 25644 + }, + { + "epoch": 0.74, + "grad_norm": 1.274353425219416, + "learning_rate": 1.624483451129512e-06, + "loss": 0.2686, + "step": 25645 + }, + { + "epoch": 0.74, + "grad_norm": 1.3373442173640602, + "learning_rate": 1.624136951792658e-06, + "loss": 0.273, + "step": 25646 + }, + { + "epoch": 0.74, + "grad_norm": 1.2660882963984328, + "learning_rate": 1.6237904822478344e-06, + "loss": 0.2862, + "step": 25647 + }, + { + "epoch": 0.74, + "grad_norm": 1.2329490802796836, + "learning_rate": 1.6234440424980953e-06, + "loss": 0.2727, + "step": 25648 + }, + { + "epoch": 0.74, + "grad_norm": 1.5463737637921937, + "learning_rate": 1.6230976325465004e-06, + "loss": 0.2972, + "step": 25649 + }, + { + "epoch": 0.74, + "grad_norm": 1.344109962230645, + "learning_rate": 1.6227512523961058e-06, + "loss": 0.2726, + "step": 25650 + }, + { + "epoch": 0.74, + "grad_norm": 1.2408822679208507, + "learning_rate": 1.6224049020499687e-06, + "loss": 0.2923, + "step": 25651 + }, + { + "epoch": 0.74, + "grad_norm": 1.3061366619987502, + "learning_rate": 1.6220585815111468e-06, + "loss": 0.273, + "step": 25652 + }, + { + "epoch": 0.74, + "grad_norm": 1.3636212363209115, + "learning_rate": 1.6217122907826948e-06, + "loss": 0.2562, + "step": 25653 + }, + { + "epoch": 0.74, + "grad_norm": 1.3640650361934634, + "learning_rate": 1.6213660298676692e-06, + "loss": 0.2686, + "step": 25654 + }, + { + "epoch": 0.74, + "grad_norm": 1.2704757249769294, + "learning_rate": 1.6210197987691278e-06, + "loss": 0.268, + "step": 25655 + }, + { + "epoch": 0.74, + "grad_norm": 1.270502862309992, + "learning_rate": 1.6206735974901221e-06, + "loss": 0.2759, + "step": 25656 + }, + { + "epoch": 0.74, + "grad_norm": 2.9942852391240513, + "learning_rate": 1.62032742603371e-06, + "loss": 0.281, + "step": 25657 + }, + { + "epoch": 0.74, + "grad_norm": 1.2357560893490813, + "learning_rate": 1.619981284402945e-06, + "loss": 0.2575, + "step": 25658 + }, + { + "epoch": 0.74, + "grad_norm": 1.3978588424487635, + "learning_rate": 1.6196351726008835e-06, + "loss": 0.2817, + "step": 25659 + }, + { + "epoch": 0.74, + "grad_norm": 1.4584017720060218, + "learning_rate": 1.6192890906305787e-06, + "loss": 0.2929, + "step": 25660 + }, + { + "epoch": 0.74, + "grad_norm": 1.5032896719220663, + "learning_rate": 1.618943038495085e-06, + "loss": 0.2736, + "step": 25661 + }, + { + "epoch": 0.74, + "grad_norm": 1.3270044233595588, + "learning_rate": 1.6185970161974568e-06, + "loss": 0.2864, + "step": 25662 + }, + { + "epoch": 0.74, + "grad_norm": 1.2961983473174055, + "learning_rate": 1.6182510237407472e-06, + "loss": 0.2615, + "step": 25663 + }, + { + "epoch": 0.74, + "grad_norm": 1.3230794248265962, + "learning_rate": 1.6179050611280117e-06, + "loss": 0.267, + "step": 25664 + }, + { + "epoch": 0.74, + "grad_norm": 1.3185714584223076, + "learning_rate": 1.6175591283622999e-06, + "loss": 0.2923, + "step": 25665 + }, + { + "epoch": 0.74, + "grad_norm": 1.4787809404770682, + "learning_rate": 1.6172132254466667e-06, + "loss": 0.2732, + "step": 25666 + }, + { + "epoch": 0.74, + "grad_norm": 1.311173708301667, + "learning_rate": 1.6168673523841655e-06, + "loss": 0.2608, + "step": 25667 + }, + { + "epoch": 0.74, + "grad_norm": 1.6519348264270708, + "learning_rate": 1.6165215091778457e-06, + "loss": 0.2866, + "step": 25668 + }, + { + "epoch": 0.74, + "grad_norm": 1.3971577665667134, + "learning_rate": 1.6161756958307611e-06, + "loss": 0.2863, + "step": 25669 + }, + { + "epoch": 0.74, + "grad_norm": 1.5129392830337558, + "learning_rate": 1.6158299123459635e-06, + "loss": 0.2659, + "step": 25670 + }, + { + "epoch": 0.74, + "grad_norm": 1.319158164717363, + "learning_rate": 1.6154841587265052e-06, + "loss": 0.2654, + "step": 25671 + }, + { + "epoch": 0.74, + "grad_norm": 1.3580887274429536, + "learning_rate": 1.6151384349754362e-06, + "loss": 0.2827, + "step": 25672 + }, + { + "epoch": 0.74, + "grad_norm": 1.4407544075907395, + "learning_rate": 1.6147927410958075e-06, + "loss": 0.2821, + "step": 25673 + }, + { + "epoch": 0.74, + "grad_norm": 1.2985286388107589, + "learning_rate": 1.614447077090673e-06, + "loss": 0.2915, + "step": 25674 + }, + { + "epoch": 0.74, + "grad_norm": 1.3328184189507595, + "learning_rate": 1.6141014429630787e-06, + "loss": 0.2842, + "step": 25675 + }, + { + "epoch": 0.74, + "grad_norm": 2.3676843794636193, + "learning_rate": 1.6137558387160762e-06, + "loss": 0.3017, + "step": 25676 + }, + { + "epoch": 0.74, + "grad_norm": 1.4462850004172383, + "learning_rate": 1.6134102643527166e-06, + "loss": 0.2842, + "step": 25677 + }, + { + "epoch": 0.74, + "grad_norm": 1.6297213543339364, + "learning_rate": 1.6130647198760486e-06, + "loss": 0.3062, + "step": 25678 + }, + { + "epoch": 0.74, + "grad_norm": 1.415551277650102, + "learning_rate": 1.6127192052891221e-06, + "loss": 0.2648, + "step": 25679 + }, + { + "epoch": 0.74, + "grad_norm": 1.383568128404665, + "learning_rate": 1.6123737205949864e-06, + "loss": 0.2818, + "step": 25680 + }, + { + "epoch": 0.74, + "grad_norm": 1.9630601172819897, + "learning_rate": 1.61202826579669e-06, + "loss": 0.2843, + "step": 25681 + }, + { + "epoch": 0.74, + "grad_norm": 1.392484469882329, + "learning_rate": 1.6116828408972819e-06, + "loss": 0.2876, + "step": 25682 + }, + { + "epoch": 0.74, + "grad_norm": 1.3013799061417273, + "learning_rate": 1.6113374458998121e-06, + "loss": 0.2741, + "step": 25683 + }, + { + "epoch": 0.74, + "grad_norm": 1.2234152247101084, + "learning_rate": 1.6109920808073249e-06, + "loss": 0.2602, + "step": 25684 + }, + { + "epoch": 0.74, + "grad_norm": 1.3295770069250101, + "learning_rate": 1.6106467456228703e-06, + "loss": 0.2754, + "step": 25685 + }, + { + "epoch": 0.75, + "grad_norm": 1.2641670324016088, + "learning_rate": 1.610301440349496e-06, + "loss": 0.2917, + "step": 25686 + }, + { + "epoch": 0.75, + "grad_norm": 1.4334513619607818, + "learning_rate": 1.609956164990249e-06, + "loss": 0.3042, + "step": 25687 + }, + { + "epoch": 0.75, + "grad_norm": 1.268832358443984, + "learning_rate": 1.6096109195481763e-06, + "loss": 0.2801, + "step": 25688 + }, + { + "epoch": 0.75, + "grad_norm": 2.1060120838813, + "learning_rate": 1.6092657040263248e-06, + "loss": 0.2723, + "step": 25689 + }, + { + "epoch": 0.75, + "grad_norm": 1.3094290424424913, + "learning_rate": 1.6089205184277411e-06, + "loss": 0.2921, + "step": 25690 + }, + { + "epoch": 0.75, + "grad_norm": 1.238316566132055, + "learning_rate": 1.6085753627554728e-06, + "loss": 0.2668, + "step": 25691 + }, + { + "epoch": 0.75, + "grad_norm": 1.3469888867858142, + "learning_rate": 1.6082302370125636e-06, + "loss": 0.256, + "step": 25692 + }, + { + "epoch": 0.75, + "grad_norm": 1.407304098558515, + "learning_rate": 1.6078851412020596e-06, + "loss": 0.2851, + "step": 25693 + }, + { + "epoch": 0.75, + "grad_norm": 4.4634529737779385, + "learning_rate": 1.6075400753270077e-06, + "loss": 0.2827, + "step": 25694 + }, + { + "epoch": 0.75, + "grad_norm": 1.4657842558716256, + "learning_rate": 1.607195039390453e-06, + "loss": 0.2874, + "step": 25695 + }, + { + "epoch": 0.75, + "grad_norm": 1.2817015574760273, + "learning_rate": 1.6068500333954385e-06, + "loss": 0.2894, + "step": 25696 + }, + { + "epoch": 0.75, + "grad_norm": 2.2723933640768528, + "learning_rate": 1.6065050573450097e-06, + "loss": 0.2583, + "step": 25697 + }, + { + "epoch": 0.75, + "grad_norm": 1.708176801692229, + "learning_rate": 1.606160111242212e-06, + "loss": 0.2837, + "step": 25698 + }, + { + "epoch": 0.75, + "grad_norm": 1.2328744751796747, + "learning_rate": 1.6058151950900886e-06, + "loss": 0.288, + "step": 25699 + }, + { + "epoch": 0.75, + "grad_norm": 1.3220053927726485, + "learning_rate": 1.605470308891684e-06, + "loss": 0.28, + "step": 25700 + }, + { + "epoch": 0.75, + "grad_norm": 1.4452494741679867, + "learning_rate": 1.6051254526500416e-06, + "loss": 0.2917, + "step": 25701 + }, + { + "epoch": 0.75, + "grad_norm": 1.3135722096604596, + "learning_rate": 1.604780626368206e-06, + "loss": 0.3111, + "step": 25702 + }, + { + "epoch": 0.75, + "grad_norm": 1.3070948506408686, + "learning_rate": 1.6044358300492175e-06, + "loss": 0.2649, + "step": 25703 + }, + { + "epoch": 0.75, + "grad_norm": 1.3301331319895924, + "learning_rate": 1.6040910636961204e-06, + "loss": 0.2766, + "step": 25704 + }, + { + "epoch": 0.75, + "grad_norm": 1.5583686316825258, + "learning_rate": 1.6037463273119574e-06, + "loss": 0.2514, + "step": 25705 + }, + { + "epoch": 0.75, + "grad_norm": 1.2979786754923484, + "learning_rate": 1.603401620899771e-06, + "loss": 0.2669, + "step": 25706 + }, + { + "epoch": 0.75, + "grad_norm": 1.4055858268989088, + "learning_rate": 1.6030569444626026e-06, + "loss": 0.2696, + "step": 25707 + }, + { + "epoch": 0.75, + "grad_norm": 1.3098930310747499, + "learning_rate": 1.602712298003495e-06, + "loss": 0.2906, + "step": 25708 + }, + { + "epoch": 0.75, + "grad_norm": 1.7633091569195969, + "learning_rate": 1.6023676815254885e-06, + "loss": 0.2876, + "step": 25709 + }, + { + "epoch": 0.75, + "grad_norm": 1.2794258371458058, + "learning_rate": 1.6020230950316268e-06, + "loss": 0.2763, + "step": 25710 + }, + { + "epoch": 0.75, + "grad_norm": 1.5053596040100417, + "learning_rate": 1.6016785385249467e-06, + "loss": 0.2732, + "step": 25711 + }, + { + "epoch": 0.75, + "grad_norm": 1.402238024597362, + "learning_rate": 1.6013340120084919e-06, + "loss": 0.2735, + "step": 25712 + }, + { + "epoch": 0.75, + "grad_norm": 1.6100066468309877, + "learning_rate": 1.6009895154853022e-06, + "loss": 0.2685, + "step": 25713 + }, + { + "epoch": 0.75, + "grad_norm": 1.2410758588151116, + "learning_rate": 1.6006450489584175e-06, + "loss": 0.2981, + "step": 25714 + }, + { + "epoch": 0.75, + "grad_norm": 1.2961085371961907, + "learning_rate": 1.6003006124308779e-06, + "loss": 0.2639, + "step": 25715 + }, + { + "epoch": 0.75, + "grad_norm": 1.2979721489329497, + "learning_rate": 1.5999562059057228e-06, + "loss": 0.2711, + "step": 25716 + }, + { + "epoch": 0.75, + "grad_norm": 1.3200610631705672, + "learning_rate": 1.5996118293859924e-06, + "loss": 0.2737, + "step": 25717 + }, + { + "epoch": 0.75, + "grad_norm": 1.2543791623111673, + "learning_rate": 1.599267482874725e-06, + "loss": 0.2754, + "step": 25718 + }, + { + "epoch": 0.75, + "grad_norm": 1.1924111725339133, + "learning_rate": 1.5989231663749616e-06, + "loss": 0.2594, + "step": 25719 + }, + { + "epoch": 0.75, + "grad_norm": 1.3633327120382113, + "learning_rate": 1.5985788798897374e-06, + "loss": 0.323, + "step": 25720 + }, + { + "epoch": 0.75, + "grad_norm": 1.427703559792124, + "learning_rate": 1.5982346234220925e-06, + "loss": 0.272, + "step": 25721 + }, + { + "epoch": 0.75, + "grad_norm": 1.4991810339868146, + "learning_rate": 1.5978903969750642e-06, + "loss": 0.2709, + "step": 25722 + }, + { + "epoch": 0.75, + "grad_norm": 1.2674862145892125, + "learning_rate": 1.5975462005516912e-06, + "loss": 0.2421, + "step": 25723 + }, + { + "epoch": 0.75, + "grad_norm": 1.474809255800903, + "learning_rate": 1.5972020341550126e-06, + "loss": 0.3044, + "step": 25724 + }, + { + "epoch": 0.75, + "grad_norm": 1.285900664210854, + "learning_rate": 1.5968578977880623e-06, + "loss": 0.2655, + "step": 25725 + }, + { + "epoch": 0.75, + "grad_norm": 1.398360544129751, + "learning_rate": 1.596513791453878e-06, + "loss": 0.2789, + "step": 25726 + }, + { + "epoch": 0.75, + "grad_norm": 2.277093219076622, + "learning_rate": 1.5961697151554979e-06, + "loss": 0.2674, + "step": 25727 + }, + { + "epoch": 0.75, + "grad_norm": 1.6432782864837077, + "learning_rate": 1.5958256688959578e-06, + "loss": 0.2808, + "step": 25728 + }, + { + "epoch": 0.75, + "grad_norm": 1.3513644959887325, + "learning_rate": 1.595481652678295e-06, + "loss": 0.2714, + "step": 25729 + }, + { + "epoch": 0.75, + "grad_norm": 1.5985099916367775, + "learning_rate": 1.5951376665055434e-06, + "loss": 0.2742, + "step": 25730 + }, + { + "epoch": 0.75, + "grad_norm": 1.289289431039835, + "learning_rate": 1.5947937103807392e-06, + "loss": 0.2813, + "step": 25731 + }, + { + "epoch": 0.75, + "grad_norm": 1.3551101799772025, + "learning_rate": 1.5944497843069185e-06, + "loss": 0.2709, + "step": 25732 + }, + { + "epoch": 0.75, + "grad_norm": 1.3928971094359615, + "learning_rate": 1.5941058882871163e-06, + "loss": 0.2713, + "step": 25733 + }, + { + "epoch": 0.75, + "grad_norm": 1.232868594886101, + "learning_rate": 1.5937620223243672e-06, + "loss": 0.2653, + "step": 25734 + }, + { + "epoch": 0.75, + "grad_norm": 1.2767739798031577, + "learning_rate": 1.5934181864217063e-06, + "loss": 0.2973, + "step": 25735 + }, + { + "epoch": 0.75, + "grad_norm": 1.3870766603907625, + "learning_rate": 1.5930743805821675e-06, + "loss": 0.2764, + "step": 25736 + }, + { + "epoch": 0.75, + "grad_norm": 1.363838001205103, + "learning_rate": 1.5927306048087855e-06, + "loss": 0.2969, + "step": 25737 + }, + { + "epoch": 0.75, + "grad_norm": 1.48088060520169, + "learning_rate": 1.5923868591045949e-06, + "loss": 0.2934, + "step": 25738 + }, + { + "epoch": 0.75, + "grad_norm": 1.3658739127241768, + "learning_rate": 1.5920431434726263e-06, + "loss": 0.2818, + "step": 25739 + }, + { + "epoch": 0.75, + "grad_norm": 3.3332351559649034, + "learning_rate": 1.5916994579159151e-06, + "loss": 0.2886, + "step": 25740 + }, + { + "epoch": 0.75, + "grad_norm": 1.355111962120831, + "learning_rate": 1.5913558024374943e-06, + "loss": 0.2901, + "step": 25741 + }, + { + "epoch": 0.75, + "grad_norm": 1.4601174914060715, + "learning_rate": 1.5910121770403958e-06, + "loss": 0.2797, + "step": 25742 + }, + { + "epoch": 0.75, + "grad_norm": 1.5845619460393348, + "learning_rate": 1.590668581727653e-06, + "loss": 0.2684, + "step": 25743 + }, + { + "epoch": 0.75, + "grad_norm": 1.44957303768252, + "learning_rate": 1.590325016502297e-06, + "loss": 0.2653, + "step": 25744 + }, + { + "epoch": 0.75, + "grad_norm": 1.409517342422128, + "learning_rate": 1.5899814813673614e-06, + "loss": 0.2744, + "step": 25745 + }, + { + "epoch": 0.75, + "grad_norm": 1.476215377125872, + "learning_rate": 1.5896379763258785e-06, + "loss": 0.273, + "step": 25746 + }, + { + "epoch": 0.75, + "grad_norm": 1.3717519296626133, + "learning_rate": 1.5892945013808764e-06, + "loss": 0.293, + "step": 25747 + }, + { + "epoch": 0.75, + "grad_norm": 2.1185101513629156, + "learning_rate": 1.5889510565353888e-06, + "loss": 0.288, + "step": 25748 + }, + { + "epoch": 0.75, + "grad_norm": 2.6037198620643167, + "learning_rate": 1.588607641792445e-06, + "loss": 0.2759, + "step": 25749 + }, + { + "epoch": 0.75, + "grad_norm": 1.2917391645400769, + "learning_rate": 1.5882642571550777e-06, + "loss": 0.2755, + "step": 25750 + }, + { + "epoch": 0.75, + "grad_norm": 1.407788677496391, + "learning_rate": 1.5879209026263153e-06, + "loss": 0.2835, + "step": 25751 + }, + { + "epoch": 0.75, + "grad_norm": 1.4571120133516284, + "learning_rate": 1.5875775782091907e-06, + "loss": 0.2899, + "step": 25752 + }, + { + "epoch": 0.75, + "grad_norm": 1.5684641478153543, + "learning_rate": 1.5872342839067305e-06, + "loss": 0.2793, + "step": 25753 + }, + { + "epoch": 0.75, + "grad_norm": 1.2589123424821962, + "learning_rate": 1.586891019721965e-06, + "loss": 0.2673, + "step": 25754 + }, + { + "epoch": 0.75, + "grad_norm": 1.326595274785167, + "learning_rate": 1.5865477856579248e-06, + "loss": 0.2647, + "step": 25755 + }, + { + "epoch": 0.75, + "grad_norm": 0.9400629425203924, + "learning_rate": 1.586204581717638e-06, + "loss": 0.548, + "step": 25756 + }, + { + "epoch": 0.75, + "grad_norm": 1.2736081378919428, + "learning_rate": 1.585861407904135e-06, + "loss": 0.2624, + "step": 25757 + }, + { + "epoch": 0.75, + "grad_norm": 1.2422257121023579, + "learning_rate": 1.5855182642204415e-06, + "loss": 0.2735, + "step": 25758 + }, + { + "epoch": 0.75, + "grad_norm": 1.533224631543361, + "learning_rate": 1.5851751506695873e-06, + "loss": 0.2717, + "step": 25759 + }, + { + "epoch": 0.75, + "grad_norm": 1.3328277808911548, + "learning_rate": 1.5848320672546003e-06, + "loss": 0.2992, + "step": 25760 + }, + { + "epoch": 0.75, + "grad_norm": 1.3968929241859538, + "learning_rate": 1.584489013978508e-06, + "loss": 0.2737, + "step": 25761 + }, + { + "epoch": 0.75, + "grad_norm": 1.9482673688695813, + "learning_rate": 1.5841459908443379e-06, + "loss": 0.2752, + "step": 25762 + }, + { + "epoch": 0.75, + "grad_norm": 1.3728186876160187, + "learning_rate": 1.5838029978551178e-06, + "loss": 0.2651, + "step": 25763 + }, + { + "epoch": 0.75, + "grad_norm": 1.3453692696252402, + "learning_rate": 1.5834600350138735e-06, + "loss": 0.2671, + "step": 25764 + }, + { + "epoch": 0.75, + "grad_norm": 1.552322895381843, + "learning_rate": 1.5831171023236347e-06, + "loss": 0.2797, + "step": 25765 + }, + { + "epoch": 0.75, + "grad_norm": 1.2875822512271657, + "learning_rate": 1.582774199787423e-06, + "loss": 0.2689, + "step": 25766 + }, + { + "epoch": 0.75, + "grad_norm": 1.5348170340543685, + "learning_rate": 1.5824313274082675e-06, + "loss": 0.2879, + "step": 25767 + }, + { + "epoch": 0.75, + "grad_norm": 1.3988896092502583, + "learning_rate": 1.5820884851891932e-06, + "loss": 0.2571, + "step": 25768 + }, + { + "epoch": 0.75, + "grad_norm": 1.6378835478438123, + "learning_rate": 1.581745673133226e-06, + "loss": 0.2993, + "step": 25769 + }, + { + "epoch": 0.75, + "grad_norm": 1.456440998403164, + "learning_rate": 1.581402891243391e-06, + "loss": 0.2813, + "step": 25770 + }, + { + "epoch": 0.75, + "grad_norm": 1.3613373683439747, + "learning_rate": 1.5810601395227133e-06, + "loss": 0.2688, + "step": 25771 + }, + { + "epoch": 0.75, + "grad_norm": 2.9949872378838793, + "learning_rate": 1.580717417974218e-06, + "loss": 0.2791, + "step": 25772 + }, + { + "epoch": 0.75, + "grad_norm": 1.3744543982961261, + "learning_rate": 1.5803747266009296e-06, + "loss": 0.2752, + "step": 25773 + }, + { + "epoch": 0.75, + "grad_norm": 1.4527417695022389, + "learning_rate": 1.5800320654058733e-06, + "loss": 0.2687, + "step": 25774 + }, + { + "epoch": 0.75, + "grad_norm": 1.337184634536786, + "learning_rate": 1.5796894343920705e-06, + "loss": 0.2609, + "step": 25775 + }, + { + "epoch": 0.75, + "grad_norm": 2.160645070933528, + "learning_rate": 1.5793468335625468e-06, + "loss": 0.2656, + "step": 25776 + }, + { + "epoch": 0.75, + "grad_norm": 1.3644035763810984, + "learning_rate": 1.579004262920325e-06, + "loss": 0.272, + "step": 25777 + }, + { + "epoch": 0.75, + "grad_norm": 1.2244640860661185, + "learning_rate": 1.5786617224684286e-06, + "loss": 0.2849, + "step": 25778 + }, + { + "epoch": 0.75, + "grad_norm": 1.3692678324682686, + "learning_rate": 1.57831921220988e-06, + "loss": 0.332, + "step": 25779 + }, + { + "epoch": 0.75, + "grad_norm": 1.3011596130691472, + "learning_rate": 1.5779767321477046e-06, + "loss": 0.2706, + "step": 25780 + }, + { + "epoch": 0.75, + "grad_norm": 1.3348065621689795, + "learning_rate": 1.5776342822849205e-06, + "loss": 0.3028, + "step": 25781 + }, + { + "epoch": 0.75, + "grad_norm": 1.7250803035783604, + "learning_rate": 1.5772918626245515e-06, + "loss": 0.2899, + "step": 25782 + }, + { + "epoch": 0.75, + "grad_norm": 2.3565034127615294, + "learning_rate": 1.5769494731696206e-06, + "loss": 0.2728, + "step": 25783 + }, + { + "epoch": 0.75, + "grad_norm": 1.6825663705462521, + "learning_rate": 1.5766071139231498e-06, + "loss": 0.2862, + "step": 25784 + }, + { + "epoch": 0.75, + "grad_norm": 1.363967873490713, + "learning_rate": 1.5762647848881575e-06, + "loss": 0.2719, + "step": 25785 + }, + { + "epoch": 0.75, + "grad_norm": 1.6154310325101822, + "learning_rate": 1.5759224860676663e-06, + "loss": 0.298, + "step": 25786 + }, + { + "epoch": 0.75, + "grad_norm": 1.2549194921125844, + "learning_rate": 1.5755802174646972e-06, + "loss": 0.2971, + "step": 25787 + }, + { + "epoch": 0.75, + "grad_norm": 1.462197367187963, + "learning_rate": 1.5752379790822708e-06, + "loss": 0.2817, + "step": 25788 + }, + { + "epoch": 0.75, + "grad_norm": 1.297154196573415, + "learning_rate": 1.5748957709234074e-06, + "loss": 0.2716, + "step": 25789 + }, + { + "epoch": 0.75, + "grad_norm": 1.3006576172590745, + "learning_rate": 1.5745535929911265e-06, + "loss": 0.2692, + "step": 25790 + }, + { + "epoch": 0.75, + "grad_norm": 1.3323687533454935, + "learning_rate": 1.5742114452884482e-06, + "loss": 0.3043, + "step": 25791 + }, + { + "epoch": 0.75, + "grad_norm": 1.4898506352503713, + "learning_rate": 1.573869327818392e-06, + "loss": 0.3254, + "step": 25792 + }, + { + "epoch": 0.75, + "grad_norm": 1.4936915152752892, + "learning_rate": 1.5735272405839786e-06, + "loss": 0.2771, + "step": 25793 + }, + { + "epoch": 0.75, + "grad_norm": 1.3860630134572387, + "learning_rate": 1.573185183588224e-06, + "loss": 0.2781, + "step": 25794 + }, + { + "epoch": 0.75, + "grad_norm": 1.1779017952520625, + "learning_rate": 1.572843156834148e-06, + "loss": 0.2674, + "step": 25795 + }, + { + "epoch": 0.75, + "grad_norm": 1.329340325199999, + "learning_rate": 1.5725011603247687e-06, + "loss": 0.2966, + "step": 25796 + }, + { + "epoch": 0.75, + "grad_norm": 1.2807254966579977, + "learning_rate": 1.5721591940631048e-06, + "loss": 0.2691, + "step": 25797 + }, + { + "epoch": 0.75, + "grad_norm": 1.3691174032304612, + "learning_rate": 1.5718172580521745e-06, + "loss": 0.2715, + "step": 25798 + }, + { + "epoch": 0.75, + "grad_norm": 1.344401906830256, + "learning_rate": 1.5714753522949943e-06, + "loss": 0.2747, + "step": 25799 + }, + { + "epoch": 0.75, + "grad_norm": 1.2361187293081697, + "learning_rate": 1.571133476794583e-06, + "loss": 0.2847, + "step": 25800 + }, + { + "epoch": 0.75, + "grad_norm": 1.9522336197989196, + "learning_rate": 1.570791631553958e-06, + "loss": 0.2702, + "step": 25801 + }, + { + "epoch": 0.75, + "grad_norm": 1.4429378161137334, + "learning_rate": 1.5704498165761329e-06, + "loss": 0.2946, + "step": 25802 + }, + { + "epoch": 0.75, + "grad_norm": 1.4246214326663726, + "learning_rate": 1.5701080318641265e-06, + "loss": 0.2636, + "step": 25803 + }, + { + "epoch": 0.75, + "grad_norm": 1.384975494999278, + "learning_rate": 1.5697662774209548e-06, + "loss": 0.2753, + "step": 25804 + }, + { + "epoch": 0.75, + "grad_norm": 1.2816825105804954, + "learning_rate": 1.569424553249634e-06, + "loss": 0.2883, + "step": 25805 + }, + { + "epoch": 0.75, + "grad_norm": 1.2675964122506964, + "learning_rate": 1.5690828593531792e-06, + "loss": 0.2597, + "step": 25806 + }, + { + "epoch": 0.75, + "grad_norm": 1.6242780620733244, + "learning_rate": 1.5687411957346066e-06, + "loss": 0.2706, + "step": 25807 + }, + { + "epoch": 0.75, + "grad_norm": 1.3440968923013024, + "learning_rate": 1.5683995623969323e-06, + "loss": 0.292, + "step": 25808 + }, + { + "epoch": 0.75, + "grad_norm": 1.2768479310151224, + "learning_rate": 1.5680579593431683e-06, + "loss": 0.2795, + "step": 25809 + }, + { + "epoch": 0.75, + "grad_norm": 1.368145561529811, + "learning_rate": 1.5677163865763311e-06, + "loss": 0.2792, + "step": 25810 + }, + { + "epoch": 0.75, + "grad_norm": 1.4429349030107126, + "learning_rate": 1.5673748440994347e-06, + "loss": 0.2799, + "step": 25811 + }, + { + "epoch": 0.75, + "grad_norm": 1.7614166771984683, + "learning_rate": 1.5670333319154951e-06, + "loss": 0.2661, + "step": 25812 + }, + { + "epoch": 0.75, + "grad_norm": 1.6314689433109841, + "learning_rate": 1.5666918500275225e-06, + "loss": 0.287, + "step": 25813 + }, + { + "epoch": 0.75, + "grad_norm": 1.468946955006192, + "learning_rate": 1.5663503984385325e-06, + "loss": 0.3012, + "step": 25814 + }, + { + "epoch": 0.75, + "grad_norm": 1.6511607266679649, + "learning_rate": 1.5660089771515386e-06, + "loss": 0.261, + "step": 25815 + }, + { + "epoch": 0.75, + "grad_norm": 1.3608265786912566, + "learning_rate": 1.5656675861695536e-06, + "loss": 0.2564, + "step": 25816 + }, + { + "epoch": 0.75, + "grad_norm": 1.6551440488435851, + "learning_rate": 1.5653262254955903e-06, + "loss": 0.2768, + "step": 25817 + }, + { + "epoch": 0.75, + "grad_norm": 1.437746411369172, + "learning_rate": 1.5649848951326608e-06, + "loss": 0.2834, + "step": 25818 + }, + { + "epoch": 0.75, + "grad_norm": 1.445830222038325, + "learning_rate": 1.564643595083778e-06, + "loss": 0.2683, + "step": 25819 + }, + { + "epoch": 0.75, + "grad_norm": 1.4469667230351668, + "learning_rate": 1.5643023253519552e-06, + "loss": 0.2838, + "step": 25820 + }, + { + "epoch": 0.75, + "grad_norm": 1.3931233071796527, + "learning_rate": 1.5639610859402004e-06, + "loss": 0.2816, + "step": 25821 + }, + { + "epoch": 0.75, + "grad_norm": 1.3263507347434356, + "learning_rate": 1.5636198768515276e-06, + "loss": 0.2856, + "step": 25822 + }, + { + "epoch": 0.75, + "grad_norm": 1.2876590748080232, + "learning_rate": 1.5632786980889475e-06, + "loss": 0.2802, + "step": 25823 + }, + { + "epoch": 0.75, + "grad_norm": 1.2671033943148846, + "learning_rate": 1.5629375496554706e-06, + "loss": 0.2637, + "step": 25824 + }, + { + "epoch": 0.75, + "grad_norm": 1.302513614064834, + "learning_rate": 1.5625964315541086e-06, + "loss": 0.2637, + "step": 25825 + }, + { + "epoch": 0.75, + "grad_norm": 1.3097365650625359, + "learning_rate": 1.5622553437878706e-06, + "loss": 0.2908, + "step": 25826 + }, + { + "epoch": 0.75, + "grad_norm": 1.2915915876354025, + "learning_rate": 1.5619142863597675e-06, + "loss": 0.2791, + "step": 25827 + }, + { + "epoch": 0.75, + "grad_norm": 1.3599883360997096, + "learning_rate": 1.5615732592728095e-06, + "loss": 0.3156, + "step": 25828 + }, + { + "epoch": 0.75, + "grad_norm": 0.9364226577131951, + "learning_rate": 1.5612322625300064e-06, + "loss": 0.5824, + "step": 25829 + }, + { + "epoch": 0.75, + "grad_norm": 1.3664865395591206, + "learning_rate": 1.5608912961343658e-06, + "loss": 0.2869, + "step": 25830 + }, + { + "epoch": 0.75, + "grad_norm": 1.5042093148232207, + "learning_rate": 1.5605503600888972e-06, + "loss": 0.334, + "step": 25831 + }, + { + "epoch": 0.75, + "grad_norm": 1.3264569676180245, + "learning_rate": 1.5602094543966102e-06, + "loss": 0.2715, + "step": 25832 + }, + { + "epoch": 0.75, + "grad_norm": 1.4601246921740845, + "learning_rate": 1.5598685790605128e-06, + "loss": 0.2946, + "step": 25833 + }, + { + "epoch": 0.75, + "grad_norm": 1.3599641240959022, + "learning_rate": 1.5595277340836135e-06, + "loss": 0.2888, + "step": 25834 + }, + { + "epoch": 0.75, + "grad_norm": 1.4375695807957225, + "learning_rate": 1.5591869194689197e-06, + "loss": 0.2883, + "step": 25835 + }, + { + "epoch": 0.75, + "grad_norm": 1.5559267673218884, + "learning_rate": 1.5588461352194412e-06, + "loss": 0.2739, + "step": 25836 + }, + { + "epoch": 0.75, + "grad_norm": 1.3484616480199436, + "learning_rate": 1.5585053813381823e-06, + "loss": 0.283, + "step": 25837 + }, + { + "epoch": 0.75, + "grad_norm": 1.3696889144277258, + "learning_rate": 1.5581646578281517e-06, + "loss": 0.2768, + "step": 25838 + }, + { + "epoch": 0.75, + "grad_norm": 2.4292265844707566, + "learning_rate": 1.5578239646923576e-06, + "loss": 0.2841, + "step": 25839 + }, + { + "epoch": 0.75, + "grad_norm": 1.3348923200880578, + "learning_rate": 1.5574833019338036e-06, + "loss": 0.2789, + "step": 25840 + }, + { + "epoch": 0.75, + "grad_norm": 1.2798166110051872, + "learning_rate": 1.5571426695554975e-06, + "loss": 0.2685, + "step": 25841 + }, + { + "epoch": 0.75, + "grad_norm": 1.4470443103145227, + "learning_rate": 1.5568020675604455e-06, + "loss": 0.2707, + "step": 25842 + }, + { + "epoch": 0.75, + "grad_norm": 1.617076452872189, + "learning_rate": 1.5564614959516538e-06, + "loss": 0.2638, + "step": 25843 + }, + { + "epoch": 0.75, + "grad_norm": 1.383323844447204, + "learning_rate": 1.5561209547321276e-06, + "loss": 0.2552, + "step": 25844 + }, + { + "epoch": 0.75, + "grad_norm": 1.2937118485342696, + "learning_rate": 1.5557804439048725e-06, + "loss": 0.3018, + "step": 25845 + }, + { + "epoch": 0.75, + "grad_norm": 1.4663024170252625, + "learning_rate": 1.5554399634728923e-06, + "loss": 0.2996, + "step": 25846 + }, + { + "epoch": 0.75, + "grad_norm": 1.368691467986407, + "learning_rate": 1.5550995134391933e-06, + "loss": 0.2718, + "step": 25847 + }, + { + "epoch": 0.75, + "grad_norm": 1.3265876756833443, + "learning_rate": 1.5547590938067803e-06, + "loss": 0.2843, + "step": 25848 + }, + { + "epoch": 0.75, + "grad_norm": 1.4817924808606198, + "learning_rate": 1.5544187045786552e-06, + "loss": 0.3078, + "step": 25849 + }, + { + "epoch": 0.75, + "grad_norm": 1.2819425803860454, + "learning_rate": 1.5540783457578234e-06, + "loss": 0.2735, + "step": 25850 + }, + { + "epoch": 0.75, + "grad_norm": 1.2568589383086586, + "learning_rate": 1.5537380173472888e-06, + "loss": 0.2714, + "step": 25851 + }, + { + "epoch": 0.75, + "grad_norm": 1.9417287940179773, + "learning_rate": 1.5533977193500538e-06, + "loss": 0.2958, + "step": 25852 + }, + { + "epoch": 0.75, + "grad_norm": 1.2473520523364812, + "learning_rate": 1.5530574517691221e-06, + "loss": 0.2743, + "step": 25853 + }, + { + "epoch": 0.75, + "grad_norm": 1.4945340083656886, + "learning_rate": 1.552717214607497e-06, + "loss": 0.2797, + "step": 25854 + }, + { + "epoch": 0.75, + "grad_norm": 1.3361323804206924, + "learning_rate": 1.5523770078681805e-06, + "loss": 0.2803, + "step": 25855 + }, + { + "epoch": 0.75, + "grad_norm": 1.4587440416764887, + "learning_rate": 1.5520368315541772e-06, + "loss": 0.2782, + "step": 25856 + }, + { + "epoch": 0.75, + "grad_norm": 1.4352375647951066, + "learning_rate": 1.551696685668485e-06, + "loss": 0.2904, + "step": 25857 + }, + { + "epoch": 0.75, + "grad_norm": 1.4840431481664078, + "learning_rate": 1.5513565702141082e-06, + "loss": 0.3092, + "step": 25858 + }, + { + "epoch": 0.75, + "grad_norm": 1.3991597097902333, + "learning_rate": 1.5510164851940474e-06, + "loss": 0.2925, + "step": 25859 + }, + { + "epoch": 0.75, + "grad_norm": 1.330997783852502, + "learning_rate": 1.5506764306113048e-06, + "loss": 0.2713, + "step": 25860 + }, + { + "epoch": 0.75, + "grad_norm": 1.5094712711008575, + "learning_rate": 1.5503364064688807e-06, + "loss": 0.2886, + "step": 25861 + }, + { + "epoch": 0.75, + "grad_norm": 1.3918763759797286, + "learning_rate": 1.5499964127697764e-06, + "loss": 0.2663, + "step": 25862 + }, + { + "epoch": 0.75, + "grad_norm": 1.5065877178014906, + "learning_rate": 1.549656449516992e-06, + "loss": 0.2782, + "step": 25863 + }, + { + "epoch": 0.75, + "grad_norm": 1.2766110917509823, + "learning_rate": 1.5493165167135288e-06, + "loss": 0.2892, + "step": 25864 + }, + { + "epoch": 0.75, + "grad_norm": 1.5998502675252366, + "learning_rate": 1.5489766143623846e-06, + "loss": 0.28, + "step": 25865 + }, + { + "epoch": 0.75, + "grad_norm": 1.4390979892534106, + "learning_rate": 1.54863674246656e-06, + "loss": 0.2566, + "step": 25866 + }, + { + "epoch": 0.75, + "grad_norm": 1.4243396990736117, + "learning_rate": 1.5482969010290555e-06, + "loss": 0.3065, + "step": 25867 + }, + { + "epoch": 0.75, + "grad_norm": 1.3076791085108432, + "learning_rate": 1.5479570900528678e-06, + "loss": 0.2664, + "step": 25868 + }, + { + "epoch": 0.75, + "grad_norm": 1.53243678741332, + "learning_rate": 1.5476173095409974e-06, + "loss": 0.3108, + "step": 25869 + }, + { + "epoch": 0.75, + "grad_norm": 1.3523929320120196, + "learning_rate": 1.5472775594964424e-06, + "loss": 0.2633, + "step": 25870 + }, + { + "epoch": 0.75, + "grad_norm": 1.2331915181567905, + "learning_rate": 1.5469378399222013e-06, + "loss": 0.2781, + "step": 25871 + }, + { + "epoch": 0.75, + "grad_norm": 1.1628375527002628, + "learning_rate": 1.546598150821272e-06, + "loss": 0.2707, + "step": 25872 + }, + { + "epoch": 0.75, + "grad_norm": 1.443070883098289, + "learning_rate": 1.5462584921966522e-06, + "loss": 0.283, + "step": 25873 + }, + { + "epoch": 0.75, + "grad_norm": 1.470871186432327, + "learning_rate": 1.5459188640513394e-06, + "loss": 0.2827, + "step": 25874 + }, + { + "epoch": 0.75, + "grad_norm": 1.4443196362365185, + "learning_rate": 1.5455792663883329e-06, + "loss": 0.2847, + "step": 25875 + }, + { + "epoch": 0.75, + "grad_norm": 2.7279713815368005, + "learning_rate": 1.5452396992106255e-06, + "loss": 0.2922, + "step": 25876 + }, + { + "epoch": 0.75, + "grad_norm": 1.4423012080268072, + "learning_rate": 1.5449001625212168e-06, + "loss": 0.2715, + "step": 25877 + }, + { + "epoch": 0.75, + "grad_norm": 1.3069181301825945, + "learning_rate": 1.5445606563231024e-06, + "loss": 0.2859, + "step": 25878 + }, + { + "epoch": 0.75, + "grad_norm": 1.3966020860588653, + "learning_rate": 1.544221180619278e-06, + "loss": 0.2893, + "step": 25879 + }, + { + "epoch": 0.75, + "grad_norm": 1.367951910677265, + "learning_rate": 1.5438817354127406e-06, + "loss": 0.2781, + "step": 25880 + }, + { + "epoch": 0.75, + "grad_norm": 1.9439614967598526, + "learning_rate": 1.5435423207064849e-06, + "loss": 0.3264, + "step": 25881 + }, + { + "epoch": 0.75, + "grad_norm": 1.4330157637578325, + "learning_rate": 1.5432029365035066e-06, + "loss": 0.2671, + "step": 25882 + }, + { + "epoch": 0.75, + "grad_norm": 2.217866055801727, + "learning_rate": 1.5428635828068011e-06, + "loss": 0.2983, + "step": 25883 + }, + { + "epoch": 0.75, + "grad_norm": 1.4048863641996285, + "learning_rate": 1.5425242596193641e-06, + "loss": 0.292, + "step": 25884 + }, + { + "epoch": 0.75, + "grad_norm": 2.138458340879609, + "learning_rate": 1.542184966944187e-06, + "loss": 0.2789, + "step": 25885 + }, + { + "epoch": 0.75, + "grad_norm": 1.3497885547413673, + "learning_rate": 1.541845704784266e-06, + "loss": 0.2676, + "step": 25886 + }, + { + "epoch": 0.75, + "grad_norm": 1.2054318857984765, + "learning_rate": 1.5415064731425955e-06, + "loss": 0.2582, + "step": 25887 + }, + { + "epoch": 0.75, + "grad_norm": 1.4842065532035646, + "learning_rate": 1.5411672720221682e-06, + "loss": 0.3143, + "step": 25888 + }, + { + "epoch": 0.75, + "grad_norm": 1.2098319309697934, + "learning_rate": 1.5408281014259785e-06, + "loss": 0.2707, + "step": 25889 + }, + { + "epoch": 0.75, + "grad_norm": 1.3565078007894815, + "learning_rate": 1.5404889613570189e-06, + "loss": 0.2826, + "step": 25890 + }, + { + "epoch": 0.75, + "grad_norm": 1.1949468305609248, + "learning_rate": 1.5401498518182827e-06, + "loss": 0.2701, + "step": 25891 + }, + { + "epoch": 0.75, + "grad_norm": 1.2003513229853164, + "learning_rate": 1.5398107728127643e-06, + "loss": 0.2652, + "step": 25892 + }, + { + "epoch": 0.75, + "grad_norm": 1.390558282991935, + "learning_rate": 1.5394717243434526e-06, + "loss": 0.2785, + "step": 25893 + }, + { + "epoch": 0.75, + "grad_norm": 1.3968976773936805, + "learning_rate": 1.5391327064133426e-06, + "loss": 0.276, + "step": 25894 + }, + { + "epoch": 0.75, + "grad_norm": 1.3796387522740505, + "learning_rate": 1.5387937190254232e-06, + "loss": 0.2699, + "step": 25895 + }, + { + "epoch": 0.75, + "grad_norm": 1.3942062500944914, + "learning_rate": 1.5384547621826878e-06, + "loss": 0.2799, + "step": 25896 + }, + { + "epoch": 0.75, + "grad_norm": 1.272748105358723, + "learning_rate": 1.5381158358881277e-06, + "loss": 0.2774, + "step": 25897 + }, + { + "epoch": 0.75, + "grad_norm": 1.2974562379752035, + "learning_rate": 1.5377769401447335e-06, + "loss": 0.2785, + "step": 25898 + }, + { + "epoch": 0.75, + "grad_norm": 1.2173377138321655, + "learning_rate": 1.5374380749554963e-06, + "loss": 0.2703, + "step": 25899 + }, + { + "epoch": 0.75, + "grad_norm": 2.1942525367848513, + "learning_rate": 1.5370992403234064e-06, + "loss": 0.2926, + "step": 25900 + }, + { + "epoch": 0.75, + "grad_norm": 1.4098030231765486, + "learning_rate": 1.5367604362514543e-06, + "loss": 0.2818, + "step": 25901 + }, + { + "epoch": 0.75, + "grad_norm": 1.4629724145008918, + "learning_rate": 1.5364216627426298e-06, + "loss": 0.2951, + "step": 25902 + }, + { + "epoch": 0.75, + "grad_norm": 1.2657860192791162, + "learning_rate": 1.536082919799924e-06, + "loss": 0.2582, + "step": 25903 + }, + { + "epoch": 0.75, + "grad_norm": 1.5127725087916402, + "learning_rate": 1.5357442074263235e-06, + "loss": 0.2888, + "step": 25904 + }, + { + "epoch": 0.75, + "grad_norm": 1.3162306492971463, + "learning_rate": 1.5354055256248185e-06, + "loss": 0.2583, + "step": 25905 + }, + { + "epoch": 0.75, + "grad_norm": 1.3552788542769565, + "learning_rate": 1.5350668743983982e-06, + "loss": 0.2836, + "step": 25906 + }, + { + "epoch": 0.75, + "grad_norm": 1.4656863962063251, + "learning_rate": 1.5347282537500513e-06, + "loss": 0.2893, + "step": 25907 + }, + { + "epoch": 0.75, + "grad_norm": 1.3509479980030616, + "learning_rate": 1.5343896636827655e-06, + "loss": 0.2819, + "step": 25908 + }, + { + "epoch": 0.75, + "grad_norm": 1.5690845766904975, + "learning_rate": 1.5340511041995298e-06, + "loss": 0.2985, + "step": 25909 + }, + { + "epoch": 0.75, + "grad_norm": 1.4298538987858047, + "learning_rate": 1.5337125753033316e-06, + "loss": 0.3249, + "step": 25910 + }, + { + "epoch": 0.75, + "grad_norm": 1.4364471895134368, + "learning_rate": 1.53337407699716e-06, + "loss": 0.2712, + "step": 25911 + }, + { + "epoch": 0.75, + "grad_norm": 1.5327405538393248, + "learning_rate": 1.5330356092839988e-06, + "loss": 0.2699, + "step": 25912 + }, + { + "epoch": 0.75, + "grad_norm": 1.3534889148670712, + "learning_rate": 1.5326971721668372e-06, + "loss": 0.2921, + "step": 25913 + }, + { + "epoch": 0.75, + "grad_norm": 1.2649187928543457, + "learning_rate": 1.532358765648661e-06, + "loss": 0.2781, + "step": 25914 + }, + { + "epoch": 0.75, + "grad_norm": 1.41983093007392, + "learning_rate": 1.5320203897324576e-06, + "loss": 0.2809, + "step": 25915 + }, + { + "epoch": 0.75, + "grad_norm": 1.2984028828252323, + "learning_rate": 1.5316820444212127e-06, + "loss": 0.2649, + "step": 25916 + }, + { + "epoch": 0.75, + "grad_norm": 1.4398959460064638, + "learning_rate": 1.5313437297179119e-06, + "loss": 0.2946, + "step": 25917 + }, + { + "epoch": 0.75, + "grad_norm": 1.9442750223985203, + "learning_rate": 1.5310054456255412e-06, + "loss": 0.3086, + "step": 25918 + }, + { + "epoch": 0.75, + "grad_norm": 1.3168579526073287, + "learning_rate": 1.5306671921470862e-06, + "loss": 0.2731, + "step": 25919 + }, + { + "epoch": 0.75, + "grad_norm": 1.394746116958413, + "learning_rate": 1.5303289692855333e-06, + "loss": 0.292, + "step": 25920 + }, + { + "epoch": 0.75, + "grad_norm": 2.310314770930365, + "learning_rate": 1.529990777043866e-06, + "loss": 0.2704, + "step": 25921 + }, + { + "epoch": 0.75, + "grad_norm": 1.4969782714414224, + "learning_rate": 1.5296526154250662e-06, + "loss": 0.2905, + "step": 25922 + }, + { + "epoch": 0.75, + "grad_norm": 1.4756682503433962, + "learning_rate": 1.5293144844321212e-06, + "loss": 0.2716, + "step": 25923 + }, + { + "epoch": 0.75, + "grad_norm": 1.5511317046482447, + "learning_rate": 1.5289763840680144e-06, + "loss": 0.285, + "step": 25924 + }, + { + "epoch": 0.75, + "grad_norm": 1.2901186965680767, + "learning_rate": 1.5286383143357293e-06, + "loss": 0.2734, + "step": 25925 + }, + { + "epoch": 0.75, + "grad_norm": 1.7027454028230107, + "learning_rate": 1.5283002752382493e-06, + "loss": 0.2764, + "step": 25926 + }, + { + "epoch": 0.75, + "grad_norm": 1.2715123408249018, + "learning_rate": 1.5279622667785581e-06, + "loss": 0.2648, + "step": 25927 + }, + { + "epoch": 0.75, + "grad_norm": 1.4789773883525292, + "learning_rate": 1.5276242889596388e-06, + "loss": 0.2722, + "step": 25928 + }, + { + "epoch": 0.75, + "grad_norm": 1.5120552356209833, + "learning_rate": 1.527286341784473e-06, + "loss": 0.3126, + "step": 25929 + }, + { + "epoch": 0.75, + "grad_norm": 1.6370731051112335, + "learning_rate": 1.5269484252560457e-06, + "loss": 0.2791, + "step": 25930 + }, + { + "epoch": 0.75, + "grad_norm": 1.2632345486162457, + "learning_rate": 1.5266105393773355e-06, + "loss": 0.2508, + "step": 25931 + }, + { + "epoch": 0.75, + "grad_norm": 0.9566089993672516, + "learning_rate": 1.526272684151326e-06, + "loss": 0.5676, + "step": 25932 + }, + { + "epoch": 0.75, + "grad_norm": 0.9836985747567534, + "learning_rate": 1.5259348595809981e-06, + "loss": 0.5543, + "step": 25933 + }, + { + "epoch": 0.75, + "grad_norm": 1.3333452949358917, + "learning_rate": 1.5255970656693342e-06, + "loss": 0.3015, + "step": 25934 + }, + { + "epoch": 0.75, + "grad_norm": 1.2568831564338976, + "learning_rate": 1.5252593024193141e-06, + "loss": 0.2857, + "step": 25935 + }, + { + "epoch": 0.75, + "grad_norm": 1.3049656945988395, + "learning_rate": 1.5249215698339198e-06, + "loss": 0.2726, + "step": 25936 + }, + { + "epoch": 0.75, + "grad_norm": 1.407706926245472, + "learning_rate": 1.5245838679161307e-06, + "loss": 0.2594, + "step": 25937 + }, + { + "epoch": 0.75, + "grad_norm": 1.2859366704049053, + "learning_rate": 1.5242461966689282e-06, + "loss": 0.2657, + "step": 25938 + }, + { + "epoch": 0.75, + "grad_norm": 1.419982196613019, + "learning_rate": 1.5239085560952927e-06, + "loss": 0.2777, + "step": 25939 + }, + { + "epoch": 0.75, + "grad_norm": 1.3278018129305016, + "learning_rate": 1.5235709461982012e-06, + "loss": 0.2933, + "step": 25940 + }, + { + "epoch": 0.75, + "grad_norm": 1.3195341091086017, + "learning_rate": 1.5232333669806344e-06, + "loss": 0.2727, + "step": 25941 + }, + { + "epoch": 0.75, + "grad_norm": 1.8424457647688668, + "learning_rate": 1.5228958184455723e-06, + "loss": 0.2849, + "step": 25942 + }, + { + "epoch": 0.75, + "grad_norm": 1.4530738511350727, + "learning_rate": 1.5225583005959927e-06, + "loss": 0.2563, + "step": 25943 + }, + { + "epoch": 0.75, + "grad_norm": 1.4220522379073868, + "learning_rate": 1.5222208134348748e-06, + "loss": 0.2671, + "step": 25944 + }, + { + "epoch": 0.75, + "grad_norm": 1.265854504676344, + "learning_rate": 1.5218833569651963e-06, + "loss": 0.2587, + "step": 25945 + }, + { + "epoch": 0.75, + "grad_norm": 1.3838836915471864, + "learning_rate": 1.5215459311899366e-06, + "loss": 0.287, + "step": 25946 + }, + { + "epoch": 0.75, + "grad_norm": 1.509620948223178, + "learning_rate": 1.5212085361120737e-06, + "loss": 0.3169, + "step": 25947 + }, + { + "epoch": 0.75, + "grad_norm": 1.5094204008791472, + "learning_rate": 1.520871171734583e-06, + "loss": 0.2667, + "step": 25948 + }, + { + "epoch": 0.75, + "grad_norm": 1.4197402560093144, + "learning_rate": 1.5205338380604439e-06, + "loss": 0.2972, + "step": 25949 + }, + { + "epoch": 0.75, + "grad_norm": 1.3366841410126293, + "learning_rate": 1.5201965350926312e-06, + "loss": 0.279, + "step": 25950 + }, + { + "epoch": 0.75, + "grad_norm": 1.3513806923696015, + "learning_rate": 1.519859262834122e-06, + "loss": 0.2966, + "step": 25951 + }, + { + "epoch": 0.75, + "grad_norm": 1.2593297274828028, + "learning_rate": 1.5195220212878935e-06, + "loss": 0.2652, + "step": 25952 + }, + { + "epoch": 0.75, + "grad_norm": 1.367840189692326, + "learning_rate": 1.5191848104569223e-06, + "loss": 0.2891, + "step": 25953 + }, + { + "epoch": 0.75, + "grad_norm": 1.321606171583103, + "learning_rate": 1.5188476303441835e-06, + "loss": 0.3052, + "step": 25954 + }, + { + "epoch": 0.75, + "grad_norm": 1.3111410187656505, + "learning_rate": 1.518510480952653e-06, + "loss": 0.2682, + "step": 25955 + }, + { + "epoch": 0.75, + "grad_norm": 3.864907596267365, + "learning_rate": 1.5181733622853062e-06, + "loss": 0.3138, + "step": 25956 + }, + { + "epoch": 0.75, + "grad_norm": 1.2408152872011171, + "learning_rate": 1.5178362743451197e-06, + "loss": 0.2787, + "step": 25957 + }, + { + "epoch": 0.75, + "grad_norm": 1.359687320377138, + "learning_rate": 1.517499217135065e-06, + "loss": 0.2741, + "step": 25958 + }, + { + "epoch": 0.75, + "grad_norm": 1.2724622328054387, + "learning_rate": 1.517162190658118e-06, + "loss": 0.2501, + "step": 25959 + }, + { + "epoch": 0.75, + "grad_norm": 1.391430140835537, + "learning_rate": 1.5168251949172536e-06, + "loss": 0.2894, + "step": 25960 + }, + { + "epoch": 0.75, + "grad_norm": 1.2412267332175095, + "learning_rate": 1.5164882299154459e-06, + "loss": 0.2711, + "step": 25961 + }, + { + "epoch": 0.75, + "grad_norm": 1.3753253148654525, + "learning_rate": 1.516151295655668e-06, + "loss": 0.3088, + "step": 25962 + }, + { + "epoch": 0.75, + "grad_norm": 1.384912570170314, + "learning_rate": 1.515814392140893e-06, + "loss": 0.3303, + "step": 25963 + }, + { + "epoch": 0.75, + "grad_norm": 1.4733591844577636, + "learning_rate": 1.515477519374095e-06, + "loss": 0.2719, + "step": 25964 + }, + { + "epoch": 0.75, + "grad_norm": 1.4679547442300191, + "learning_rate": 1.5151406773582466e-06, + "loss": 0.2718, + "step": 25965 + }, + { + "epoch": 0.75, + "grad_norm": 1.3882173145540073, + "learning_rate": 1.5148038660963222e-06, + "loss": 0.2905, + "step": 25966 + }, + { + "epoch": 0.75, + "grad_norm": 1.2949234205793347, + "learning_rate": 1.5144670855912908e-06, + "loss": 0.2952, + "step": 25967 + }, + { + "epoch": 0.75, + "grad_norm": 1.3097064681646295, + "learning_rate": 1.5141303358461256e-06, + "loss": 0.2612, + "step": 25968 + }, + { + "epoch": 0.75, + "grad_norm": 1.4767568751474054, + "learning_rate": 1.513793616863799e-06, + "loss": 0.289, + "step": 25969 + }, + { + "epoch": 0.75, + "grad_norm": 1.6334667733122414, + "learning_rate": 1.5134569286472823e-06, + "loss": 0.272, + "step": 25970 + }, + { + "epoch": 0.75, + "grad_norm": 1.468645668824123, + "learning_rate": 1.513120271199547e-06, + "loss": 0.2698, + "step": 25971 + }, + { + "epoch": 0.75, + "grad_norm": 1.9431923817209193, + "learning_rate": 1.5127836445235639e-06, + "loss": 0.2819, + "step": 25972 + }, + { + "epoch": 0.75, + "grad_norm": 1.3970142306631628, + "learning_rate": 1.5124470486223042e-06, + "loss": 0.2746, + "step": 25973 + }, + { + "epoch": 0.75, + "grad_norm": 2.068439614684957, + "learning_rate": 1.5121104834987378e-06, + "loss": 0.281, + "step": 25974 + }, + { + "epoch": 0.75, + "grad_norm": 1.2775880336173644, + "learning_rate": 1.5117739491558364e-06, + "loss": 0.2766, + "step": 25975 + }, + { + "epoch": 0.75, + "grad_norm": 1.868921514453276, + "learning_rate": 1.5114374455965685e-06, + "loss": 0.2854, + "step": 25976 + }, + { + "epoch": 0.75, + "grad_norm": 1.4916303828460882, + "learning_rate": 1.511100972823903e-06, + "loss": 0.3082, + "step": 25977 + }, + { + "epoch": 0.75, + "grad_norm": 1.35861630645898, + "learning_rate": 1.5107645308408092e-06, + "loss": 0.2831, + "step": 25978 + }, + { + "epoch": 0.75, + "grad_norm": 1.4356277794982537, + "learning_rate": 1.5104281196502579e-06, + "loss": 0.2797, + "step": 25979 + }, + { + "epoch": 0.75, + "grad_norm": 1.4111100839725754, + "learning_rate": 1.5100917392552167e-06, + "loss": 0.2617, + "step": 25980 + }, + { + "epoch": 0.75, + "grad_norm": 2.2862736347333215, + "learning_rate": 1.5097553896586547e-06, + "loss": 0.2844, + "step": 25981 + }, + { + "epoch": 0.75, + "grad_norm": 1.4166636020765453, + "learning_rate": 1.5094190708635404e-06, + "loss": 0.2822, + "step": 25982 + }, + { + "epoch": 0.75, + "grad_norm": 1.3769385943968524, + "learning_rate": 1.509082782872841e-06, + "loss": 0.27, + "step": 25983 + }, + { + "epoch": 0.75, + "grad_norm": 2.1140775493185786, + "learning_rate": 1.508746525689525e-06, + "loss": 0.2936, + "step": 25984 + }, + { + "epoch": 0.75, + "grad_norm": 1.4439008303333476, + "learning_rate": 1.5084102993165612e-06, + "loss": 0.2817, + "step": 25985 + }, + { + "epoch": 0.75, + "grad_norm": 1.2861391876948296, + "learning_rate": 1.5080741037569141e-06, + "loss": 0.2912, + "step": 25986 + }, + { + "epoch": 0.75, + "grad_norm": 1.313856270616859, + "learning_rate": 1.5077379390135511e-06, + "loss": 0.2891, + "step": 25987 + }, + { + "epoch": 0.75, + "grad_norm": 1.4037898535622015, + "learning_rate": 1.5074018050894402e-06, + "loss": 0.2708, + "step": 25988 + }, + { + "epoch": 0.75, + "grad_norm": 1.3236983095768204, + "learning_rate": 1.5070657019875461e-06, + "loss": 0.2968, + "step": 25989 + }, + { + "epoch": 0.75, + "grad_norm": 1.375593424600674, + "learning_rate": 1.506729629710837e-06, + "loss": 0.2767, + "step": 25990 + }, + { + "epoch": 0.75, + "grad_norm": 1.661547317090865, + "learning_rate": 1.506393588262277e-06, + "loss": 0.279, + "step": 25991 + }, + { + "epoch": 0.75, + "grad_norm": 1.3378928589068657, + "learning_rate": 1.5060575776448327e-06, + "loss": 0.268, + "step": 25992 + }, + { + "epoch": 0.75, + "grad_norm": 1.6966929444258934, + "learning_rate": 1.5057215978614703e-06, + "loss": 0.2626, + "step": 25993 + }, + { + "epoch": 0.75, + "grad_norm": 1.2520666875413005, + "learning_rate": 1.5053856489151524e-06, + "loss": 0.2869, + "step": 25994 + }, + { + "epoch": 0.75, + "grad_norm": 1.4910068197015398, + "learning_rate": 1.5050497308088446e-06, + "loss": 0.26, + "step": 25995 + }, + { + "epoch": 0.75, + "grad_norm": 1.5494271965019029, + "learning_rate": 1.5047138435455116e-06, + "loss": 0.2925, + "step": 25996 + }, + { + "epoch": 0.75, + "grad_norm": 1.4094345047814856, + "learning_rate": 1.5043779871281184e-06, + "loss": 0.2783, + "step": 25997 + }, + { + "epoch": 0.75, + "grad_norm": 1.4541070268889023, + "learning_rate": 1.5040421615596273e-06, + "loss": 0.29, + "step": 25998 + }, + { + "epoch": 0.75, + "grad_norm": 1.3662238085877594, + "learning_rate": 1.5037063668430035e-06, + "loss": 0.2796, + "step": 25999 + }, + { + "epoch": 0.75, + "grad_norm": 1.4239384666627304, + "learning_rate": 1.5033706029812095e-06, + "loss": 0.2794, + "step": 26000 + }, + { + "epoch": 0.75, + "grad_norm": 1.3986420797172507, + "learning_rate": 1.503034869977209e-06, + "loss": 0.2715, + "step": 26001 + }, + { + "epoch": 0.75, + "grad_norm": 1.3249128583106944, + "learning_rate": 1.5026991678339658e-06, + "loss": 0.2557, + "step": 26002 + }, + { + "epoch": 0.75, + "grad_norm": 1.8745814092719109, + "learning_rate": 1.5023634965544404e-06, + "loss": 0.29, + "step": 26003 + }, + { + "epoch": 0.75, + "grad_norm": 1.671150200864774, + "learning_rate": 1.5020278561415951e-06, + "loss": 0.2587, + "step": 26004 + }, + { + "epoch": 0.75, + "grad_norm": 1.5780319897289568, + "learning_rate": 1.5016922465983947e-06, + "loss": 0.2791, + "step": 26005 + }, + { + "epoch": 0.75, + "grad_norm": 1.4295381542302985, + "learning_rate": 1.5013566679277975e-06, + "loss": 0.2802, + "step": 26006 + }, + { + "epoch": 0.75, + "grad_norm": 1.0091166506557026, + "learning_rate": 1.5010211201327663e-06, + "loss": 0.6133, + "step": 26007 + }, + { + "epoch": 0.75, + "grad_norm": 1.5367349702694493, + "learning_rate": 1.5006856032162626e-06, + "loss": 0.2834, + "step": 26008 + }, + { + "epoch": 0.75, + "grad_norm": 1.503738130946839, + "learning_rate": 1.5003501171812473e-06, + "loss": 0.2818, + "step": 26009 + }, + { + "epoch": 0.75, + "grad_norm": 1.2802795315921631, + "learning_rate": 1.500014662030681e-06, + "loss": 0.296, + "step": 26010 + }, + { + "epoch": 0.75, + "grad_norm": 1.8599084247967543, + "learning_rate": 1.4996792377675245e-06, + "loss": 0.2793, + "step": 26011 + }, + { + "epoch": 0.75, + "grad_norm": 1.2758646146821442, + "learning_rate": 1.499343844394739e-06, + "loss": 0.2805, + "step": 26012 + }, + { + "epoch": 0.75, + "grad_norm": 1.4751864975151991, + "learning_rate": 1.499008481915281e-06, + "loss": 0.3265, + "step": 26013 + }, + { + "epoch": 0.75, + "grad_norm": 1.2122020365900799, + "learning_rate": 1.4986731503321123e-06, + "loss": 0.2957, + "step": 26014 + }, + { + "epoch": 0.75, + "grad_norm": 1.7082406839593314, + "learning_rate": 1.4983378496481914e-06, + "loss": 0.2921, + "step": 26015 + }, + { + "epoch": 0.75, + "grad_norm": 0.9959991351505303, + "learning_rate": 1.4980025798664783e-06, + "loss": 0.5878, + "step": 26016 + }, + { + "epoch": 0.75, + "grad_norm": 1.245820429660752, + "learning_rate": 1.4976673409899312e-06, + "loss": 0.264, + "step": 26017 + }, + { + "epoch": 0.75, + "grad_norm": 2.994770638733541, + "learning_rate": 1.4973321330215085e-06, + "loss": 0.2627, + "step": 26018 + }, + { + "epoch": 0.75, + "grad_norm": 1.25513333155212, + "learning_rate": 1.496996955964169e-06, + "loss": 0.262, + "step": 26019 + }, + { + "epoch": 0.75, + "grad_norm": 1.46133774969529, + "learning_rate": 1.4966618098208696e-06, + "loss": 0.2706, + "step": 26020 + }, + { + "epoch": 0.75, + "grad_norm": 1.3722749207689324, + "learning_rate": 1.4963266945945704e-06, + "loss": 0.2868, + "step": 26021 + }, + { + "epoch": 0.75, + "grad_norm": 1.7068143440114538, + "learning_rate": 1.4959916102882254e-06, + "loss": 0.2617, + "step": 26022 + }, + { + "epoch": 0.75, + "grad_norm": 1.3114843275523902, + "learning_rate": 1.4956565569047937e-06, + "loss": 0.3035, + "step": 26023 + }, + { + "epoch": 0.75, + "grad_norm": 1.5275712772718402, + "learning_rate": 1.4953215344472316e-06, + "loss": 0.2518, + "step": 26024 + }, + { + "epoch": 0.75, + "grad_norm": 1.3628282658624251, + "learning_rate": 1.4949865429184956e-06, + "loss": 0.2593, + "step": 26025 + }, + { + "epoch": 0.75, + "grad_norm": 1.3850468718081785, + "learning_rate": 1.4946515823215424e-06, + "loss": 0.2966, + "step": 26026 + }, + { + "epoch": 0.75, + "grad_norm": 1.6034599808110972, + "learning_rate": 1.4943166526593278e-06, + "loss": 0.3036, + "step": 26027 + }, + { + "epoch": 0.75, + "grad_norm": 2.831615382696091, + "learning_rate": 1.4939817539348078e-06, + "loss": 0.2759, + "step": 26028 + }, + { + "epoch": 0.75, + "grad_norm": 1.2858130958716152, + "learning_rate": 1.493646886150939e-06, + "loss": 0.2795, + "step": 26029 + }, + { + "epoch": 0.75, + "grad_norm": 1.5877935627288609, + "learning_rate": 1.493312049310674e-06, + "loss": 0.3046, + "step": 26030 + }, + { + "epoch": 0.76, + "grad_norm": 1.465911248650485, + "learning_rate": 1.4929772434169688e-06, + "loss": 0.3087, + "step": 26031 + }, + { + "epoch": 0.76, + "grad_norm": 1.3683651084840966, + "learning_rate": 1.4926424684727786e-06, + "loss": 0.2912, + "step": 26032 + }, + { + "epoch": 0.76, + "grad_norm": 1.2595735541774193, + "learning_rate": 1.4923077244810592e-06, + "loss": 0.2751, + "step": 26033 + }, + { + "epoch": 0.76, + "grad_norm": 2.0408870898762395, + "learning_rate": 1.4919730114447612e-06, + "loss": 0.2631, + "step": 26034 + }, + { + "epoch": 0.76, + "grad_norm": 1.4389014074533752, + "learning_rate": 1.49163832936684e-06, + "loss": 0.2853, + "step": 26035 + }, + { + "epoch": 0.76, + "grad_norm": 9.126941621322347, + "learning_rate": 1.49130367825025e-06, + "loss": 0.2749, + "step": 26036 + }, + { + "epoch": 0.76, + "grad_norm": 1.2963183450138078, + "learning_rate": 1.4909690580979435e-06, + "loss": 0.289, + "step": 26037 + }, + { + "epoch": 0.76, + "grad_norm": 1.3359177801007545, + "learning_rate": 1.4906344689128744e-06, + "loss": 0.2782, + "step": 26038 + }, + { + "epoch": 0.76, + "grad_norm": 1.3739893233180311, + "learning_rate": 1.4902999106979948e-06, + "loss": 0.324, + "step": 26039 + }, + { + "epoch": 0.76, + "grad_norm": 1.3557026511743695, + "learning_rate": 1.4899653834562594e-06, + "loss": 0.2856, + "step": 26040 + }, + { + "epoch": 0.76, + "grad_norm": 1.355066897083615, + "learning_rate": 1.4896308871906163e-06, + "loss": 0.3117, + "step": 26041 + }, + { + "epoch": 0.76, + "grad_norm": 1.3797855556574377, + "learning_rate": 1.4892964219040202e-06, + "loss": 0.3008, + "step": 26042 + }, + { + "epoch": 0.76, + "grad_norm": 1.3192740646047225, + "learning_rate": 1.4889619875994216e-06, + "loss": 0.288, + "step": 26043 + }, + { + "epoch": 0.76, + "grad_norm": 1.252134079191009, + "learning_rate": 1.4886275842797726e-06, + "loss": 0.2766, + "step": 26044 + }, + { + "epoch": 0.76, + "grad_norm": 0.950564083533902, + "learning_rate": 1.4882932119480243e-06, + "loss": 0.5995, + "step": 26045 + }, + { + "epoch": 0.76, + "grad_norm": 1.289821075057867, + "learning_rate": 1.4879588706071268e-06, + "loss": 0.3015, + "step": 26046 + }, + { + "epoch": 0.76, + "grad_norm": 1.3715530910106695, + "learning_rate": 1.4876245602600314e-06, + "loss": 0.3021, + "step": 26047 + }, + { + "epoch": 0.76, + "grad_norm": 1.6170616457058127, + "learning_rate": 1.4872902809096901e-06, + "loss": 0.2758, + "step": 26048 + }, + { + "epoch": 0.76, + "grad_norm": 1.373793042888645, + "learning_rate": 1.4869560325590492e-06, + "loss": 0.2776, + "step": 26049 + }, + { + "epoch": 0.76, + "grad_norm": 1.2518514807588983, + "learning_rate": 1.4866218152110606e-06, + "loss": 0.2585, + "step": 26050 + }, + { + "epoch": 0.76, + "grad_norm": 1.3780143299263525, + "learning_rate": 1.4862876288686734e-06, + "loss": 0.297, + "step": 26051 + }, + { + "epoch": 0.76, + "grad_norm": 1.3270069719538087, + "learning_rate": 1.4859534735348369e-06, + "loss": 0.2927, + "step": 26052 + }, + { + "epoch": 0.76, + "grad_norm": 1.181795947555778, + "learning_rate": 1.4856193492124992e-06, + "loss": 0.2767, + "step": 26053 + }, + { + "epoch": 0.76, + "grad_norm": 1.406694453053295, + "learning_rate": 1.4852852559046104e-06, + "loss": 0.2815, + "step": 26054 + }, + { + "epoch": 0.76, + "grad_norm": 1.274647019448104, + "learning_rate": 1.4849511936141186e-06, + "loss": 0.2715, + "step": 26055 + }, + { + "epoch": 0.76, + "grad_norm": 1.2841748650427036, + "learning_rate": 1.484617162343971e-06, + "loss": 0.2558, + "step": 26056 + }, + { + "epoch": 0.76, + "grad_norm": 1.3889514072498204, + "learning_rate": 1.4842831620971177e-06, + "loss": 0.281, + "step": 26057 + }, + { + "epoch": 0.76, + "grad_norm": 1.3068877942942627, + "learning_rate": 1.4839491928765027e-06, + "loss": 0.3108, + "step": 26058 + }, + { + "epoch": 0.76, + "grad_norm": 1.2571967853763426, + "learning_rate": 1.483615254685075e-06, + "loss": 0.2709, + "step": 26059 + }, + { + "epoch": 0.76, + "grad_norm": 1.2752739748406663, + "learning_rate": 1.4832813475257818e-06, + "loss": 0.279, + "step": 26060 + }, + { + "epoch": 0.76, + "grad_norm": 1.5697123524010215, + "learning_rate": 1.4829474714015713e-06, + "loss": 0.2779, + "step": 26061 + }, + { + "epoch": 0.76, + "grad_norm": 1.3562949661949184, + "learning_rate": 1.4826136263153867e-06, + "loss": 0.291, + "step": 26062 + }, + { + "epoch": 0.76, + "grad_norm": 1.3372677455484663, + "learning_rate": 1.4822798122701764e-06, + "loss": 0.305, + "step": 26063 + }, + { + "epoch": 0.76, + "grad_norm": 1.3365540234610849, + "learning_rate": 1.4819460292688853e-06, + "loss": 0.2831, + "step": 26064 + }, + { + "epoch": 0.76, + "grad_norm": 1.2735082323276254, + "learning_rate": 1.4816122773144598e-06, + "loss": 0.2753, + "step": 26065 + }, + { + "epoch": 0.76, + "grad_norm": 1.285804914373283, + "learning_rate": 1.4812785564098448e-06, + "loss": 0.2887, + "step": 26066 + }, + { + "epoch": 0.76, + "grad_norm": 1.3071443403050096, + "learning_rate": 1.4809448665579868e-06, + "loss": 0.2744, + "step": 26067 + }, + { + "epoch": 0.76, + "grad_norm": 1.2248941111712992, + "learning_rate": 1.4806112077618284e-06, + "loss": 0.2574, + "step": 26068 + }, + { + "epoch": 0.76, + "grad_norm": 1.3579276228492403, + "learning_rate": 1.4802775800243147e-06, + "loss": 0.2819, + "step": 26069 + }, + { + "epoch": 0.76, + "grad_norm": 1.4297062763798618, + "learning_rate": 1.4799439833483908e-06, + "loss": 0.2875, + "step": 26070 + }, + { + "epoch": 0.76, + "grad_norm": 1.3861890607596412, + "learning_rate": 1.4796104177370002e-06, + "loss": 0.2859, + "step": 26071 + }, + { + "epoch": 0.76, + "grad_norm": 1.4179449170514484, + "learning_rate": 1.4792768831930865e-06, + "loss": 0.2777, + "step": 26072 + }, + { + "epoch": 0.76, + "grad_norm": 1.2665369295865205, + "learning_rate": 1.4789433797195935e-06, + "loss": 0.2873, + "step": 26073 + }, + { + "epoch": 0.76, + "grad_norm": 1.286777819927756, + "learning_rate": 1.4786099073194644e-06, + "loss": 0.2911, + "step": 26074 + }, + { + "epoch": 0.76, + "grad_norm": 1.4769335431657271, + "learning_rate": 1.4782764659956417e-06, + "loss": 0.2691, + "step": 26075 + }, + { + "epoch": 0.76, + "grad_norm": 1.2067255998923407, + "learning_rate": 1.47794305575107e-06, + "loss": 0.2595, + "step": 26076 + }, + { + "epoch": 0.76, + "grad_norm": 1.4146474072675215, + "learning_rate": 1.4776096765886877e-06, + "loss": 0.2746, + "step": 26077 + }, + { + "epoch": 0.76, + "grad_norm": 1.3601621612643262, + "learning_rate": 1.47727632851144e-06, + "loss": 0.2819, + "step": 26078 + }, + { + "epoch": 0.76, + "grad_norm": 1.4581748604782871, + "learning_rate": 1.4769430115222672e-06, + "loss": 0.263, + "step": 26079 + }, + { + "epoch": 0.76, + "grad_norm": 1.2922235285730161, + "learning_rate": 1.4766097256241112e-06, + "loss": 0.271, + "step": 26080 + }, + { + "epoch": 0.76, + "grad_norm": 2.118086827658984, + "learning_rate": 1.476276470819914e-06, + "loss": 0.2704, + "step": 26081 + }, + { + "epoch": 0.76, + "grad_norm": 1.4105198249490256, + "learning_rate": 1.4759432471126162e-06, + "loss": 0.2878, + "step": 26082 + }, + { + "epoch": 0.76, + "grad_norm": 5.679984182398607, + "learning_rate": 1.4756100545051577e-06, + "loss": 0.3307, + "step": 26083 + }, + { + "epoch": 0.76, + "grad_norm": 1.64949411893405, + "learning_rate": 1.4752768930004813e-06, + "loss": 0.3066, + "step": 26084 + }, + { + "epoch": 0.76, + "grad_norm": 1.8659435585216828, + "learning_rate": 1.474943762601524e-06, + "loss": 0.2769, + "step": 26085 + }, + { + "epoch": 0.76, + "grad_norm": 1.3113514378367688, + "learning_rate": 1.4746106633112273e-06, + "loss": 0.2607, + "step": 26086 + }, + { + "epoch": 0.76, + "grad_norm": 1.3573068368215848, + "learning_rate": 1.4742775951325305e-06, + "loss": 0.284, + "step": 26087 + }, + { + "epoch": 0.76, + "grad_norm": 2.012590723869657, + "learning_rate": 1.4739445580683725e-06, + "loss": 0.2733, + "step": 26088 + }, + { + "epoch": 0.76, + "grad_norm": 1.343330515641007, + "learning_rate": 1.4736115521216953e-06, + "loss": 0.2883, + "step": 26089 + }, + { + "epoch": 0.76, + "grad_norm": 1.3083447854172974, + "learning_rate": 1.4732785772954333e-06, + "loss": 0.2833, + "step": 26090 + }, + { + "epoch": 0.76, + "grad_norm": 1.2902663546036632, + "learning_rate": 1.4729456335925268e-06, + "loss": 0.2606, + "step": 26091 + }, + { + "epoch": 0.76, + "grad_norm": 1.4276989273730478, + "learning_rate": 1.4726127210159147e-06, + "loss": 0.2754, + "step": 26092 + }, + { + "epoch": 0.76, + "grad_norm": 0.9864026461454473, + "learning_rate": 1.472279839568534e-06, + "loss": 0.5686, + "step": 26093 + }, + { + "epoch": 0.76, + "grad_norm": 1.3732806042217858, + "learning_rate": 1.4719469892533227e-06, + "loss": 0.2611, + "step": 26094 + }, + { + "epoch": 0.76, + "grad_norm": 2.072902446622466, + "learning_rate": 1.4716141700732207e-06, + "loss": 0.27, + "step": 26095 + }, + { + "epoch": 0.76, + "grad_norm": 1.604536622644212, + "learning_rate": 1.4712813820311604e-06, + "loss": 0.2574, + "step": 26096 + }, + { + "epoch": 0.76, + "grad_norm": 1.525569373081468, + "learning_rate": 1.4709486251300815e-06, + "loss": 0.3004, + "step": 26097 + }, + { + "epoch": 0.76, + "grad_norm": 1.5029624576566056, + "learning_rate": 1.4706158993729204e-06, + "loss": 0.2921, + "step": 26098 + }, + { + "epoch": 0.76, + "grad_norm": 1.879308566141685, + "learning_rate": 1.4702832047626125e-06, + "loss": 0.313, + "step": 26099 + }, + { + "epoch": 0.76, + "grad_norm": 1.6991066706672406, + "learning_rate": 1.4699505413020949e-06, + "loss": 0.2777, + "step": 26100 + }, + { + "epoch": 0.76, + "grad_norm": 1.3930273185249153, + "learning_rate": 1.4696179089943024e-06, + "loss": 0.2703, + "step": 26101 + }, + { + "epoch": 0.76, + "grad_norm": 1.4299979740537452, + "learning_rate": 1.4692853078421714e-06, + "loss": 0.2747, + "step": 26102 + }, + { + "epoch": 0.76, + "grad_norm": 1.375209274617097, + "learning_rate": 1.4689527378486385e-06, + "loss": 0.2713, + "step": 26103 + }, + { + "epoch": 0.76, + "grad_norm": 1.2805654854453277, + "learning_rate": 1.4686201990166348e-06, + "loss": 0.2636, + "step": 26104 + }, + { + "epoch": 0.76, + "grad_norm": 1.253304500443133, + "learning_rate": 1.4682876913490973e-06, + "loss": 0.256, + "step": 26105 + }, + { + "epoch": 0.76, + "grad_norm": 1.557393937602966, + "learning_rate": 1.46795521484896e-06, + "loss": 0.2928, + "step": 26106 + }, + { + "epoch": 0.76, + "grad_norm": 1.3191957626838566, + "learning_rate": 1.4676227695191576e-06, + "loss": 0.2967, + "step": 26107 + }, + { + "epoch": 0.76, + "grad_norm": 1.2533265215539682, + "learning_rate": 1.4672903553626228e-06, + "loss": 0.2685, + "step": 26108 + }, + { + "epoch": 0.76, + "grad_norm": 1.478978046705049, + "learning_rate": 1.4669579723822902e-06, + "loss": 0.2671, + "step": 26109 + }, + { + "epoch": 0.76, + "grad_norm": 1.4079622507117788, + "learning_rate": 1.4666256205810924e-06, + "loss": 0.2931, + "step": 26110 + }, + { + "epoch": 0.76, + "grad_norm": 2.3660452998801875, + "learning_rate": 1.4662932999619634e-06, + "loss": 0.2704, + "step": 26111 + }, + { + "epoch": 0.76, + "grad_norm": 1.3133936694409027, + "learning_rate": 1.465961010527836e-06, + "loss": 0.2694, + "step": 26112 + }, + { + "epoch": 0.76, + "grad_norm": 1.4546240762542593, + "learning_rate": 1.4656287522816405e-06, + "loss": 0.2676, + "step": 26113 + }, + { + "epoch": 0.76, + "grad_norm": 1.4132650653456655, + "learning_rate": 1.4652965252263107e-06, + "loss": 0.2751, + "step": 26114 + }, + { + "epoch": 0.76, + "grad_norm": 1.2902635443429296, + "learning_rate": 1.464964329364778e-06, + "loss": 0.2773, + "step": 26115 + }, + { + "epoch": 0.76, + "grad_norm": 1.5843005289027867, + "learning_rate": 1.4646321646999744e-06, + "loss": 0.3003, + "step": 26116 + }, + { + "epoch": 0.76, + "grad_norm": 1.3791861923459507, + "learning_rate": 1.4643000312348326e-06, + "loss": 0.3044, + "step": 26117 + }, + { + "epoch": 0.76, + "grad_norm": 1.4927315151712972, + "learning_rate": 1.4639679289722807e-06, + "loss": 0.2622, + "step": 26118 + }, + { + "epoch": 0.76, + "grad_norm": 1.4220742037585046, + "learning_rate": 1.4636358579152515e-06, + "loss": 0.2984, + "step": 26119 + }, + { + "epoch": 0.76, + "grad_norm": 1.6794601838309582, + "learning_rate": 1.4633038180666747e-06, + "loss": 0.2656, + "step": 26120 + }, + { + "epoch": 0.76, + "grad_norm": 1.281196736375135, + "learning_rate": 1.462971809429481e-06, + "loss": 0.2787, + "step": 26121 + }, + { + "epoch": 0.76, + "grad_norm": 1.5696325581589936, + "learning_rate": 1.462639832006602e-06, + "loss": 0.2702, + "step": 26122 + }, + { + "epoch": 0.76, + "grad_norm": 1.192196586382008, + "learning_rate": 1.4623078858009642e-06, + "loss": 0.2658, + "step": 26123 + }, + { + "epoch": 0.76, + "grad_norm": 1.348538625465295, + "learning_rate": 1.4619759708154986e-06, + "loss": 0.2589, + "step": 26124 + }, + { + "epoch": 0.76, + "grad_norm": 1.377127823284043, + "learning_rate": 1.461644087053134e-06, + "loss": 0.2766, + "step": 26125 + }, + { + "epoch": 0.76, + "grad_norm": 2.2527819925510526, + "learning_rate": 1.4613122345167996e-06, + "loss": 0.2803, + "step": 26126 + }, + { + "epoch": 0.76, + "grad_norm": 1.6068641393519945, + "learning_rate": 1.460980413209424e-06, + "loss": 0.2718, + "step": 26127 + }, + { + "epoch": 0.76, + "grad_norm": 1.2208092466171638, + "learning_rate": 1.4606486231339355e-06, + "loss": 0.2655, + "step": 26128 + }, + { + "epoch": 0.76, + "grad_norm": 1.5966261502684875, + "learning_rate": 1.4603168642932625e-06, + "loss": 0.2695, + "step": 26129 + }, + { + "epoch": 0.76, + "grad_norm": 1.2957100048675232, + "learning_rate": 1.4599851366903324e-06, + "loss": 0.2676, + "step": 26130 + }, + { + "epoch": 0.76, + "grad_norm": 1.2560307480742414, + "learning_rate": 1.4596534403280737e-06, + "loss": 0.2695, + "step": 26131 + }, + { + "epoch": 0.76, + "grad_norm": 1.3274610823562867, + "learning_rate": 1.4593217752094114e-06, + "loss": 0.2873, + "step": 26132 + }, + { + "epoch": 0.76, + "grad_norm": 1.2727977493072609, + "learning_rate": 1.4589901413372737e-06, + "loss": 0.272, + "step": 26133 + }, + { + "epoch": 0.76, + "grad_norm": 1.2640451965407622, + "learning_rate": 1.4586585387145873e-06, + "loss": 0.2647, + "step": 26134 + }, + { + "epoch": 0.76, + "grad_norm": 1.211853350191326, + "learning_rate": 1.458326967344279e-06, + "loss": 0.2714, + "step": 26135 + }, + { + "epoch": 0.76, + "grad_norm": 1.265316575367196, + "learning_rate": 1.457995427229274e-06, + "loss": 0.2792, + "step": 26136 + }, + { + "epoch": 0.76, + "grad_norm": 1.5384510659659176, + "learning_rate": 1.4576639183724988e-06, + "loss": 0.2699, + "step": 26137 + }, + { + "epoch": 0.76, + "grad_norm": 1.796935210908636, + "learning_rate": 1.457332440776879e-06, + "loss": 0.2938, + "step": 26138 + }, + { + "epoch": 0.76, + "grad_norm": 1.3985433616400234, + "learning_rate": 1.4570009944453411e-06, + "loss": 0.2976, + "step": 26139 + }, + { + "epoch": 0.76, + "grad_norm": 1.398553160465767, + "learning_rate": 1.4566695793808072e-06, + "loss": 0.2764, + "step": 26140 + }, + { + "epoch": 0.76, + "grad_norm": 1.4128699815014252, + "learning_rate": 1.4563381955862039e-06, + "loss": 0.2591, + "step": 26141 + }, + { + "epoch": 0.76, + "grad_norm": 1.2480858383203466, + "learning_rate": 1.4560068430644548e-06, + "loss": 0.2538, + "step": 26142 + }, + { + "epoch": 0.76, + "grad_norm": 1.4208857728725268, + "learning_rate": 1.4556755218184849e-06, + "loss": 0.2715, + "step": 26143 + }, + { + "epoch": 0.76, + "grad_norm": 1.2155396136749193, + "learning_rate": 1.4553442318512178e-06, + "loss": 0.2586, + "step": 26144 + }, + { + "epoch": 0.76, + "grad_norm": 1.272574643698851, + "learning_rate": 1.4550129731655788e-06, + "loss": 0.2768, + "step": 26145 + }, + { + "epoch": 0.76, + "grad_norm": 1.3417390054543819, + "learning_rate": 1.4546817457644879e-06, + "loss": 0.296, + "step": 26146 + }, + { + "epoch": 0.76, + "grad_norm": 1.4244098735350792, + "learning_rate": 1.4543505496508697e-06, + "loss": 0.2786, + "step": 26147 + }, + { + "epoch": 0.76, + "grad_norm": 1.3843102362522919, + "learning_rate": 1.454019384827648e-06, + "loss": 0.3106, + "step": 26148 + }, + { + "epoch": 0.76, + "grad_norm": 1.3654341324737354, + "learning_rate": 1.4536882512977435e-06, + "loss": 0.2788, + "step": 26149 + }, + { + "epoch": 0.76, + "grad_norm": 1.5344535781770021, + "learning_rate": 1.4533571490640819e-06, + "loss": 0.2841, + "step": 26150 + }, + { + "epoch": 0.76, + "grad_norm": 1.265370581105724, + "learning_rate": 1.4530260781295813e-06, + "loss": 0.279, + "step": 26151 + }, + { + "epoch": 0.76, + "grad_norm": 1.3933135423137386, + "learning_rate": 1.4526950384971644e-06, + "loss": 0.2997, + "step": 26152 + }, + { + "epoch": 0.76, + "grad_norm": 1.431810983284959, + "learning_rate": 1.4523640301697533e-06, + "loss": 0.2433, + "step": 26153 + }, + { + "epoch": 0.76, + "grad_norm": 1.4810980517829655, + "learning_rate": 1.4520330531502691e-06, + "loss": 0.3068, + "step": 26154 + }, + { + "epoch": 0.76, + "grad_norm": 1.30944315642463, + "learning_rate": 1.4517021074416327e-06, + "loss": 0.3226, + "step": 26155 + }, + { + "epoch": 0.76, + "grad_norm": 1.332044495555545, + "learning_rate": 1.4513711930467645e-06, + "loss": 0.2731, + "step": 26156 + }, + { + "epoch": 0.76, + "grad_norm": 1.1466514295124437, + "learning_rate": 1.4510403099685845e-06, + "loss": 0.2711, + "step": 26157 + }, + { + "epoch": 0.76, + "grad_norm": 1.4575039829336376, + "learning_rate": 1.450709458210015e-06, + "loss": 0.2807, + "step": 26158 + }, + { + "epoch": 0.76, + "grad_norm": 0.961727336877591, + "learning_rate": 1.450378637773972e-06, + "loss": 0.5316, + "step": 26159 + }, + { + "epoch": 0.76, + "grad_norm": 1.2778441113588999, + "learning_rate": 1.4500478486633774e-06, + "loss": 0.2688, + "step": 26160 + }, + { + "epoch": 0.76, + "grad_norm": 1.331607477920519, + "learning_rate": 1.4497170908811498e-06, + "loss": 0.2856, + "step": 26161 + }, + { + "epoch": 0.76, + "grad_norm": 1.4840279630862756, + "learning_rate": 1.4493863644302082e-06, + "loss": 0.257, + "step": 26162 + }, + { + "epoch": 0.76, + "grad_norm": 1.4798508467306641, + "learning_rate": 1.449055669313471e-06, + "loss": 0.2735, + "step": 26163 + }, + { + "epoch": 0.76, + "grad_norm": 1.6894119128456155, + "learning_rate": 1.4487250055338575e-06, + "loss": 0.2755, + "step": 26164 + }, + { + "epoch": 0.76, + "grad_norm": 1.253799219833812, + "learning_rate": 1.4483943730942851e-06, + "loss": 0.2564, + "step": 26165 + }, + { + "epoch": 0.76, + "grad_norm": 1.403019012119659, + "learning_rate": 1.4480637719976715e-06, + "loss": 0.321, + "step": 26166 + }, + { + "epoch": 0.76, + "grad_norm": 1.317868941459807, + "learning_rate": 1.447733202246936e-06, + "loss": 0.2737, + "step": 26167 + }, + { + "epoch": 0.76, + "grad_norm": 1.424440823781929, + "learning_rate": 1.4474026638449934e-06, + "loss": 0.2778, + "step": 26168 + }, + { + "epoch": 0.76, + "grad_norm": 1.4227357708085115, + "learning_rate": 1.447072156794761e-06, + "loss": 0.2813, + "step": 26169 + }, + { + "epoch": 0.76, + "grad_norm": 1.459253203667481, + "learning_rate": 1.446741681099157e-06, + "loss": 0.2791, + "step": 26170 + }, + { + "epoch": 0.76, + "grad_norm": 3.9930447986026927, + "learning_rate": 1.446411236761096e-06, + "loss": 0.2728, + "step": 26171 + }, + { + "epoch": 0.76, + "grad_norm": 1.2801256710974227, + "learning_rate": 1.4460808237834962e-06, + "loss": 0.2933, + "step": 26172 + }, + { + "epoch": 0.76, + "grad_norm": 1.3016119369958419, + "learning_rate": 1.4457504421692736e-06, + "loss": 0.2862, + "step": 26173 + }, + { + "epoch": 0.76, + "grad_norm": 1.406435166376852, + "learning_rate": 1.4454200919213418e-06, + "loss": 0.2881, + "step": 26174 + }, + { + "epoch": 0.76, + "grad_norm": 1.5735006840069192, + "learning_rate": 1.4450897730426166e-06, + "loss": 0.2721, + "step": 26175 + }, + { + "epoch": 0.76, + "grad_norm": 1.4007342004812628, + "learning_rate": 1.4447594855360137e-06, + "loss": 0.2838, + "step": 26176 + }, + { + "epoch": 0.76, + "grad_norm": 2.2116435303443844, + "learning_rate": 1.4444292294044493e-06, + "loss": 0.2979, + "step": 26177 + }, + { + "epoch": 0.76, + "grad_norm": 2.4978976432157403, + "learning_rate": 1.4440990046508352e-06, + "loss": 0.2833, + "step": 26178 + }, + { + "epoch": 0.76, + "grad_norm": 1.9224939430482995, + "learning_rate": 1.4437688112780863e-06, + "loss": 0.2621, + "step": 26179 + }, + { + "epoch": 0.76, + "grad_norm": 1.3557985874898009, + "learning_rate": 1.4434386492891173e-06, + "loss": 0.28, + "step": 26180 + }, + { + "epoch": 0.76, + "grad_norm": 1.3715400417175947, + "learning_rate": 1.4431085186868416e-06, + "loss": 0.2718, + "step": 26181 + }, + { + "epoch": 0.76, + "grad_norm": 0.9943599275179704, + "learning_rate": 1.4427784194741728e-06, + "loss": 0.5849, + "step": 26182 + }, + { + "epoch": 0.76, + "grad_norm": 1.3053053784501927, + "learning_rate": 1.4424483516540239e-06, + "loss": 0.2854, + "step": 26183 + }, + { + "epoch": 0.76, + "grad_norm": 1.3363105880745336, + "learning_rate": 1.4421183152293079e-06, + "loss": 0.2635, + "step": 26184 + }, + { + "epoch": 0.76, + "grad_norm": 1.2779241018681402, + "learning_rate": 1.4417883102029367e-06, + "loss": 0.2895, + "step": 26185 + }, + { + "epoch": 0.76, + "grad_norm": 1.2813968702292073, + "learning_rate": 1.4414583365778257e-06, + "loss": 0.2655, + "step": 26186 + }, + { + "epoch": 0.76, + "grad_norm": 1.7843654763030137, + "learning_rate": 1.441128394356882e-06, + "loss": 0.2621, + "step": 26187 + }, + { + "epoch": 0.76, + "grad_norm": 1.2767347991579556, + "learning_rate": 1.44079848354302e-06, + "loss": 0.2705, + "step": 26188 + }, + { + "epoch": 0.76, + "grad_norm": 2.5555810360256266, + "learning_rate": 1.4404686041391502e-06, + "loss": 0.2841, + "step": 26189 + }, + { + "epoch": 0.76, + "grad_norm": 1.3355531092568862, + "learning_rate": 1.4401387561481856e-06, + "loss": 0.2684, + "step": 26190 + }, + { + "epoch": 0.76, + "grad_norm": 1.4642794156539751, + "learning_rate": 1.4398089395730352e-06, + "loss": 0.2831, + "step": 26191 + }, + { + "epoch": 0.76, + "grad_norm": 1.3192755869665875, + "learning_rate": 1.4394791544166103e-06, + "loss": 0.2834, + "step": 26192 + }, + { + "epoch": 0.76, + "grad_norm": 1.2385684927398053, + "learning_rate": 1.4391494006818219e-06, + "loss": 0.2913, + "step": 26193 + }, + { + "epoch": 0.76, + "grad_norm": 1.6351768260359358, + "learning_rate": 1.4388196783715808e-06, + "loss": 0.2713, + "step": 26194 + }, + { + "epoch": 0.76, + "grad_norm": 1.3463269330459455, + "learning_rate": 1.4384899874887942e-06, + "loss": 0.287, + "step": 26195 + }, + { + "epoch": 0.76, + "grad_norm": 1.275114642501222, + "learning_rate": 1.4381603280363726e-06, + "loss": 0.2642, + "step": 26196 + }, + { + "epoch": 0.76, + "grad_norm": 1.2957242679036243, + "learning_rate": 1.437830700017226e-06, + "loss": 0.2703, + "step": 26197 + }, + { + "epoch": 0.76, + "grad_norm": 1.7637154678714588, + "learning_rate": 1.4375011034342622e-06, + "loss": 0.2819, + "step": 26198 + }, + { + "epoch": 0.76, + "grad_norm": 1.2175852994038885, + "learning_rate": 1.4371715382903918e-06, + "loss": 0.2625, + "step": 26199 + }, + { + "epoch": 0.76, + "grad_norm": 1.2571758989610526, + "learning_rate": 1.436842004588521e-06, + "loss": 0.2776, + "step": 26200 + }, + { + "epoch": 0.76, + "grad_norm": 1.3995228873539765, + "learning_rate": 1.436512502331562e-06, + "loss": 0.3086, + "step": 26201 + }, + { + "epoch": 0.76, + "grad_norm": 1.343225544823888, + "learning_rate": 1.4361830315224168e-06, + "loss": 0.3117, + "step": 26202 + }, + { + "epoch": 0.76, + "grad_norm": 1.3138814746630778, + "learning_rate": 1.4358535921639965e-06, + "loss": 0.2647, + "step": 26203 + }, + { + "epoch": 0.76, + "grad_norm": 1.279787546187356, + "learning_rate": 1.4355241842592077e-06, + "loss": 0.2806, + "step": 26204 + }, + { + "epoch": 0.76, + "grad_norm": 1.3765385303395634, + "learning_rate": 1.435194807810959e-06, + "loss": 0.2636, + "step": 26205 + }, + { + "epoch": 0.76, + "grad_norm": 1.3727614277652755, + "learning_rate": 1.4348654628221542e-06, + "loss": 0.2985, + "step": 26206 + }, + { + "epoch": 0.76, + "grad_norm": 1.26215790608739, + "learning_rate": 1.434536149295701e-06, + "loss": 0.2719, + "step": 26207 + }, + { + "epoch": 0.76, + "grad_norm": 1.4255354969801102, + "learning_rate": 1.4342068672345061e-06, + "loss": 0.2612, + "step": 26208 + }, + { + "epoch": 0.76, + "grad_norm": 1.6636519665910567, + "learning_rate": 1.4338776166414753e-06, + "loss": 0.3074, + "step": 26209 + }, + { + "epoch": 0.76, + "grad_norm": 1.3833192236936138, + "learning_rate": 1.4335483975195135e-06, + "loss": 0.2838, + "step": 26210 + }, + { + "epoch": 0.76, + "grad_norm": 1.265216921189868, + "learning_rate": 1.4332192098715275e-06, + "loss": 0.2823, + "step": 26211 + }, + { + "epoch": 0.76, + "grad_norm": 1.389550527081871, + "learning_rate": 1.432890053700421e-06, + "loss": 0.2842, + "step": 26212 + }, + { + "epoch": 0.76, + "grad_norm": 1.4539157585968028, + "learning_rate": 1.4325609290091008e-06, + "loss": 0.2924, + "step": 26213 + }, + { + "epoch": 0.76, + "grad_norm": 1.2513551726684278, + "learning_rate": 1.4322318358004688e-06, + "loss": 0.2691, + "step": 26214 + }, + { + "epoch": 0.76, + "grad_norm": 1.5316080051873109, + "learning_rate": 1.43190277407743e-06, + "loss": 0.2863, + "step": 26215 + }, + { + "epoch": 0.76, + "grad_norm": 1.3126302907827532, + "learning_rate": 1.431573743842889e-06, + "loss": 0.2767, + "step": 26216 + }, + { + "epoch": 0.76, + "grad_norm": 1.2949705371769886, + "learning_rate": 1.4312447450997497e-06, + "loss": 0.2845, + "step": 26217 + }, + { + "epoch": 0.76, + "grad_norm": 1.3774265673563397, + "learning_rate": 1.4309157778509148e-06, + "loss": 0.3237, + "step": 26218 + }, + { + "epoch": 0.76, + "grad_norm": 3.0730877253004527, + "learning_rate": 1.4305868420992875e-06, + "loss": 0.2595, + "step": 26219 + }, + { + "epoch": 0.76, + "grad_norm": 1.395696737856854, + "learning_rate": 1.430257937847771e-06, + "loss": 0.2914, + "step": 26220 + }, + { + "epoch": 0.76, + "grad_norm": 1.5100874788411092, + "learning_rate": 1.4299290650992682e-06, + "loss": 0.2767, + "step": 26221 + }, + { + "epoch": 0.76, + "grad_norm": 1.7152604542562477, + "learning_rate": 1.429600223856682e-06, + "loss": 0.2819, + "step": 26222 + }, + { + "epoch": 0.76, + "grad_norm": 1.6834435526018396, + "learning_rate": 1.4292714141229124e-06, + "loss": 0.2591, + "step": 26223 + }, + { + "epoch": 0.76, + "grad_norm": 1.3159254613885774, + "learning_rate": 1.4289426359008619e-06, + "loss": 0.2981, + "step": 26224 + }, + { + "epoch": 0.76, + "grad_norm": 1.2449732135321216, + "learning_rate": 1.428613889193432e-06, + "loss": 0.2695, + "step": 26225 + }, + { + "epoch": 0.76, + "grad_norm": 1.3989698802412125, + "learning_rate": 1.4282851740035247e-06, + "loss": 0.2936, + "step": 26226 + }, + { + "epoch": 0.76, + "grad_norm": 1.3521511580673231, + "learning_rate": 1.4279564903340398e-06, + "loss": 0.3038, + "step": 26227 + }, + { + "epoch": 0.76, + "grad_norm": 1.2806882660470214, + "learning_rate": 1.4276278381878788e-06, + "loss": 0.2847, + "step": 26228 + }, + { + "epoch": 0.76, + "grad_norm": 1.5285048259431318, + "learning_rate": 1.4272992175679435e-06, + "loss": 0.2639, + "step": 26229 + }, + { + "epoch": 0.76, + "grad_norm": 1.229677075432822, + "learning_rate": 1.4269706284771307e-06, + "loss": 0.2637, + "step": 26230 + }, + { + "epoch": 0.76, + "grad_norm": 2.3833338818512977, + "learning_rate": 1.4266420709183414e-06, + "loss": 0.2568, + "step": 26231 + }, + { + "epoch": 0.76, + "grad_norm": 2.9691615271449945, + "learning_rate": 1.426313544894477e-06, + "loss": 0.2828, + "step": 26232 + }, + { + "epoch": 0.76, + "grad_norm": 1.6299001311969703, + "learning_rate": 1.4259850504084337e-06, + "loss": 0.2704, + "step": 26233 + }, + { + "epoch": 0.76, + "grad_norm": 1.3579769221117328, + "learning_rate": 1.4256565874631122e-06, + "loss": 0.3083, + "step": 26234 + }, + { + "epoch": 0.76, + "grad_norm": 1.2511706802796592, + "learning_rate": 1.4253281560614107e-06, + "loss": 0.2994, + "step": 26235 + }, + { + "epoch": 0.76, + "grad_norm": 1.3825204953028545, + "learning_rate": 1.4249997562062285e-06, + "loss": 0.272, + "step": 26236 + }, + { + "epoch": 0.76, + "grad_norm": 2.5906179861174157, + "learning_rate": 1.4246713879004625e-06, + "loss": 0.2651, + "step": 26237 + }, + { + "epoch": 0.76, + "grad_norm": 2.79876352311505, + "learning_rate": 1.4243430511470113e-06, + "loss": 0.2967, + "step": 26238 + }, + { + "epoch": 0.76, + "grad_norm": 1.4297308009615313, + "learning_rate": 1.4240147459487725e-06, + "loss": 0.2801, + "step": 26239 + }, + { + "epoch": 0.76, + "grad_norm": 1.569096014393592, + "learning_rate": 1.423686472308643e-06, + "loss": 0.3169, + "step": 26240 + }, + { + "epoch": 0.76, + "grad_norm": 1.4512566150028978, + "learning_rate": 1.4233582302295219e-06, + "loss": 0.2836, + "step": 26241 + }, + { + "epoch": 0.76, + "grad_norm": 1.2621600263773967, + "learning_rate": 1.4230300197143022e-06, + "loss": 0.2515, + "step": 26242 + }, + { + "epoch": 0.76, + "grad_norm": 1.3693997156506006, + "learning_rate": 1.4227018407658822e-06, + "loss": 0.2702, + "step": 26243 + }, + { + "epoch": 0.76, + "grad_norm": 1.4012659755982502, + "learning_rate": 1.4223736933871585e-06, + "loss": 0.3219, + "step": 26244 + }, + { + "epoch": 0.76, + "grad_norm": 1.2864044901876397, + "learning_rate": 1.4220455775810266e-06, + "loss": 0.2699, + "step": 26245 + }, + { + "epoch": 0.76, + "grad_norm": 1.296044619535767, + "learning_rate": 1.4217174933503819e-06, + "loss": 0.2662, + "step": 26246 + }, + { + "epoch": 0.76, + "grad_norm": 1.5429217088020362, + "learning_rate": 1.42138944069812e-06, + "loss": 0.2688, + "step": 26247 + }, + { + "epoch": 0.76, + "grad_norm": 1.4302132478479492, + "learning_rate": 1.4210614196271362e-06, + "loss": 0.2464, + "step": 26248 + }, + { + "epoch": 0.76, + "grad_norm": 1.2379996548440093, + "learning_rate": 1.4207334301403264e-06, + "loss": 0.2968, + "step": 26249 + }, + { + "epoch": 0.76, + "grad_norm": 1.3557880029198, + "learning_rate": 1.4204054722405825e-06, + "loss": 0.2808, + "step": 26250 + }, + { + "epoch": 0.76, + "grad_norm": 1.2319346734230363, + "learning_rate": 1.4200775459308002e-06, + "loss": 0.2862, + "step": 26251 + }, + { + "epoch": 0.76, + "grad_norm": 1.22992195846754, + "learning_rate": 1.419749651213873e-06, + "loss": 0.282, + "step": 26252 + }, + { + "epoch": 0.76, + "grad_norm": 1.2949722597546602, + "learning_rate": 1.4194217880926948e-06, + "loss": 0.2673, + "step": 26253 + }, + { + "epoch": 0.76, + "grad_norm": 1.294191178182164, + "learning_rate": 1.4190939565701588e-06, + "loss": 0.2503, + "step": 26254 + }, + { + "epoch": 0.76, + "grad_norm": 1.289842959501925, + "learning_rate": 1.418766156649159e-06, + "loss": 0.2637, + "step": 26255 + }, + { + "epoch": 0.76, + "grad_norm": 1.4523783161067825, + "learning_rate": 1.418438388332587e-06, + "loss": 0.2662, + "step": 26256 + }, + { + "epoch": 0.76, + "grad_norm": 1.3894469746902651, + "learning_rate": 1.4181106516233362e-06, + "loss": 0.2744, + "step": 26257 + }, + { + "epoch": 0.76, + "grad_norm": 1.240747344017165, + "learning_rate": 1.4177829465243005e-06, + "loss": 0.259, + "step": 26258 + }, + { + "epoch": 0.76, + "grad_norm": 1.3005961388844343, + "learning_rate": 1.4174552730383696e-06, + "loss": 0.2783, + "step": 26259 + }, + { + "epoch": 0.76, + "grad_norm": 1.4213346213601328, + "learning_rate": 1.417127631168434e-06, + "loss": 0.2866, + "step": 26260 + }, + { + "epoch": 0.76, + "grad_norm": 1.3079080554601301, + "learning_rate": 1.416800020917387e-06, + "loss": 0.2817, + "step": 26261 + }, + { + "epoch": 0.76, + "grad_norm": 1.2917845002066022, + "learning_rate": 1.4164724422881198e-06, + "loss": 0.2692, + "step": 26262 + }, + { + "epoch": 0.76, + "grad_norm": 1.4403934032105952, + "learning_rate": 1.4161448952835227e-06, + "loss": 0.2921, + "step": 26263 + }, + { + "epoch": 0.76, + "grad_norm": 1.3337878464837758, + "learning_rate": 1.4158173799064867e-06, + "loss": 0.2614, + "step": 26264 + }, + { + "epoch": 0.76, + "grad_norm": 1.785392114536465, + "learning_rate": 1.4154898961599023e-06, + "loss": 0.2813, + "step": 26265 + }, + { + "epoch": 0.76, + "grad_norm": 0.9475853367588454, + "learning_rate": 1.4151624440466594e-06, + "loss": 0.5586, + "step": 26266 + }, + { + "epoch": 0.76, + "grad_norm": 1.376546592150953, + "learning_rate": 1.4148350235696473e-06, + "loss": 0.2904, + "step": 26267 + }, + { + "epoch": 0.76, + "grad_norm": 1.5505765010039534, + "learning_rate": 1.4145076347317576e-06, + "loss": 0.2794, + "step": 26268 + }, + { + "epoch": 0.76, + "grad_norm": 0.99507721375457, + "learning_rate": 1.4141802775358765e-06, + "loss": 0.6269, + "step": 26269 + }, + { + "epoch": 0.76, + "grad_norm": 1.2594378157119892, + "learning_rate": 1.4138529519848937e-06, + "loss": 0.2755, + "step": 26270 + }, + { + "epoch": 0.76, + "grad_norm": 1.2415380569075396, + "learning_rate": 1.413525658081699e-06, + "loss": 0.2631, + "step": 26271 + }, + { + "epoch": 0.76, + "grad_norm": 1.2547753757088755, + "learning_rate": 1.4131983958291795e-06, + "loss": 0.264, + "step": 26272 + }, + { + "epoch": 0.76, + "grad_norm": 1.31406448745911, + "learning_rate": 1.4128711652302246e-06, + "loss": 0.2536, + "step": 26273 + }, + { + "epoch": 0.76, + "grad_norm": 1.5370625861376732, + "learning_rate": 1.412543966287721e-06, + "loss": 0.2714, + "step": 26274 + }, + { + "epoch": 0.76, + "grad_norm": 1.2577330886038283, + "learning_rate": 1.412216799004557e-06, + "loss": 0.2751, + "step": 26275 + }, + { + "epoch": 0.76, + "grad_norm": 1.587558050339597, + "learning_rate": 1.4118896633836194e-06, + "loss": 0.2718, + "step": 26276 + }, + { + "epoch": 0.76, + "grad_norm": 1.4197578549100862, + "learning_rate": 1.4115625594277972e-06, + "loss": 0.265, + "step": 26277 + }, + { + "epoch": 0.76, + "grad_norm": 1.414070318965246, + "learning_rate": 1.411235487139973e-06, + "loss": 0.3106, + "step": 26278 + }, + { + "epoch": 0.76, + "grad_norm": 1.2806304327499052, + "learning_rate": 1.4109084465230361e-06, + "loss": 0.2741, + "step": 26279 + }, + { + "epoch": 0.76, + "grad_norm": 1.2795532936014078, + "learning_rate": 1.4105814375798715e-06, + "loss": 0.2638, + "step": 26280 + }, + { + "epoch": 0.76, + "grad_norm": 1.5975651993738171, + "learning_rate": 1.4102544603133656e-06, + "loss": 0.2864, + "step": 26281 + }, + { + "epoch": 0.76, + "grad_norm": 1.2711865434294345, + "learning_rate": 1.4099275147264041e-06, + "loss": 0.2772, + "step": 26282 + }, + { + "epoch": 0.76, + "grad_norm": 1.4696990289769045, + "learning_rate": 1.4096006008218721e-06, + "loss": 0.2753, + "step": 26283 + }, + { + "epoch": 0.76, + "grad_norm": 1.5620847386313204, + "learning_rate": 1.4092737186026546e-06, + "loss": 0.2853, + "step": 26284 + }, + { + "epoch": 0.76, + "grad_norm": 1.3860123314651158, + "learning_rate": 1.4089468680716378e-06, + "loss": 0.2991, + "step": 26285 + }, + { + "epoch": 0.76, + "grad_norm": 1.4243917378120956, + "learning_rate": 1.408620049231703e-06, + "loss": 0.2837, + "step": 26286 + }, + { + "epoch": 0.76, + "grad_norm": 1.2914104340011139, + "learning_rate": 1.4082932620857382e-06, + "loss": 0.2714, + "step": 26287 + }, + { + "epoch": 0.76, + "grad_norm": 1.3377506770898477, + "learning_rate": 1.4079665066366234e-06, + "loss": 0.2803, + "step": 26288 + }, + { + "epoch": 0.76, + "grad_norm": 1.318345860361682, + "learning_rate": 1.4076397828872441e-06, + "loss": 0.2551, + "step": 26289 + }, + { + "epoch": 0.76, + "grad_norm": 1.3234962396348877, + "learning_rate": 1.4073130908404832e-06, + "loss": 0.2694, + "step": 26290 + }, + { + "epoch": 0.76, + "grad_norm": 1.5398499675645931, + "learning_rate": 1.4069864304992242e-06, + "loss": 0.2657, + "step": 26291 + }, + { + "epoch": 0.76, + "grad_norm": 1.4026098377319909, + "learning_rate": 1.4066598018663502e-06, + "loss": 0.274, + "step": 26292 + }, + { + "epoch": 0.76, + "grad_norm": 1.3672906278799692, + "learning_rate": 1.4063332049447432e-06, + "loss": 0.264, + "step": 26293 + }, + { + "epoch": 0.76, + "grad_norm": 1.333171502134948, + "learning_rate": 1.4060066397372852e-06, + "loss": 0.2782, + "step": 26294 + }, + { + "epoch": 0.76, + "grad_norm": 1.4827124057449232, + "learning_rate": 1.4056801062468605e-06, + "loss": 0.272, + "step": 26295 + }, + { + "epoch": 0.76, + "grad_norm": 1.4188348049874484, + "learning_rate": 1.4053536044763465e-06, + "loss": 0.2778, + "step": 26296 + }, + { + "epoch": 0.76, + "grad_norm": 1.536669734250229, + "learning_rate": 1.4050271344286275e-06, + "loss": 0.2908, + "step": 26297 + }, + { + "epoch": 0.76, + "grad_norm": 1.2978828069932749, + "learning_rate": 1.4047006961065835e-06, + "loss": 0.2486, + "step": 26298 + }, + { + "epoch": 0.76, + "grad_norm": 1.690726156741737, + "learning_rate": 1.4043742895130962e-06, + "loss": 0.2932, + "step": 26299 + }, + { + "epoch": 0.76, + "grad_norm": 1.3515098721524499, + "learning_rate": 1.4040479146510456e-06, + "loss": 0.2623, + "step": 26300 + }, + { + "epoch": 0.76, + "grad_norm": 1.5266990813759733, + "learning_rate": 1.4037215715233116e-06, + "loss": 0.2985, + "step": 26301 + }, + { + "epoch": 0.76, + "grad_norm": 1.50097568813292, + "learning_rate": 1.4033952601327749e-06, + "loss": 0.2813, + "step": 26302 + }, + { + "epoch": 0.76, + "grad_norm": 1.3319218314608472, + "learning_rate": 1.403068980482315e-06, + "loss": 0.2778, + "step": 26303 + }, + { + "epoch": 0.76, + "grad_norm": 1.3434152767360141, + "learning_rate": 1.4027427325748128e-06, + "loss": 0.2661, + "step": 26304 + }, + { + "epoch": 0.76, + "grad_norm": 1.375613700810727, + "learning_rate": 1.4024165164131442e-06, + "loss": 0.2875, + "step": 26305 + }, + { + "epoch": 0.76, + "grad_norm": 1.0240861124276186, + "learning_rate": 1.4020903320001895e-06, + "loss": 0.586, + "step": 26306 + }, + { + "epoch": 0.76, + "grad_norm": 1.5878868617805746, + "learning_rate": 1.401764179338828e-06, + "loss": 0.2877, + "step": 26307 + }, + { + "epoch": 0.76, + "grad_norm": 1.3020375329621536, + "learning_rate": 1.401438058431937e-06, + "loss": 0.2787, + "step": 26308 + }, + { + "epoch": 0.76, + "grad_norm": 1.2538205197585446, + "learning_rate": 1.4011119692823954e-06, + "loss": 0.2697, + "step": 26309 + }, + { + "epoch": 0.76, + "grad_norm": 1.374307642284583, + "learning_rate": 1.400785911893081e-06, + "loss": 0.2546, + "step": 26310 + }, + { + "epoch": 0.76, + "grad_norm": 1.253520216935876, + "learning_rate": 1.4004598862668706e-06, + "loss": 0.2702, + "step": 26311 + }, + { + "epoch": 0.76, + "grad_norm": 1.41881425398166, + "learning_rate": 1.4001338924066415e-06, + "loss": 0.2825, + "step": 26312 + }, + { + "epoch": 0.76, + "grad_norm": 1.4049626129846222, + "learning_rate": 1.3998079303152723e-06, + "loss": 0.271, + "step": 26313 + }, + { + "epoch": 0.76, + "grad_norm": 1.585122687469313, + "learning_rate": 1.3994819999956378e-06, + "loss": 0.2791, + "step": 26314 + }, + { + "epoch": 0.76, + "grad_norm": 1.3217306994531501, + "learning_rate": 1.3991561014506133e-06, + "loss": 0.2831, + "step": 26315 + }, + { + "epoch": 0.76, + "grad_norm": 1.4408635426744558, + "learning_rate": 1.3988302346830762e-06, + "loss": 0.2584, + "step": 26316 + }, + { + "epoch": 0.76, + "grad_norm": 1.6353490780548414, + "learning_rate": 1.398504399695902e-06, + "loss": 0.2699, + "step": 26317 + }, + { + "epoch": 0.76, + "grad_norm": 1.2661438172697441, + "learning_rate": 1.398178596491967e-06, + "loss": 0.2733, + "step": 26318 + }, + { + "epoch": 0.76, + "grad_norm": 1.249769784274911, + "learning_rate": 1.3978528250741453e-06, + "loss": 0.2773, + "step": 26319 + }, + { + "epoch": 0.76, + "grad_norm": 0.9838633990658447, + "learning_rate": 1.3975270854453122e-06, + "loss": 0.6065, + "step": 26320 + }, + { + "epoch": 0.76, + "grad_norm": 1.543918466535125, + "learning_rate": 1.3972013776083433e-06, + "loss": 0.2789, + "step": 26321 + }, + { + "epoch": 0.76, + "grad_norm": 1.39298431414938, + "learning_rate": 1.3968757015661117e-06, + "loss": 0.3066, + "step": 26322 + }, + { + "epoch": 0.76, + "grad_norm": 1.3630170567726172, + "learning_rate": 1.3965500573214936e-06, + "loss": 0.2691, + "step": 26323 + }, + { + "epoch": 0.76, + "grad_norm": 1.400220516747061, + "learning_rate": 1.3962244448773593e-06, + "loss": 0.3196, + "step": 26324 + }, + { + "epoch": 0.76, + "grad_norm": 4.94897167546556, + "learning_rate": 1.3958988642365845e-06, + "loss": 0.2759, + "step": 26325 + }, + { + "epoch": 0.76, + "grad_norm": 1.5456240978759435, + "learning_rate": 1.3955733154020424e-06, + "loss": 0.2766, + "step": 26326 + }, + { + "epoch": 0.76, + "grad_norm": 1.3224391811679626, + "learning_rate": 1.395247798376606e-06, + "loss": 0.293, + "step": 26327 + }, + { + "epoch": 0.76, + "grad_norm": 1.6436891556180975, + "learning_rate": 1.3949223131631474e-06, + "loss": 0.2944, + "step": 26328 + }, + { + "epoch": 0.76, + "grad_norm": 1.3048288203210283, + "learning_rate": 1.3945968597645392e-06, + "loss": 0.2655, + "step": 26329 + }, + { + "epoch": 0.76, + "grad_norm": 1.3738118071611596, + "learning_rate": 1.3942714381836542e-06, + "loss": 0.2671, + "step": 26330 + }, + { + "epoch": 0.76, + "grad_norm": 1.4008371707437013, + "learning_rate": 1.3939460484233651e-06, + "loss": 0.2673, + "step": 26331 + }, + { + "epoch": 0.76, + "grad_norm": 1.3477713934446496, + "learning_rate": 1.3936206904865408e-06, + "loss": 0.2881, + "step": 26332 + }, + { + "epoch": 0.76, + "grad_norm": 1.518836948934633, + "learning_rate": 1.3932953643760534e-06, + "loss": 0.2995, + "step": 26333 + }, + { + "epoch": 0.76, + "grad_norm": 1.3942119897265903, + "learning_rate": 1.3929700700947746e-06, + "loss": 0.277, + "step": 26334 + }, + { + "epoch": 0.76, + "grad_norm": 1.3421388278681534, + "learning_rate": 1.392644807645575e-06, + "loss": 0.2535, + "step": 26335 + }, + { + "epoch": 0.76, + "grad_norm": 1.3021474894550404, + "learning_rate": 1.3923195770313251e-06, + "loss": 0.268, + "step": 26336 + }, + { + "epoch": 0.76, + "grad_norm": 1.2770099257103638, + "learning_rate": 1.391994378254895e-06, + "loss": 0.2771, + "step": 26337 + }, + { + "epoch": 0.76, + "grad_norm": 1.2361961115671034, + "learning_rate": 1.3916692113191544e-06, + "loss": 0.2634, + "step": 26338 + }, + { + "epoch": 0.76, + "grad_norm": 1.246465562006764, + "learning_rate": 1.391344076226973e-06, + "loss": 0.2879, + "step": 26339 + }, + { + "epoch": 0.76, + "grad_norm": 1.4071447723814525, + "learning_rate": 1.3910189729812217e-06, + "loss": 0.286, + "step": 26340 + }, + { + "epoch": 0.76, + "grad_norm": 1.2602809814358842, + "learning_rate": 1.3906939015847664e-06, + "loss": 0.2901, + "step": 26341 + }, + { + "epoch": 0.76, + "grad_norm": 1.2832494554430258, + "learning_rate": 1.3903688620404793e-06, + "loss": 0.2942, + "step": 26342 + }, + { + "epoch": 0.76, + "grad_norm": 1.5412328254575611, + "learning_rate": 1.390043854351225e-06, + "loss": 0.3093, + "step": 26343 + }, + { + "epoch": 0.76, + "grad_norm": 1.4943745617416382, + "learning_rate": 1.3897188785198746e-06, + "loss": 0.2748, + "step": 26344 + }, + { + "epoch": 0.76, + "grad_norm": 1.3295641254346984, + "learning_rate": 1.389393934549294e-06, + "loss": 0.3084, + "step": 26345 + }, + { + "epoch": 0.76, + "grad_norm": 1.5369645956375875, + "learning_rate": 1.389069022442353e-06, + "loss": 0.2708, + "step": 26346 + }, + { + "epoch": 0.76, + "grad_norm": 1.4600324665214786, + "learning_rate": 1.3887441422019172e-06, + "loss": 0.2822, + "step": 26347 + }, + { + "epoch": 0.76, + "grad_norm": 2.2520874890239244, + "learning_rate": 1.388419293830855e-06, + "loss": 0.3028, + "step": 26348 + }, + { + "epoch": 0.76, + "grad_norm": 1.3052511328316718, + "learning_rate": 1.3880944773320321e-06, + "loss": 0.2705, + "step": 26349 + }, + { + "epoch": 0.76, + "grad_norm": 1.2891592802270442, + "learning_rate": 1.3877696927083174e-06, + "loss": 0.2754, + "step": 26350 + }, + { + "epoch": 0.76, + "grad_norm": 1.3796939397204881, + "learning_rate": 1.3874449399625734e-06, + "loss": 0.2619, + "step": 26351 + }, + { + "epoch": 0.76, + "grad_norm": 1.6485179862853485, + "learning_rate": 1.387120219097668e-06, + "loss": 0.2666, + "step": 26352 + }, + { + "epoch": 0.76, + "grad_norm": 1.6855071689139733, + "learning_rate": 1.3867955301164671e-06, + "loss": 0.2635, + "step": 26353 + }, + { + "epoch": 0.76, + "grad_norm": 1.3103076014799844, + "learning_rate": 1.3864708730218351e-06, + "loss": 0.271, + "step": 26354 + }, + { + "epoch": 0.76, + "grad_norm": 1.2876071763474617, + "learning_rate": 1.3861462478166381e-06, + "loss": 0.2918, + "step": 26355 + }, + { + "epoch": 0.76, + "grad_norm": 1.3971704377270597, + "learning_rate": 1.3858216545037406e-06, + "loss": 0.2669, + "step": 26356 + }, + { + "epoch": 0.76, + "grad_norm": 1.2946171045401018, + "learning_rate": 1.3854970930860067e-06, + "loss": 0.2753, + "step": 26357 + }, + { + "epoch": 0.76, + "grad_norm": 2.035095129357793, + "learning_rate": 1.3851725635663017e-06, + "loss": 0.295, + "step": 26358 + }, + { + "epoch": 0.76, + "grad_norm": 1.2270259871995404, + "learning_rate": 1.38484806594749e-06, + "loss": 0.2721, + "step": 26359 + }, + { + "epoch": 0.76, + "grad_norm": 1.2945925535244749, + "learning_rate": 1.3845236002324325e-06, + "loss": 0.2775, + "step": 26360 + }, + { + "epoch": 0.76, + "grad_norm": 1.2848293060376652, + "learning_rate": 1.3841991664239946e-06, + "loss": 0.2842, + "step": 26361 + }, + { + "epoch": 0.76, + "grad_norm": 1.4377231844239042, + "learning_rate": 1.3838747645250394e-06, + "loss": 0.261, + "step": 26362 + }, + { + "epoch": 0.76, + "grad_norm": 1.2856468617837042, + "learning_rate": 1.3835503945384292e-06, + "loss": 0.2603, + "step": 26363 + }, + { + "epoch": 0.76, + "grad_norm": 1.3481964050722588, + "learning_rate": 1.3832260564670269e-06, + "loss": 0.2864, + "step": 26364 + }, + { + "epoch": 0.76, + "grad_norm": 1.5851875795636, + "learning_rate": 1.382901750313695e-06, + "loss": 0.2496, + "step": 26365 + }, + { + "epoch": 0.76, + "grad_norm": 1.391836117690184, + "learning_rate": 1.382577476081295e-06, + "loss": 0.2708, + "step": 26366 + }, + { + "epoch": 0.76, + "grad_norm": 1.6451879784771237, + "learning_rate": 1.3822532337726907e-06, + "loss": 0.2766, + "step": 26367 + }, + { + "epoch": 0.76, + "grad_norm": 1.3112212811132045, + "learning_rate": 1.38192902339074e-06, + "loss": 0.2716, + "step": 26368 + }, + { + "epoch": 0.76, + "grad_norm": 1.4040938810214934, + "learning_rate": 1.3816048449383063e-06, + "loss": 0.2568, + "step": 26369 + }, + { + "epoch": 0.76, + "grad_norm": 1.3744355959164896, + "learning_rate": 1.3812806984182508e-06, + "loss": 0.2779, + "step": 26370 + }, + { + "epoch": 0.76, + "grad_norm": 1.3610026050376787, + "learning_rate": 1.380956583833432e-06, + "loss": 0.2795, + "step": 26371 + }, + { + "epoch": 0.76, + "grad_norm": 1.3883956668364474, + "learning_rate": 1.3806325011867117e-06, + "loss": 0.293, + "step": 26372 + }, + { + "epoch": 0.76, + "grad_norm": 1.3674771750683292, + "learning_rate": 1.3803084504809494e-06, + "loss": 0.2777, + "step": 26373 + }, + { + "epoch": 0.76, + "grad_norm": 1.4766773109114728, + "learning_rate": 1.3799844317190053e-06, + "loss": 0.2943, + "step": 26374 + }, + { + "epoch": 0.77, + "grad_norm": 1.2965943954269767, + "learning_rate": 1.3796604449037388e-06, + "loss": 0.2797, + "step": 26375 + }, + { + "epoch": 0.77, + "grad_norm": 1.350223542770366, + "learning_rate": 1.379336490038009e-06, + "loss": 0.263, + "step": 26376 + }, + { + "epoch": 0.77, + "grad_norm": 1.460924170754129, + "learning_rate": 1.3790125671246752e-06, + "loss": 0.2841, + "step": 26377 + }, + { + "epoch": 0.77, + "grad_norm": 0.9753148480618715, + "learning_rate": 1.3786886761665968e-06, + "loss": 0.5472, + "step": 26378 + }, + { + "epoch": 0.77, + "grad_norm": 1.3739900235466316, + "learning_rate": 1.3783648171666298e-06, + "loss": 0.2801, + "step": 26379 + }, + { + "epoch": 0.77, + "grad_norm": 1.021532670317009, + "learning_rate": 1.378040990127633e-06, + "loss": 0.607, + "step": 26380 + }, + { + "epoch": 0.77, + "grad_norm": 1.3009558345101433, + "learning_rate": 1.3777171950524648e-06, + "loss": 0.2707, + "step": 26381 + }, + { + "epoch": 0.77, + "grad_norm": 1.4057727417606114, + "learning_rate": 1.3773934319439824e-06, + "loss": 0.2634, + "step": 26382 + }, + { + "epoch": 0.77, + "grad_norm": 1.3533075185188423, + "learning_rate": 1.3770697008050437e-06, + "loss": 0.282, + "step": 26383 + }, + { + "epoch": 0.77, + "grad_norm": 1.6294459043798186, + "learning_rate": 1.3767460016385049e-06, + "loss": 0.2873, + "step": 26384 + }, + { + "epoch": 0.77, + "grad_norm": 1.5366723996093006, + "learning_rate": 1.3764223344472222e-06, + "loss": 0.2739, + "step": 26385 + }, + { + "epoch": 0.77, + "grad_norm": 1.3642091379755226, + "learning_rate": 1.3760986992340548e-06, + "loss": 0.2699, + "step": 26386 + }, + { + "epoch": 0.77, + "grad_norm": 1.268972159755769, + "learning_rate": 1.3757750960018551e-06, + "loss": 0.2858, + "step": 26387 + }, + { + "epoch": 0.77, + "grad_norm": 1.3100827879166776, + "learning_rate": 1.37545152475348e-06, + "loss": 0.2834, + "step": 26388 + }, + { + "epoch": 0.77, + "grad_norm": 1.398449915880201, + "learning_rate": 1.3751279854917854e-06, + "loss": 0.2987, + "step": 26389 + }, + { + "epoch": 0.77, + "grad_norm": 1.7477411730647778, + "learning_rate": 1.3748044782196268e-06, + "loss": 0.2954, + "step": 26390 + }, + { + "epoch": 0.77, + "grad_norm": 1.6140492829316593, + "learning_rate": 1.3744810029398587e-06, + "loss": 0.2527, + "step": 26391 + }, + { + "epoch": 0.77, + "grad_norm": 1.3068833269198505, + "learning_rate": 1.3741575596553358e-06, + "loss": 0.2591, + "step": 26392 + }, + { + "epoch": 0.77, + "grad_norm": 1.6891364492134706, + "learning_rate": 1.3738341483689132e-06, + "loss": 0.2494, + "step": 26393 + }, + { + "epoch": 0.77, + "grad_norm": 1.3136292227194415, + "learning_rate": 1.3735107690834442e-06, + "loss": 0.2785, + "step": 26394 + }, + { + "epoch": 0.77, + "grad_norm": 1.4168592540177605, + "learning_rate": 1.3731874218017843e-06, + "loss": 0.281, + "step": 26395 + }, + { + "epoch": 0.77, + "grad_norm": 1.2925764050342643, + "learning_rate": 1.372864106526784e-06, + "loss": 0.272, + "step": 26396 + }, + { + "epoch": 0.77, + "grad_norm": 1.5414680647985963, + "learning_rate": 1.3725408232612986e-06, + "loss": 0.2816, + "step": 26397 + }, + { + "epoch": 0.77, + "grad_norm": 1.5100540363518473, + "learning_rate": 1.372217572008182e-06, + "loss": 0.2591, + "step": 26398 + }, + { + "epoch": 0.77, + "grad_norm": 1.3912970194177718, + "learning_rate": 1.371894352770284e-06, + "loss": 0.2719, + "step": 26399 + }, + { + "epoch": 0.77, + "grad_norm": 1.3606928402232878, + "learning_rate": 1.3715711655504588e-06, + "loss": 0.2853, + "step": 26400 + }, + { + "epoch": 0.77, + "grad_norm": 1.2925923058303765, + "learning_rate": 1.371248010351558e-06, + "loss": 0.2856, + "step": 26401 + }, + { + "epoch": 0.77, + "grad_norm": 1.632536476382811, + "learning_rate": 1.3709248871764336e-06, + "loss": 0.2763, + "step": 26402 + }, + { + "epoch": 0.77, + "grad_norm": 1.5731334291621495, + "learning_rate": 1.3706017960279377e-06, + "loss": 0.2924, + "step": 26403 + }, + { + "epoch": 0.77, + "grad_norm": 1.2992746731249583, + "learning_rate": 1.370278736908921e-06, + "loss": 0.2774, + "step": 26404 + }, + { + "epoch": 0.77, + "grad_norm": 1.4957782864978177, + "learning_rate": 1.3699557098222367e-06, + "loss": 0.2683, + "step": 26405 + }, + { + "epoch": 0.77, + "grad_norm": 1.347102884358651, + "learning_rate": 1.3696327147707317e-06, + "loss": 0.2852, + "step": 26406 + }, + { + "epoch": 0.77, + "grad_norm": 1.3464701355612076, + "learning_rate": 1.369309751757258e-06, + "loss": 0.2796, + "step": 26407 + }, + { + "epoch": 0.77, + "grad_norm": 2.0314063302157552, + "learning_rate": 1.3689868207846663e-06, + "loss": 0.2785, + "step": 26408 + }, + { + "epoch": 0.77, + "grad_norm": 0.9838532757151375, + "learning_rate": 1.3686639218558063e-06, + "loss": 0.5579, + "step": 26409 + }, + { + "epoch": 0.77, + "grad_norm": 1.864641876742834, + "learning_rate": 1.368341054973527e-06, + "loss": 0.2638, + "step": 26410 + }, + { + "epoch": 0.77, + "grad_norm": 1.286185031200822, + "learning_rate": 1.368018220140679e-06, + "loss": 0.2551, + "step": 26411 + }, + { + "epoch": 0.77, + "grad_norm": 1.2085529948361018, + "learning_rate": 1.3676954173601098e-06, + "loss": 0.25, + "step": 26412 + }, + { + "epoch": 0.77, + "grad_norm": 1.6168323403919302, + "learning_rate": 1.3673726466346688e-06, + "loss": 0.3397, + "step": 26413 + }, + { + "epoch": 0.77, + "grad_norm": 1.6980875902860042, + "learning_rate": 1.3670499079672067e-06, + "loss": 0.292, + "step": 26414 + }, + { + "epoch": 0.77, + "grad_norm": 1.3971128631575946, + "learning_rate": 1.366727201360567e-06, + "loss": 0.3022, + "step": 26415 + }, + { + "epoch": 0.77, + "grad_norm": 1.659274537258279, + "learning_rate": 1.3664045268176007e-06, + "loss": 0.3008, + "step": 26416 + }, + { + "epoch": 0.77, + "grad_norm": 1.4078614335007558, + "learning_rate": 1.3660818843411549e-06, + "loss": 0.2767, + "step": 26417 + }, + { + "epoch": 0.77, + "grad_norm": 1.2655777271267181, + "learning_rate": 1.3657592739340765e-06, + "loss": 0.3046, + "step": 26418 + }, + { + "epoch": 0.77, + "grad_norm": 1.2493842180729684, + "learning_rate": 1.3654366955992126e-06, + "loss": 0.2805, + "step": 26419 + }, + { + "epoch": 0.77, + "grad_norm": 1.2190348978470984, + "learning_rate": 1.3651141493394104e-06, + "loss": 0.2727, + "step": 26420 + }, + { + "epoch": 0.77, + "grad_norm": 1.4790505561952962, + "learning_rate": 1.3647916351575162e-06, + "loss": 0.2982, + "step": 26421 + }, + { + "epoch": 0.77, + "grad_norm": 1.304032932063998, + "learning_rate": 1.3644691530563775e-06, + "loss": 0.2546, + "step": 26422 + }, + { + "epoch": 0.77, + "grad_norm": 1.3158812860313382, + "learning_rate": 1.3641467030388378e-06, + "loss": 0.2591, + "step": 26423 + }, + { + "epoch": 0.77, + "grad_norm": 0.9812878974696618, + "learning_rate": 1.3638242851077433e-06, + "loss": 0.5304, + "step": 26424 + }, + { + "epoch": 0.77, + "grad_norm": 1.4109371798461283, + "learning_rate": 1.36350189926594e-06, + "loss": 0.283, + "step": 26425 + }, + { + "epoch": 0.77, + "grad_norm": 1.7527535501306242, + "learning_rate": 1.363179545516274e-06, + "loss": 0.26, + "step": 26426 + }, + { + "epoch": 0.77, + "grad_norm": 1.407740215411352, + "learning_rate": 1.3628572238615878e-06, + "loss": 0.2724, + "step": 26427 + }, + { + "epoch": 0.77, + "grad_norm": 1.4456085354557182, + "learning_rate": 1.3625349343047266e-06, + "loss": 0.2808, + "step": 26428 + }, + { + "epoch": 0.77, + "grad_norm": 1.531046859021793, + "learning_rate": 1.362212676848535e-06, + "loss": 0.2742, + "step": 26429 + }, + { + "epoch": 0.77, + "grad_norm": 1.264861989120207, + "learning_rate": 1.361890451495857e-06, + "loss": 0.2648, + "step": 26430 + }, + { + "epoch": 0.77, + "grad_norm": 1.7342152295338549, + "learning_rate": 1.361568258249536e-06, + "loss": 0.2672, + "step": 26431 + }, + { + "epoch": 0.77, + "grad_norm": 1.238258082604947, + "learning_rate": 1.361246097112416e-06, + "loss": 0.2701, + "step": 26432 + }, + { + "epoch": 0.77, + "grad_norm": 1.2434933653425169, + "learning_rate": 1.3609239680873404e-06, + "loss": 0.2688, + "step": 26433 + }, + { + "epoch": 0.77, + "grad_norm": 1.2116339100169684, + "learning_rate": 1.3606018711771502e-06, + "loss": 0.2881, + "step": 26434 + }, + { + "epoch": 0.77, + "grad_norm": 1.555887194190599, + "learning_rate": 1.3602798063846884e-06, + "loss": 0.2818, + "step": 26435 + }, + { + "epoch": 0.77, + "grad_norm": 1.472401816523713, + "learning_rate": 1.3599577737127978e-06, + "loss": 0.2663, + "step": 26436 + }, + { + "epoch": 0.77, + "grad_norm": 1.5962210504251224, + "learning_rate": 1.35963577316432e-06, + "loss": 0.2831, + "step": 26437 + }, + { + "epoch": 0.77, + "grad_norm": 4.224167950499146, + "learning_rate": 1.3593138047420974e-06, + "loss": 0.2806, + "step": 26438 + }, + { + "epoch": 0.77, + "grad_norm": 1.356819323064037, + "learning_rate": 1.3589918684489706e-06, + "loss": 0.2717, + "step": 26439 + }, + { + "epoch": 0.77, + "grad_norm": 1.2624140977249465, + "learning_rate": 1.358669964287781e-06, + "loss": 0.256, + "step": 26440 + }, + { + "epoch": 0.77, + "grad_norm": 1.5313141997533957, + "learning_rate": 1.3583480922613706e-06, + "loss": 0.2839, + "step": 26441 + }, + { + "epoch": 0.77, + "grad_norm": 1.324743689818524, + "learning_rate": 1.3580262523725773e-06, + "loss": 0.2716, + "step": 26442 + }, + { + "epoch": 0.77, + "grad_norm": 1.2456999190444187, + "learning_rate": 1.3577044446242431e-06, + "loss": 0.2789, + "step": 26443 + }, + { + "epoch": 0.77, + "grad_norm": 1.5462084699775795, + "learning_rate": 1.3573826690192071e-06, + "loss": 0.2776, + "step": 26444 + }, + { + "epoch": 0.77, + "grad_norm": 1.3020647289014942, + "learning_rate": 1.35706092556031e-06, + "loss": 0.2801, + "step": 26445 + }, + { + "epoch": 0.77, + "grad_norm": 1.2771801494435118, + "learning_rate": 1.3567392142503904e-06, + "loss": 0.2828, + "step": 26446 + }, + { + "epoch": 0.77, + "grad_norm": 1.0467541448171316, + "learning_rate": 1.3564175350922877e-06, + "loss": 0.5558, + "step": 26447 + }, + { + "epoch": 0.77, + "grad_norm": 1.3333946763622468, + "learning_rate": 1.3560958880888403e-06, + "loss": 0.3028, + "step": 26448 + }, + { + "epoch": 0.77, + "grad_norm": 1.3283243993845255, + "learning_rate": 1.3557742732428875e-06, + "loss": 0.2729, + "step": 26449 + }, + { + "epoch": 0.77, + "grad_norm": 1.3441001635439602, + "learning_rate": 1.3554526905572684e-06, + "loss": 0.2716, + "step": 26450 + }, + { + "epoch": 0.77, + "grad_norm": 1.4296076989149562, + "learning_rate": 1.3551311400348182e-06, + "loss": 0.2563, + "step": 26451 + }, + { + "epoch": 0.77, + "grad_norm": 1.3259804480240127, + "learning_rate": 1.3548096216783762e-06, + "loss": 0.2886, + "step": 26452 + }, + { + "epoch": 0.77, + "grad_norm": 1.2958759755615357, + "learning_rate": 1.3544881354907802e-06, + "loss": 0.2842, + "step": 26453 + }, + { + "epoch": 0.77, + "grad_norm": 1.3580673357443187, + "learning_rate": 1.3541666814748677e-06, + "loss": 0.3059, + "step": 26454 + }, + { + "epoch": 0.77, + "grad_norm": 1.3026003939514952, + "learning_rate": 1.3538452596334733e-06, + "loss": 0.2652, + "step": 26455 + }, + { + "epoch": 0.77, + "grad_norm": 1.4254346313117432, + "learning_rate": 1.3535238699694353e-06, + "loss": 0.2767, + "step": 26456 + }, + { + "epoch": 0.77, + "grad_norm": 1.4426536367569747, + "learning_rate": 1.3532025124855892e-06, + "loss": 0.2988, + "step": 26457 + }, + { + "epoch": 0.77, + "grad_norm": 1.2514236109407062, + "learning_rate": 1.3528811871847718e-06, + "loss": 0.2554, + "step": 26458 + }, + { + "epoch": 0.77, + "grad_norm": 1.5565380521979706, + "learning_rate": 1.3525598940698181e-06, + "loss": 0.2875, + "step": 26459 + }, + { + "epoch": 0.77, + "grad_norm": 1.433291068969451, + "learning_rate": 1.3522386331435655e-06, + "loss": 0.2763, + "step": 26460 + }, + { + "epoch": 0.77, + "grad_norm": 1.5168116153242506, + "learning_rate": 1.3519174044088456e-06, + "loss": 0.2952, + "step": 26461 + }, + { + "epoch": 0.77, + "grad_norm": 1.2148269192812513, + "learning_rate": 1.3515962078684953e-06, + "loss": 0.2672, + "step": 26462 + }, + { + "epoch": 0.77, + "grad_norm": 1.3971023852158677, + "learning_rate": 1.3512750435253486e-06, + "loss": 0.2931, + "step": 26463 + }, + { + "epoch": 0.77, + "grad_norm": 1.302969733720944, + "learning_rate": 1.3509539113822406e-06, + "loss": 0.3059, + "step": 26464 + }, + { + "epoch": 0.77, + "grad_norm": 1.461723405491813, + "learning_rate": 1.3506328114420048e-06, + "loss": 0.2685, + "step": 26465 + }, + { + "epoch": 0.77, + "grad_norm": 1.4403338524815705, + "learning_rate": 1.3503117437074743e-06, + "loss": 0.2724, + "step": 26466 + }, + { + "epoch": 0.77, + "grad_norm": 1.3875796675343834, + "learning_rate": 1.349990708181484e-06, + "loss": 0.2706, + "step": 26467 + }, + { + "epoch": 0.77, + "grad_norm": 1.5383359816550892, + "learning_rate": 1.3496697048668656e-06, + "loss": 0.2855, + "step": 26468 + }, + { + "epoch": 0.77, + "grad_norm": 1.3642157261769103, + "learning_rate": 1.349348733766454e-06, + "loss": 0.2626, + "step": 26469 + }, + { + "epoch": 0.77, + "grad_norm": 1.3017760588827305, + "learning_rate": 1.3490277948830794e-06, + "loss": 0.2812, + "step": 26470 + }, + { + "epoch": 0.77, + "grad_norm": 1.3524945929513963, + "learning_rate": 1.3487068882195741e-06, + "loss": 0.2584, + "step": 26471 + }, + { + "epoch": 0.77, + "grad_norm": 0.9135039454151206, + "learning_rate": 1.348386013778772e-06, + "loss": 0.5956, + "step": 26472 + }, + { + "epoch": 0.77, + "grad_norm": 1.4053372566053275, + "learning_rate": 1.3480651715635035e-06, + "loss": 0.3032, + "step": 26473 + }, + { + "epoch": 0.77, + "grad_norm": 0.9541825086364089, + "learning_rate": 1.3477443615766e-06, + "loss": 0.57, + "step": 26474 + }, + { + "epoch": 0.77, + "grad_norm": 1.4185836897159942, + "learning_rate": 1.3474235838208932e-06, + "loss": 0.2758, + "step": 26475 + }, + { + "epoch": 0.77, + "grad_norm": 1.6300089568318021, + "learning_rate": 1.347102838299214e-06, + "loss": 0.2564, + "step": 26476 + }, + { + "epoch": 0.77, + "grad_norm": 1.2563398578606735, + "learning_rate": 1.3467821250143943e-06, + "loss": 0.2575, + "step": 26477 + }, + { + "epoch": 0.77, + "grad_norm": 1.3047566132113246, + "learning_rate": 1.3464614439692614e-06, + "loss": 0.2764, + "step": 26478 + }, + { + "epoch": 0.77, + "grad_norm": 1.2286321163142315, + "learning_rate": 1.346140795166646e-06, + "loss": 0.2623, + "step": 26479 + }, + { + "epoch": 0.77, + "grad_norm": 1.226877147836293, + "learning_rate": 1.3458201786093795e-06, + "loss": 0.2541, + "step": 26480 + }, + { + "epoch": 0.77, + "grad_norm": 1.6211440410776214, + "learning_rate": 1.3454995943002902e-06, + "loss": 0.3027, + "step": 26481 + }, + { + "epoch": 0.77, + "grad_norm": 1.2979938976004954, + "learning_rate": 1.345179042242209e-06, + "loss": 0.2619, + "step": 26482 + }, + { + "epoch": 0.77, + "grad_norm": 1.3182063850791093, + "learning_rate": 1.3448585224379617e-06, + "loss": 0.2858, + "step": 26483 + }, + { + "epoch": 0.77, + "grad_norm": 1.317070687997836, + "learning_rate": 1.3445380348903791e-06, + "loss": 0.2687, + "step": 26484 + }, + { + "epoch": 0.77, + "grad_norm": 1.2903254615718336, + "learning_rate": 1.3442175796022882e-06, + "loss": 0.2646, + "step": 26485 + }, + { + "epoch": 0.77, + "grad_norm": 1.3398038581019927, + "learning_rate": 1.343897156576518e-06, + "loss": 0.2757, + "step": 26486 + }, + { + "epoch": 0.77, + "grad_norm": 1.4874637976187526, + "learning_rate": 1.343576765815896e-06, + "loss": 0.2981, + "step": 26487 + }, + { + "epoch": 0.77, + "grad_norm": 1.3423091178123445, + "learning_rate": 1.3432564073232507e-06, + "loss": 0.2611, + "step": 26488 + }, + { + "epoch": 0.77, + "grad_norm": 1.4497476910890474, + "learning_rate": 1.342936081101407e-06, + "loss": 0.2764, + "step": 26489 + }, + { + "epoch": 0.77, + "grad_norm": 1.2628627103103758, + "learning_rate": 1.3426157871531932e-06, + "loss": 0.2775, + "step": 26490 + }, + { + "epoch": 0.77, + "grad_norm": 1.2950266421464118, + "learning_rate": 1.3422955254814351e-06, + "loss": 0.2618, + "step": 26491 + }, + { + "epoch": 0.77, + "grad_norm": 1.2353257592887463, + "learning_rate": 1.3419752960889598e-06, + "loss": 0.2636, + "step": 26492 + }, + { + "epoch": 0.77, + "grad_norm": 1.5625686705934982, + "learning_rate": 1.3416550989785932e-06, + "loss": 0.2917, + "step": 26493 + }, + { + "epoch": 0.77, + "grad_norm": 1.5935685908346975, + "learning_rate": 1.3413349341531606e-06, + "loss": 0.2901, + "step": 26494 + }, + { + "epoch": 0.77, + "grad_norm": 1.835597502453448, + "learning_rate": 1.3410148016154884e-06, + "loss": 0.2827, + "step": 26495 + }, + { + "epoch": 0.77, + "grad_norm": 1.2871711855801675, + "learning_rate": 1.3406947013684018e-06, + "loss": 0.2626, + "step": 26496 + }, + { + "epoch": 0.77, + "grad_norm": 1.2747942029155443, + "learning_rate": 1.3403746334147244e-06, + "loss": 0.2836, + "step": 26497 + }, + { + "epoch": 0.77, + "grad_norm": 1.312792780350706, + "learning_rate": 1.340054597757281e-06, + "loss": 0.2757, + "step": 26498 + }, + { + "epoch": 0.77, + "grad_norm": 2.0680177307477297, + "learning_rate": 1.3397345943988966e-06, + "loss": 0.2876, + "step": 26499 + }, + { + "epoch": 0.77, + "grad_norm": 1.280055951204069, + "learning_rate": 1.339414623342395e-06, + "loss": 0.2855, + "step": 26500 + }, + { + "epoch": 0.77, + "grad_norm": 1.7002838981517605, + "learning_rate": 1.3390946845906e-06, + "loss": 0.2914, + "step": 26501 + }, + { + "epoch": 0.77, + "grad_norm": 1.247078819156779, + "learning_rate": 1.3387747781463345e-06, + "loss": 0.2659, + "step": 26502 + }, + { + "epoch": 0.77, + "grad_norm": 1.415096856037324, + "learning_rate": 1.3384549040124229e-06, + "loss": 0.2708, + "step": 26503 + }, + { + "epoch": 0.77, + "grad_norm": 1.3356702794040127, + "learning_rate": 1.3381350621916866e-06, + "loss": 0.2716, + "step": 26504 + }, + { + "epoch": 0.77, + "grad_norm": 1.611494225849105, + "learning_rate": 1.3378152526869515e-06, + "loss": 0.2653, + "step": 26505 + }, + { + "epoch": 0.77, + "grad_norm": 1.5792343133147555, + "learning_rate": 1.337495475501035e-06, + "loss": 0.3064, + "step": 26506 + }, + { + "epoch": 0.77, + "grad_norm": 1.3385452658498471, + "learning_rate": 1.3371757306367622e-06, + "loss": 0.2833, + "step": 26507 + }, + { + "epoch": 0.77, + "grad_norm": 1.1769714193044782, + "learning_rate": 1.3368560180969537e-06, + "loss": 0.2703, + "step": 26508 + }, + { + "epoch": 0.77, + "grad_norm": 1.3242308598941617, + "learning_rate": 1.336536337884432e-06, + "loss": 0.2677, + "step": 26509 + }, + { + "epoch": 0.77, + "grad_norm": 1.7329092219951827, + "learning_rate": 1.3362166900020191e-06, + "loss": 0.2819, + "step": 26510 + }, + { + "epoch": 0.77, + "grad_norm": 1.238090435699272, + "learning_rate": 1.3358970744525328e-06, + "loss": 0.2699, + "step": 26511 + }, + { + "epoch": 0.77, + "grad_norm": 0.9978939313931201, + "learning_rate": 1.3355774912387953e-06, + "loss": 0.5389, + "step": 26512 + }, + { + "epoch": 0.77, + "grad_norm": 1.3887350828907323, + "learning_rate": 1.335257940363628e-06, + "loss": 0.2618, + "step": 26513 + }, + { + "epoch": 0.77, + "grad_norm": 1.482206452649887, + "learning_rate": 1.3349384218298488e-06, + "loss": 0.2814, + "step": 26514 + }, + { + "epoch": 0.77, + "grad_norm": 1.440890383181791, + "learning_rate": 1.3346189356402812e-06, + "loss": 0.2701, + "step": 26515 + }, + { + "epoch": 0.77, + "grad_norm": 1.2967273125116954, + "learning_rate": 1.33429948179774e-06, + "loss": 0.268, + "step": 26516 + }, + { + "epoch": 0.77, + "grad_norm": 1.3298637388700416, + "learning_rate": 1.3339800603050468e-06, + "loss": 0.2654, + "step": 26517 + }, + { + "epoch": 0.77, + "grad_norm": 1.9637947735326329, + "learning_rate": 1.3336606711650202e-06, + "loss": 0.2873, + "step": 26518 + }, + { + "epoch": 0.77, + "grad_norm": 1.4097802469675178, + "learning_rate": 1.333341314380479e-06, + "loss": 0.2856, + "step": 26519 + }, + { + "epoch": 0.77, + "grad_norm": 1.470276774675477, + "learning_rate": 1.3330219899542413e-06, + "loss": 0.2601, + "step": 26520 + }, + { + "epoch": 0.77, + "grad_norm": 1.3624485767062104, + "learning_rate": 1.3327026978891255e-06, + "loss": 0.2875, + "step": 26521 + }, + { + "epoch": 0.77, + "grad_norm": 2.9711132185170626, + "learning_rate": 1.3323834381879486e-06, + "loss": 0.2702, + "step": 26522 + }, + { + "epoch": 0.77, + "grad_norm": 1.4255021313808742, + "learning_rate": 1.3320642108535287e-06, + "loss": 0.2611, + "step": 26523 + }, + { + "epoch": 0.77, + "grad_norm": 1.3928618021248542, + "learning_rate": 1.3317450158886841e-06, + "loss": 0.2477, + "step": 26524 + }, + { + "epoch": 0.77, + "grad_norm": 1.4037919662481297, + "learning_rate": 1.3314258532962294e-06, + "loss": 0.2829, + "step": 26525 + }, + { + "epoch": 0.77, + "grad_norm": 1.2592001171512524, + "learning_rate": 1.331106723078982e-06, + "loss": 0.2699, + "step": 26526 + }, + { + "epoch": 0.77, + "grad_norm": 1.2436354782567853, + "learning_rate": 1.3307876252397583e-06, + "loss": 0.283, + "step": 26527 + }, + { + "epoch": 0.77, + "grad_norm": 1.265072698654071, + "learning_rate": 1.330468559781375e-06, + "loss": 0.2784, + "step": 26528 + }, + { + "epoch": 0.77, + "grad_norm": 1.2896798135405414, + "learning_rate": 1.3301495267066473e-06, + "loss": 0.2738, + "step": 26529 + }, + { + "epoch": 0.77, + "grad_norm": 1.8279581511339285, + "learning_rate": 1.3298305260183903e-06, + "loss": 0.2695, + "step": 26530 + }, + { + "epoch": 0.77, + "grad_norm": 1.248649116639314, + "learning_rate": 1.3295115577194196e-06, + "loss": 0.2795, + "step": 26531 + }, + { + "epoch": 0.77, + "grad_norm": 1.7727981875115038, + "learning_rate": 1.3291926218125516e-06, + "loss": 0.2843, + "step": 26532 + }, + { + "epoch": 0.77, + "grad_norm": 1.337707254342925, + "learning_rate": 1.3288737183005985e-06, + "loss": 0.2812, + "step": 26533 + }, + { + "epoch": 0.77, + "grad_norm": 1.437502209310243, + "learning_rate": 1.328554847186375e-06, + "loss": 0.2781, + "step": 26534 + }, + { + "epoch": 0.77, + "grad_norm": 1.6180795907736791, + "learning_rate": 1.3282360084726965e-06, + "loss": 0.2852, + "step": 26535 + }, + { + "epoch": 0.77, + "grad_norm": 1.308224341355667, + "learning_rate": 1.327917202162375e-06, + "loss": 0.2924, + "step": 26536 + }, + { + "epoch": 0.77, + "grad_norm": 0.9419778619389048, + "learning_rate": 1.3275984282582254e-06, + "loss": 0.5225, + "step": 26537 + }, + { + "epoch": 0.77, + "grad_norm": 1.2874608516396406, + "learning_rate": 1.3272796867630604e-06, + "loss": 0.2684, + "step": 26538 + }, + { + "epoch": 0.77, + "grad_norm": 1.3815221644286702, + "learning_rate": 1.3269609776796944e-06, + "loss": 0.2782, + "step": 26539 + }, + { + "epoch": 0.77, + "grad_norm": 1.4460714364208538, + "learning_rate": 1.3266423010109368e-06, + "loss": 0.2954, + "step": 26540 + }, + { + "epoch": 0.77, + "grad_norm": 1.3997837370671788, + "learning_rate": 1.3263236567596015e-06, + "loss": 0.2718, + "step": 26541 + }, + { + "epoch": 0.77, + "grad_norm": 1.7709760409345865, + "learning_rate": 1.3260050449285012e-06, + "loss": 0.2554, + "step": 26542 + }, + { + "epoch": 0.77, + "grad_norm": 1.4056991765862965, + "learning_rate": 1.3256864655204488e-06, + "loss": 0.2968, + "step": 26543 + }, + { + "epoch": 0.77, + "grad_norm": 1.3655463424151346, + "learning_rate": 1.3253679185382523e-06, + "loss": 0.3017, + "step": 26544 + }, + { + "epoch": 0.77, + "grad_norm": 1.2750761110763442, + "learning_rate": 1.3250494039847245e-06, + "loss": 0.2697, + "step": 26545 + }, + { + "epoch": 0.77, + "grad_norm": 1.3474722848536371, + "learning_rate": 1.3247309218626764e-06, + "loss": 0.2645, + "step": 26546 + }, + { + "epoch": 0.77, + "grad_norm": 1.4391520306009622, + "learning_rate": 1.324412472174919e-06, + "loss": 0.2817, + "step": 26547 + }, + { + "epoch": 0.77, + "grad_norm": 1.3994779908440456, + "learning_rate": 1.324094054924262e-06, + "loss": 0.2828, + "step": 26548 + }, + { + "epoch": 0.77, + "grad_norm": 1.177347547267135, + "learning_rate": 1.3237756701135164e-06, + "loss": 0.2572, + "step": 26549 + }, + { + "epoch": 0.77, + "grad_norm": 1.8911548791054626, + "learning_rate": 1.3234573177454906e-06, + "loss": 0.3051, + "step": 26550 + }, + { + "epoch": 0.77, + "grad_norm": 1.3294517549540383, + "learning_rate": 1.3231389978229965e-06, + "loss": 0.2814, + "step": 26551 + }, + { + "epoch": 0.77, + "grad_norm": 1.3158902745123464, + "learning_rate": 1.32282071034884e-06, + "loss": 0.2907, + "step": 26552 + }, + { + "epoch": 0.77, + "grad_norm": 1.3602486283222153, + "learning_rate": 1.3225024553258315e-06, + "loss": 0.2747, + "step": 26553 + }, + { + "epoch": 0.77, + "grad_norm": 1.4292356495932343, + "learning_rate": 1.3221842327567797e-06, + "loss": 0.2819, + "step": 26554 + }, + { + "epoch": 0.77, + "grad_norm": 1.2268943261379084, + "learning_rate": 1.3218660426444924e-06, + "loss": 0.2692, + "step": 26555 + }, + { + "epoch": 0.77, + "grad_norm": 1.3754826708801617, + "learning_rate": 1.321547884991779e-06, + "loss": 0.2519, + "step": 26556 + }, + { + "epoch": 0.77, + "grad_norm": 1.3672261685304403, + "learning_rate": 1.3212297598014456e-06, + "loss": 0.2826, + "step": 26557 + }, + { + "epoch": 0.77, + "grad_norm": 1.4403878754214463, + "learning_rate": 1.3209116670763005e-06, + "loss": 0.276, + "step": 26558 + }, + { + "epoch": 0.77, + "grad_norm": 1.3514452021002827, + "learning_rate": 1.320593606819151e-06, + "loss": 0.294, + "step": 26559 + }, + { + "epoch": 0.77, + "grad_norm": 1.274994177170303, + "learning_rate": 1.3202755790328049e-06, + "loss": 0.2618, + "step": 26560 + }, + { + "epoch": 0.77, + "grad_norm": 1.2822045434339773, + "learning_rate": 1.3199575837200661e-06, + "loss": 0.2787, + "step": 26561 + }, + { + "epoch": 0.77, + "grad_norm": 2.4012647267679235, + "learning_rate": 1.3196396208837431e-06, + "loss": 0.3488, + "step": 26562 + }, + { + "epoch": 0.77, + "grad_norm": 1.2674285615512133, + "learning_rate": 1.3193216905266403e-06, + "loss": 0.2745, + "step": 26563 + }, + { + "epoch": 0.77, + "grad_norm": 1.4301164188227935, + "learning_rate": 1.3190037926515653e-06, + "loss": 0.2713, + "step": 26564 + }, + { + "epoch": 0.77, + "grad_norm": 1.3089387893776172, + "learning_rate": 1.3186859272613222e-06, + "loss": 0.26, + "step": 26565 + }, + { + "epoch": 0.77, + "grad_norm": 1.3781860149431007, + "learning_rate": 1.3183680943587168e-06, + "loss": 0.2958, + "step": 26566 + }, + { + "epoch": 0.77, + "grad_norm": 1.4018531661866767, + "learning_rate": 1.318050293946555e-06, + "loss": 0.2785, + "step": 26567 + }, + { + "epoch": 0.77, + "grad_norm": 1.4713759342810377, + "learning_rate": 1.317732526027639e-06, + "loss": 0.267, + "step": 26568 + }, + { + "epoch": 0.77, + "grad_norm": 1.3131931191199002, + "learning_rate": 1.3174147906047747e-06, + "loss": 0.2488, + "step": 26569 + }, + { + "epoch": 0.77, + "grad_norm": 1.756586379959793, + "learning_rate": 1.3170970876807665e-06, + "loss": 0.2778, + "step": 26570 + }, + { + "epoch": 0.77, + "grad_norm": 1.4874549847171248, + "learning_rate": 1.3167794172584164e-06, + "loss": 0.2619, + "step": 26571 + }, + { + "epoch": 0.77, + "grad_norm": 1.3415893408621369, + "learning_rate": 1.3164617793405282e-06, + "loss": 0.2909, + "step": 26572 + }, + { + "epoch": 0.77, + "grad_norm": 1.5906525600542951, + "learning_rate": 1.3161441739299063e-06, + "loss": 0.2791, + "step": 26573 + }, + { + "epoch": 0.77, + "grad_norm": 1.5048947374707646, + "learning_rate": 1.3158266010293524e-06, + "loss": 0.2763, + "step": 26574 + }, + { + "epoch": 0.77, + "grad_norm": 1.3229790730249777, + "learning_rate": 1.3155090606416698e-06, + "loss": 0.2615, + "step": 26575 + }, + { + "epoch": 0.77, + "grad_norm": 1.40818353663594, + "learning_rate": 1.3151915527696607e-06, + "loss": 0.278, + "step": 26576 + }, + { + "epoch": 0.77, + "grad_norm": 1.3739028660341008, + "learning_rate": 1.3148740774161274e-06, + "loss": 0.2519, + "step": 26577 + }, + { + "epoch": 0.77, + "grad_norm": 3.0577410752568666, + "learning_rate": 1.3145566345838707e-06, + "loss": 0.28, + "step": 26578 + }, + { + "epoch": 0.77, + "grad_norm": 1.3233958590962953, + "learning_rate": 1.314239224275694e-06, + "loss": 0.2808, + "step": 26579 + }, + { + "epoch": 0.77, + "grad_norm": 1.3682047612144301, + "learning_rate": 1.3139218464943958e-06, + "loss": 0.3151, + "step": 26580 + }, + { + "epoch": 0.77, + "grad_norm": 1.3515063200128454, + "learning_rate": 1.3136045012427778e-06, + "loss": 0.2834, + "step": 26581 + }, + { + "epoch": 0.77, + "grad_norm": 1.4559844081662006, + "learning_rate": 1.3132871885236414e-06, + "loss": 0.2653, + "step": 26582 + }, + { + "epoch": 0.77, + "grad_norm": 1.5149355690426787, + "learning_rate": 1.3129699083397867e-06, + "loss": 0.2724, + "step": 26583 + }, + { + "epoch": 0.77, + "grad_norm": 1.3577394374222487, + "learning_rate": 1.3126526606940132e-06, + "loss": 0.2706, + "step": 26584 + }, + { + "epoch": 0.77, + "grad_norm": 1.3531963616883131, + "learning_rate": 1.3123354455891208e-06, + "loss": 0.2741, + "step": 26585 + }, + { + "epoch": 0.77, + "grad_norm": 1.5852483914278122, + "learning_rate": 1.3120182630279089e-06, + "loss": 0.2789, + "step": 26586 + }, + { + "epoch": 0.77, + "grad_norm": 1.3902420013996557, + "learning_rate": 1.3117011130131785e-06, + "loss": 0.2781, + "step": 26587 + }, + { + "epoch": 0.77, + "grad_norm": 1.3147465313716387, + "learning_rate": 1.3113839955477249e-06, + "loss": 0.267, + "step": 26588 + }, + { + "epoch": 0.77, + "grad_norm": 1.3264425150295884, + "learning_rate": 1.3110669106343488e-06, + "loss": 0.2665, + "step": 26589 + }, + { + "epoch": 0.77, + "grad_norm": 1.3835883473440658, + "learning_rate": 1.3107498582758477e-06, + "loss": 0.2753, + "step": 26590 + }, + { + "epoch": 0.77, + "grad_norm": 1.334407276591786, + "learning_rate": 1.3104328384750204e-06, + "loss": 0.2921, + "step": 26591 + }, + { + "epoch": 0.77, + "grad_norm": 1.5160250212728694, + "learning_rate": 1.310115851234664e-06, + "loss": 0.2834, + "step": 26592 + }, + { + "epoch": 0.77, + "grad_norm": 1.4194204985997576, + "learning_rate": 1.3097988965575763e-06, + "loss": 0.2711, + "step": 26593 + }, + { + "epoch": 0.77, + "grad_norm": 1.3573236614443294, + "learning_rate": 1.3094819744465542e-06, + "loss": 0.2564, + "step": 26594 + }, + { + "epoch": 0.77, + "grad_norm": 2.1574962637323805, + "learning_rate": 1.3091650849043962e-06, + "loss": 0.2734, + "step": 26595 + }, + { + "epoch": 0.77, + "grad_norm": 1.2251525774721819, + "learning_rate": 1.3088482279338953e-06, + "loss": 0.2705, + "step": 26596 + }, + { + "epoch": 0.77, + "grad_norm": 1.311294690330485, + "learning_rate": 1.308531403537852e-06, + "loss": 0.2847, + "step": 26597 + }, + { + "epoch": 0.77, + "grad_norm": 1.7879651279174866, + "learning_rate": 1.3082146117190581e-06, + "loss": 0.277, + "step": 26598 + }, + { + "epoch": 0.77, + "grad_norm": 2.245187811408738, + "learning_rate": 1.3078978524803116e-06, + "loss": 0.2831, + "step": 26599 + }, + { + "epoch": 0.77, + "grad_norm": 1.7437062212942642, + "learning_rate": 1.307581125824407e-06, + "loss": 0.2819, + "step": 26600 + }, + { + "epoch": 0.77, + "grad_norm": 1.3525587270263941, + "learning_rate": 1.30726443175414e-06, + "loss": 0.3041, + "step": 26601 + }, + { + "epoch": 0.77, + "grad_norm": 1.6045446501543703, + "learning_rate": 1.3069477702723054e-06, + "loss": 0.2878, + "step": 26602 + }, + { + "epoch": 0.77, + "grad_norm": 1.1594232212764888, + "learning_rate": 1.306631141381698e-06, + "loss": 0.2529, + "step": 26603 + }, + { + "epoch": 0.77, + "grad_norm": 1.4086487540592756, + "learning_rate": 1.3063145450851112e-06, + "loss": 0.2884, + "step": 26604 + }, + { + "epoch": 0.77, + "grad_norm": 1.2747077558158144, + "learning_rate": 1.3059979813853401e-06, + "loss": 0.2746, + "step": 26605 + }, + { + "epoch": 0.77, + "grad_norm": 1.3170110982119807, + "learning_rate": 1.305681450285179e-06, + "loss": 0.2841, + "step": 26606 + }, + { + "epoch": 0.77, + "grad_norm": 1.3117977687356668, + "learning_rate": 1.3053649517874184e-06, + "loss": 0.2694, + "step": 26607 + }, + { + "epoch": 0.77, + "grad_norm": 1.4213173928909564, + "learning_rate": 1.305048485894853e-06, + "loss": 0.2649, + "step": 26608 + }, + { + "epoch": 0.77, + "grad_norm": 1.3381927747209545, + "learning_rate": 1.3047320526102764e-06, + "loss": 0.2614, + "step": 26609 + }, + { + "epoch": 0.77, + "grad_norm": 1.261403167490579, + "learning_rate": 1.3044156519364797e-06, + "loss": 0.2731, + "step": 26610 + }, + { + "epoch": 0.77, + "grad_norm": 2.432358910698076, + "learning_rate": 1.3040992838762562e-06, + "loss": 0.302, + "step": 26611 + }, + { + "epoch": 0.77, + "grad_norm": 0.9748400308209108, + "learning_rate": 1.3037829484323977e-06, + "loss": 0.6209, + "step": 26612 + }, + { + "epoch": 0.77, + "grad_norm": 1.3150580016352447, + "learning_rate": 1.3034666456076954e-06, + "loss": 0.2691, + "step": 26613 + }, + { + "epoch": 0.77, + "grad_norm": 1.295220270299059, + "learning_rate": 1.3031503754049412e-06, + "loss": 0.2722, + "step": 26614 + }, + { + "epoch": 0.77, + "grad_norm": 1.2149130264164942, + "learning_rate": 1.302834137826927e-06, + "loss": 0.2624, + "step": 26615 + }, + { + "epoch": 0.77, + "grad_norm": 1.5158222329321513, + "learning_rate": 1.3025179328764414e-06, + "loss": 0.2724, + "step": 26616 + }, + { + "epoch": 0.77, + "grad_norm": 1.4672248587625034, + "learning_rate": 1.3022017605562758e-06, + "loss": 0.2792, + "step": 26617 + }, + { + "epoch": 0.77, + "grad_norm": 1.3053985702841455, + "learning_rate": 1.301885620869221e-06, + "loss": 0.2672, + "step": 26618 + }, + { + "epoch": 0.77, + "grad_norm": 1.3621326620765453, + "learning_rate": 1.3015695138180668e-06, + "loss": 0.2875, + "step": 26619 + }, + { + "epoch": 0.77, + "grad_norm": 1.4034800884895364, + "learning_rate": 1.3012534394056025e-06, + "loss": 0.2606, + "step": 26620 + }, + { + "epoch": 0.77, + "grad_norm": 1.3218922657048195, + "learning_rate": 1.3009373976346173e-06, + "loss": 0.3083, + "step": 26621 + }, + { + "epoch": 0.77, + "grad_norm": 2.0352349248511516, + "learning_rate": 1.300621388507901e-06, + "loss": 0.2806, + "step": 26622 + }, + { + "epoch": 0.77, + "grad_norm": 1.4417179802597544, + "learning_rate": 1.3003054120282433e-06, + "loss": 0.266, + "step": 26623 + }, + { + "epoch": 0.77, + "grad_norm": 1.2573147927295638, + "learning_rate": 1.29998946819843e-06, + "loss": 0.2885, + "step": 26624 + }, + { + "epoch": 0.77, + "grad_norm": 1.6863266654208988, + "learning_rate": 1.2996735570212516e-06, + "loss": 0.3251, + "step": 26625 + }, + { + "epoch": 0.77, + "grad_norm": 1.3566922934155554, + "learning_rate": 1.2993576784994943e-06, + "loss": 0.2937, + "step": 26626 + }, + { + "epoch": 0.77, + "grad_norm": 1.2748470976522381, + "learning_rate": 1.299041832635946e-06, + "loss": 0.3021, + "step": 26627 + }, + { + "epoch": 0.77, + "grad_norm": 1.4729936504293324, + "learning_rate": 1.298726019433395e-06, + "loss": 0.2825, + "step": 26628 + }, + { + "epoch": 0.77, + "grad_norm": 1.2865221309304546, + "learning_rate": 1.298410238894628e-06, + "loss": 0.2864, + "step": 26629 + }, + { + "epoch": 0.77, + "grad_norm": 1.3304686783811541, + "learning_rate": 1.2980944910224313e-06, + "loss": 0.2818, + "step": 26630 + }, + { + "epoch": 0.77, + "grad_norm": 1.2995578044905458, + "learning_rate": 1.297778775819592e-06, + "loss": 0.281, + "step": 26631 + }, + { + "epoch": 0.77, + "grad_norm": 5.015372604346546, + "learning_rate": 1.2974630932888966e-06, + "loss": 0.2863, + "step": 26632 + }, + { + "epoch": 0.77, + "grad_norm": 1.6038035575881102, + "learning_rate": 1.297147443433131e-06, + "loss": 0.2866, + "step": 26633 + }, + { + "epoch": 0.77, + "grad_norm": 1.2180877134834587, + "learning_rate": 1.2968318262550794e-06, + "loss": 0.2536, + "step": 26634 + }, + { + "epoch": 0.77, + "grad_norm": 1.3063199269603905, + "learning_rate": 1.2965162417575278e-06, + "loss": 0.2695, + "step": 26635 + }, + { + "epoch": 0.77, + "grad_norm": 1.6012649735891233, + "learning_rate": 1.2962006899432616e-06, + "loss": 0.3116, + "step": 26636 + }, + { + "epoch": 0.77, + "grad_norm": 1.3811496400797936, + "learning_rate": 1.2958851708150654e-06, + "loss": 0.2956, + "step": 26637 + }, + { + "epoch": 0.77, + "grad_norm": 1.4444628065662148, + "learning_rate": 1.2955696843757238e-06, + "loss": 0.2642, + "step": 26638 + }, + { + "epoch": 0.77, + "grad_norm": 1.3158669175318665, + "learning_rate": 1.2952542306280202e-06, + "loss": 0.2832, + "step": 26639 + }, + { + "epoch": 0.77, + "grad_norm": 1.5877002177166077, + "learning_rate": 1.2949388095747395e-06, + "loss": 0.2609, + "step": 26640 + }, + { + "epoch": 0.77, + "grad_norm": 1.4001624146578255, + "learning_rate": 1.2946234212186654e-06, + "loss": 0.2764, + "step": 26641 + }, + { + "epoch": 0.77, + "grad_norm": 1.4432880808171973, + "learning_rate": 1.2943080655625813e-06, + "loss": 0.2825, + "step": 26642 + }, + { + "epoch": 0.77, + "grad_norm": 1.221208641389645, + "learning_rate": 1.2939927426092686e-06, + "loss": 0.265, + "step": 26643 + }, + { + "epoch": 0.77, + "grad_norm": 1.3168406182125236, + "learning_rate": 1.2936774523615109e-06, + "loss": 0.2702, + "step": 26644 + }, + { + "epoch": 0.77, + "grad_norm": 0.9353611872083845, + "learning_rate": 1.2933621948220908e-06, + "loss": 0.6129, + "step": 26645 + }, + { + "epoch": 0.77, + "grad_norm": 1.481016099445944, + "learning_rate": 1.293046969993791e-06, + "loss": 0.2691, + "step": 26646 + }, + { + "epoch": 0.77, + "grad_norm": 1.2254243059091867, + "learning_rate": 1.2927317778793925e-06, + "loss": 0.2731, + "step": 26647 + }, + { + "epoch": 0.77, + "grad_norm": 1.4361288608358915, + "learning_rate": 1.2924166184816766e-06, + "loss": 0.2751, + "step": 26648 + }, + { + "epoch": 0.77, + "grad_norm": 1.5535855509551018, + "learning_rate": 1.2921014918034259e-06, + "loss": 0.2618, + "step": 26649 + }, + { + "epoch": 0.77, + "grad_norm": 1.3664096581107261, + "learning_rate": 1.2917863978474204e-06, + "loss": 0.283, + "step": 26650 + }, + { + "epoch": 0.77, + "grad_norm": 1.4916018414668144, + "learning_rate": 1.2914713366164427e-06, + "loss": 0.2889, + "step": 26651 + }, + { + "epoch": 0.77, + "grad_norm": 1.27142975305825, + "learning_rate": 1.2911563081132717e-06, + "loss": 0.2577, + "step": 26652 + }, + { + "epoch": 0.77, + "grad_norm": 0.9107579123839138, + "learning_rate": 1.2908413123406855e-06, + "loss": 0.5073, + "step": 26653 + }, + { + "epoch": 0.77, + "grad_norm": 1.3341068658369266, + "learning_rate": 1.2905263493014657e-06, + "loss": 0.2795, + "step": 26654 + }, + { + "epoch": 0.77, + "grad_norm": 1.9860155761787286, + "learning_rate": 1.2902114189983927e-06, + "loss": 0.2872, + "step": 26655 + }, + { + "epoch": 0.77, + "grad_norm": 2.0983546425097517, + "learning_rate": 1.2898965214342447e-06, + "loss": 0.2808, + "step": 26656 + }, + { + "epoch": 0.77, + "grad_norm": 1.4306635377386046, + "learning_rate": 1.2895816566118014e-06, + "loss": 0.2895, + "step": 26657 + }, + { + "epoch": 0.77, + "grad_norm": 1.2449546269140177, + "learning_rate": 1.2892668245338408e-06, + "loss": 0.2835, + "step": 26658 + }, + { + "epoch": 0.77, + "grad_norm": 1.4398558824672238, + "learning_rate": 1.2889520252031418e-06, + "loss": 0.284, + "step": 26659 + }, + { + "epoch": 0.77, + "grad_norm": 0.9695957669889425, + "learning_rate": 1.288637258622482e-06, + "loss": 0.5288, + "step": 26660 + }, + { + "epoch": 0.77, + "grad_norm": 1.1657102899669285, + "learning_rate": 1.2883225247946412e-06, + "loss": 0.2888, + "step": 26661 + }, + { + "epoch": 0.77, + "grad_norm": 1.3132903315870588, + "learning_rate": 1.2880078237223943e-06, + "loss": 0.2743, + "step": 26662 + }, + { + "epoch": 0.77, + "grad_norm": 1.3823757167376178, + "learning_rate": 1.2876931554085191e-06, + "loss": 0.2741, + "step": 26663 + }, + { + "epoch": 0.77, + "grad_norm": 1.368820811261778, + "learning_rate": 1.287378519855793e-06, + "loss": 0.2919, + "step": 26664 + }, + { + "epoch": 0.77, + "grad_norm": 1.4193693929156406, + "learning_rate": 1.287063917066993e-06, + "loss": 0.284, + "step": 26665 + }, + { + "epoch": 0.77, + "grad_norm": 1.2642522074293678, + "learning_rate": 1.2867493470448943e-06, + "loss": 0.2767, + "step": 26666 + }, + { + "epoch": 0.77, + "grad_norm": 1.4484623432297121, + "learning_rate": 1.2864348097922746e-06, + "loss": 0.2942, + "step": 26667 + }, + { + "epoch": 0.77, + "grad_norm": 1.5103956716024274, + "learning_rate": 1.2861203053119086e-06, + "loss": 0.2631, + "step": 26668 + }, + { + "epoch": 0.77, + "grad_norm": 1.4772450093762395, + "learning_rate": 1.2858058336065738e-06, + "loss": 0.2895, + "step": 26669 + }, + { + "epoch": 0.77, + "grad_norm": 1.43780749121438, + "learning_rate": 1.285491394679042e-06, + "loss": 0.2834, + "step": 26670 + }, + { + "epoch": 0.77, + "grad_norm": 1.326423928573755, + "learning_rate": 1.2851769885320897e-06, + "loss": 0.2797, + "step": 26671 + }, + { + "epoch": 0.77, + "grad_norm": 2.006704610644627, + "learning_rate": 1.2848626151684918e-06, + "loss": 0.2657, + "step": 26672 + }, + { + "epoch": 0.77, + "grad_norm": 1.4565295316773805, + "learning_rate": 1.2845482745910227e-06, + "loss": 0.2671, + "step": 26673 + }, + { + "epoch": 0.77, + "grad_norm": 1.4069478539243152, + "learning_rate": 1.2842339668024562e-06, + "loss": 0.2722, + "step": 26674 + }, + { + "epoch": 0.77, + "grad_norm": 1.550226319379654, + "learning_rate": 1.2839196918055663e-06, + "loss": 0.2841, + "step": 26675 + }, + { + "epoch": 0.77, + "grad_norm": 1.4866524048613468, + "learning_rate": 1.2836054496031258e-06, + "loss": 0.2831, + "step": 26676 + }, + { + "epoch": 0.77, + "grad_norm": 1.4878932619555834, + "learning_rate": 1.283291240197909e-06, + "loss": 0.2916, + "step": 26677 + }, + { + "epoch": 0.77, + "grad_norm": 1.251080817697751, + "learning_rate": 1.2829770635926897e-06, + "loss": 0.2602, + "step": 26678 + }, + { + "epoch": 0.77, + "grad_norm": 2.492126508491625, + "learning_rate": 1.2826629197902373e-06, + "loss": 0.2855, + "step": 26679 + }, + { + "epoch": 0.77, + "grad_norm": 1.4123297983008, + "learning_rate": 1.2823488087933279e-06, + "loss": 0.2606, + "step": 26680 + }, + { + "epoch": 0.77, + "grad_norm": 1.4059875861099473, + "learning_rate": 1.2820347306047292e-06, + "loss": 0.2922, + "step": 26681 + }, + { + "epoch": 0.77, + "grad_norm": 1.4504823935940259, + "learning_rate": 1.2817206852272157e-06, + "loss": 0.2751, + "step": 26682 + }, + { + "epoch": 0.77, + "grad_norm": 1.4193004552525097, + "learning_rate": 1.2814066726635582e-06, + "loss": 0.288, + "step": 26683 + }, + { + "epoch": 0.77, + "grad_norm": 1.3630663713571272, + "learning_rate": 1.2810926929165285e-06, + "loss": 0.2901, + "step": 26684 + }, + { + "epoch": 0.77, + "grad_norm": 3.3527388524086215, + "learning_rate": 1.2807787459888965e-06, + "loss": 0.2682, + "step": 26685 + }, + { + "epoch": 0.77, + "grad_norm": 1.706618025741561, + "learning_rate": 1.280464831883434e-06, + "loss": 0.2793, + "step": 26686 + }, + { + "epoch": 0.77, + "grad_norm": 1.323676145547638, + "learning_rate": 1.2801509506029102e-06, + "loss": 0.2789, + "step": 26687 + }, + { + "epoch": 0.77, + "grad_norm": 1.2493325764345373, + "learning_rate": 1.2798371021500971e-06, + "loss": 0.2741, + "step": 26688 + }, + { + "epoch": 0.77, + "grad_norm": 1.3810619128701307, + "learning_rate": 1.2795232865277612e-06, + "loss": 0.2846, + "step": 26689 + }, + { + "epoch": 0.77, + "grad_norm": 1.32084235288674, + "learning_rate": 1.2792095037386737e-06, + "loss": 0.2638, + "step": 26690 + }, + { + "epoch": 0.77, + "grad_norm": 1.461863522619761, + "learning_rate": 1.2788957537856035e-06, + "loss": 0.2734, + "step": 26691 + }, + { + "epoch": 0.77, + "grad_norm": 1.2840555097838215, + "learning_rate": 1.2785820366713198e-06, + "loss": 0.2993, + "step": 26692 + }, + { + "epoch": 0.77, + "grad_norm": 1.3434509860884016, + "learning_rate": 1.2782683523985905e-06, + "loss": 0.295, + "step": 26693 + }, + { + "epoch": 0.77, + "grad_norm": 1.5683468301673416, + "learning_rate": 1.2779547009701842e-06, + "loss": 0.3, + "step": 26694 + }, + { + "epoch": 0.77, + "grad_norm": 1.3935020463171266, + "learning_rate": 1.2776410823888697e-06, + "loss": 0.2857, + "step": 26695 + }, + { + "epoch": 0.77, + "grad_norm": 1.4375191232232647, + "learning_rate": 1.2773274966574134e-06, + "loss": 0.2815, + "step": 26696 + }, + { + "epoch": 0.77, + "grad_norm": 1.391843428169969, + "learning_rate": 1.2770139437785845e-06, + "loss": 0.2638, + "step": 26697 + }, + { + "epoch": 0.77, + "grad_norm": 1.2654949071617425, + "learning_rate": 1.2767004237551478e-06, + "loss": 0.261, + "step": 26698 + }, + { + "epoch": 0.77, + "grad_norm": 1.5433711445937734, + "learning_rate": 1.2763869365898712e-06, + "loss": 0.2632, + "step": 26699 + }, + { + "epoch": 0.77, + "grad_norm": 1.2879846056076876, + "learning_rate": 1.276073482285521e-06, + "loss": 0.2805, + "step": 26700 + }, + { + "epoch": 0.77, + "grad_norm": 1.4020596859033194, + "learning_rate": 1.2757600608448634e-06, + "loss": 0.3151, + "step": 26701 + }, + { + "epoch": 0.77, + "grad_norm": 1.2988379203846108, + "learning_rate": 1.275446672270665e-06, + "loss": 0.2832, + "step": 26702 + }, + { + "epoch": 0.77, + "grad_norm": 1.2757328381533561, + "learning_rate": 1.275133316565691e-06, + "loss": 0.2581, + "step": 26703 + }, + { + "epoch": 0.77, + "grad_norm": 1.209994675657145, + "learning_rate": 1.2748199937327066e-06, + "loss": 0.2735, + "step": 26704 + }, + { + "epoch": 0.77, + "grad_norm": 1.4613691667582265, + "learning_rate": 1.2745067037744785e-06, + "loss": 0.2964, + "step": 26705 + }, + { + "epoch": 0.77, + "grad_norm": 1.2700300624446181, + "learning_rate": 1.2741934466937689e-06, + "loss": 0.2536, + "step": 26706 + }, + { + "epoch": 0.77, + "grad_norm": 1.3775643181464556, + "learning_rate": 1.273880222493345e-06, + "loss": 0.2674, + "step": 26707 + }, + { + "epoch": 0.77, + "grad_norm": 1.4593653077253597, + "learning_rate": 1.2735670311759674e-06, + "loss": 0.2912, + "step": 26708 + }, + { + "epoch": 0.77, + "grad_norm": 1.273538492115937, + "learning_rate": 1.2732538727444027e-06, + "loss": 0.2843, + "step": 26709 + }, + { + "epoch": 0.77, + "grad_norm": 1.3370011255136247, + "learning_rate": 1.2729407472014132e-06, + "loss": 0.2756, + "step": 26710 + }, + { + "epoch": 0.77, + "grad_norm": 1.4109016349646297, + "learning_rate": 1.272627654549764e-06, + "loss": 0.2815, + "step": 26711 + }, + { + "epoch": 0.77, + "grad_norm": 1.417968949455221, + "learning_rate": 1.272314594792216e-06, + "loss": 0.2803, + "step": 26712 + }, + { + "epoch": 0.77, + "grad_norm": 1.2957511753112254, + "learning_rate": 1.272001567931534e-06, + "loss": 0.2826, + "step": 26713 + }, + { + "epoch": 0.77, + "grad_norm": 2.179784906745792, + "learning_rate": 1.2716885739704787e-06, + "loss": 0.2793, + "step": 26714 + }, + { + "epoch": 0.77, + "grad_norm": 1.3507189988917772, + "learning_rate": 1.2713756129118137e-06, + "loss": 0.2622, + "step": 26715 + }, + { + "epoch": 0.77, + "grad_norm": 1.3130655989173479, + "learning_rate": 1.2710626847583013e-06, + "loss": 0.2977, + "step": 26716 + }, + { + "epoch": 0.77, + "grad_norm": 1.2237165691526402, + "learning_rate": 1.2707497895127013e-06, + "loss": 0.2691, + "step": 26717 + }, + { + "epoch": 0.77, + "grad_norm": 1.5576451282961332, + "learning_rate": 1.270436927177775e-06, + "loss": 0.2649, + "step": 26718 + }, + { + "epoch": 0.77, + "grad_norm": 1.2930181955691564, + "learning_rate": 1.2701240977562846e-06, + "loss": 0.2663, + "step": 26719 + }, + { + "epoch": 0.78, + "grad_norm": 1.5118897950863417, + "learning_rate": 1.2698113012509906e-06, + "loss": 0.2713, + "step": 26720 + }, + { + "epoch": 0.78, + "grad_norm": 1.2703470110452222, + "learning_rate": 1.269498537664653e-06, + "loss": 0.26, + "step": 26721 + }, + { + "epoch": 0.78, + "grad_norm": 1.231257473455264, + "learning_rate": 1.269185807000032e-06, + "loss": 0.2589, + "step": 26722 + }, + { + "epoch": 0.78, + "grad_norm": 1.5802015743623874, + "learning_rate": 1.2688731092598877e-06, + "loss": 0.2771, + "step": 26723 + }, + { + "epoch": 0.78, + "grad_norm": 1.3739321618289886, + "learning_rate": 1.2685604444469813e-06, + "loss": 0.2983, + "step": 26724 + }, + { + "epoch": 0.78, + "grad_norm": 1.5380400360416064, + "learning_rate": 1.2682478125640685e-06, + "loss": 0.2808, + "step": 26725 + }, + { + "epoch": 0.78, + "grad_norm": 1.3476165069396178, + "learning_rate": 1.2679352136139105e-06, + "loss": 0.2614, + "step": 26726 + }, + { + "epoch": 0.78, + "grad_norm": 1.40598287378101, + "learning_rate": 1.267622647599266e-06, + "loss": 0.2824, + "step": 26727 + }, + { + "epoch": 0.78, + "grad_norm": 1.7768237270084926, + "learning_rate": 1.2673101145228923e-06, + "loss": 0.2776, + "step": 26728 + }, + { + "epoch": 0.78, + "grad_norm": 1.3521872168759772, + "learning_rate": 1.2669976143875484e-06, + "loss": 0.2811, + "step": 26729 + }, + { + "epoch": 0.78, + "grad_norm": 1.28924218674297, + "learning_rate": 1.2666851471959923e-06, + "loss": 0.2837, + "step": 26730 + }, + { + "epoch": 0.78, + "grad_norm": 1.298676465333867, + "learning_rate": 1.2663727129509812e-06, + "loss": 0.2706, + "step": 26731 + }, + { + "epoch": 0.78, + "grad_norm": 1.7394778697713054, + "learning_rate": 1.2660603116552716e-06, + "loss": 0.2558, + "step": 26732 + }, + { + "epoch": 0.78, + "grad_norm": 1.7565443630331148, + "learning_rate": 1.2657479433116232e-06, + "loss": 0.2783, + "step": 26733 + }, + { + "epoch": 0.78, + "grad_norm": 1.3180944764585711, + "learning_rate": 1.2654356079227891e-06, + "loss": 0.2823, + "step": 26734 + }, + { + "epoch": 0.78, + "grad_norm": 1.2940597401143885, + "learning_rate": 1.265123305491529e-06, + "loss": 0.2917, + "step": 26735 + }, + { + "epoch": 0.78, + "grad_norm": 1.2164650054799604, + "learning_rate": 1.2648110360205956e-06, + "loss": 0.2823, + "step": 26736 + }, + { + "epoch": 0.78, + "grad_norm": 1.2850407126938623, + "learning_rate": 1.2644987995127456e-06, + "loss": 0.2619, + "step": 26737 + }, + { + "epoch": 0.78, + "grad_norm": 1.362548873184832, + "learning_rate": 1.2641865959707356e-06, + "loss": 0.2979, + "step": 26738 + }, + { + "epoch": 0.78, + "grad_norm": 1.8108657449842651, + "learning_rate": 1.2638744253973207e-06, + "loss": 0.2791, + "step": 26739 + }, + { + "epoch": 0.78, + "grad_norm": 1.3934157318016462, + "learning_rate": 1.2635622877952546e-06, + "loss": 0.2607, + "step": 26740 + }, + { + "epoch": 0.78, + "grad_norm": 1.3649273718206716, + "learning_rate": 1.2632501831672933e-06, + "loss": 0.2639, + "step": 26741 + }, + { + "epoch": 0.78, + "grad_norm": 1.5171753879741872, + "learning_rate": 1.2629381115161904e-06, + "loss": 0.2599, + "step": 26742 + }, + { + "epoch": 0.78, + "grad_norm": 1.3004942716479109, + "learning_rate": 1.262626072844702e-06, + "loss": 0.2753, + "step": 26743 + }, + { + "epoch": 0.78, + "grad_norm": 0.9020876580444406, + "learning_rate": 1.2623140671555778e-06, + "loss": 0.5158, + "step": 26744 + }, + { + "epoch": 0.78, + "grad_norm": 1.460659979492746, + "learning_rate": 1.2620020944515744e-06, + "loss": 0.297, + "step": 26745 + }, + { + "epoch": 0.78, + "grad_norm": 1.3032946210867369, + "learning_rate": 1.2616901547354431e-06, + "loss": 0.2682, + "step": 26746 + }, + { + "epoch": 0.78, + "grad_norm": 1.4803183891645544, + "learning_rate": 1.261378248009938e-06, + "loss": 0.2635, + "step": 26747 + }, + { + "epoch": 0.78, + "grad_norm": 1.4815836923496368, + "learning_rate": 1.2610663742778112e-06, + "loss": 0.2713, + "step": 26748 + }, + { + "epoch": 0.78, + "grad_norm": 1.3826116432168372, + "learning_rate": 1.2607545335418154e-06, + "loss": 0.2848, + "step": 26749 + }, + { + "epoch": 0.78, + "grad_norm": 1.3433193955831093, + "learning_rate": 1.260442725804702e-06, + "loss": 0.2781, + "step": 26750 + }, + { + "epoch": 0.78, + "grad_norm": 1.4217053945091056, + "learning_rate": 1.2601309510692229e-06, + "loss": 0.2769, + "step": 26751 + }, + { + "epoch": 0.78, + "grad_norm": 1.7729167126044776, + "learning_rate": 1.2598192093381316e-06, + "loss": 0.2831, + "step": 26752 + }, + { + "epoch": 0.78, + "grad_norm": 1.2890372683743427, + "learning_rate": 1.2595075006141754e-06, + "loss": 0.2641, + "step": 26753 + }, + { + "epoch": 0.78, + "grad_norm": 1.599617622542487, + "learning_rate": 1.2591958249001074e-06, + "loss": 0.2885, + "step": 26754 + }, + { + "epoch": 0.78, + "grad_norm": 2.204200614043871, + "learning_rate": 1.2588841821986774e-06, + "loss": 0.2987, + "step": 26755 + }, + { + "epoch": 0.78, + "grad_norm": 1.3909449563475789, + "learning_rate": 1.2585725725126363e-06, + "loss": 0.2705, + "step": 26756 + }, + { + "epoch": 0.78, + "grad_norm": 1.4026975248219324, + "learning_rate": 1.2582609958447334e-06, + "loss": 0.2788, + "step": 26757 + }, + { + "epoch": 0.78, + "grad_norm": 1.2460718813341263, + "learning_rate": 1.2579494521977188e-06, + "loss": 0.2647, + "step": 26758 + }, + { + "epoch": 0.78, + "grad_norm": 1.318570308687134, + "learning_rate": 1.2576379415743416e-06, + "loss": 0.2827, + "step": 26759 + }, + { + "epoch": 0.78, + "grad_norm": 1.192048289140361, + "learning_rate": 1.257326463977353e-06, + "loss": 0.2659, + "step": 26760 + }, + { + "epoch": 0.78, + "grad_norm": 1.3092950473485123, + "learning_rate": 1.2570150194094982e-06, + "loss": 0.2825, + "step": 26761 + }, + { + "epoch": 0.78, + "grad_norm": 1.3626459753696125, + "learning_rate": 1.2567036078735273e-06, + "loss": 0.265, + "step": 26762 + }, + { + "epoch": 0.78, + "grad_norm": 1.517772140625172, + "learning_rate": 1.2563922293721903e-06, + "loss": 0.2764, + "step": 26763 + }, + { + "epoch": 0.78, + "grad_norm": 1.3227389750776508, + "learning_rate": 1.2560808839082316e-06, + "loss": 0.2803, + "step": 26764 + }, + { + "epoch": 0.78, + "grad_norm": 1.285374145230071, + "learning_rate": 1.255769571484401e-06, + "loss": 0.2879, + "step": 26765 + }, + { + "epoch": 0.78, + "grad_norm": 1.2244759528070337, + "learning_rate": 1.2554582921034454e-06, + "loss": 0.3049, + "step": 26766 + }, + { + "epoch": 0.78, + "grad_norm": 1.318661901311678, + "learning_rate": 1.255147045768112e-06, + "loss": 0.3099, + "step": 26767 + }, + { + "epoch": 0.78, + "grad_norm": 1.336851976596216, + "learning_rate": 1.2548358324811471e-06, + "loss": 0.2756, + "step": 26768 + }, + { + "epoch": 0.78, + "grad_norm": 1.261976340090373, + "learning_rate": 1.2545246522452974e-06, + "loss": 0.2556, + "step": 26769 + }, + { + "epoch": 0.78, + "grad_norm": 1.242100276584155, + "learning_rate": 1.2542135050633097e-06, + "loss": 0.2532, + "step": 26770 + }, + { + "epoch": 0.78, + "grad_norm": 1.4169806502329967, + "learning_rate": 1.2539023909379306e-06, + "loss": 0.2773, + "step": 26771 + }, + { + "epoch": 0.78, + "grad_norm": 1.3607621362450335, + "learning_rate": 1.2535913098719032e-06, + "loss": 0.2654, + "step": 26772 + }, + { + "epoch": 0.78, + "grad_norm": 1.2676893523649868, + "learning_rate": 1.253280261867974e-06, + "loss": 0.2845, + "step": 26773 + }, + { + "epoch": 0.78, + "grad_norm": 1.338054287290117, + "learning_rate": 1.2529692469288878e-06, + "loss": 0.2831, + "step": 26774 + }, + { + "epoch": 0.78, + "grad_norm": 1.2208399718925274, + "learning_rate": 1.2526582650573899e-06, + "loss": 0.2711, + "step": 26775 + }, + { + "epoch": 0.78, + "grad_norm": 1.342859205146989, + "learning_rate": 1.2523473162562238e-06, + "loss": 0.3073, + "step": 26776 + }, + { + "epoch": 0.78, + "grad_norm": 1.273229056330876, + "learning_rate": 1.252036400528135e-06, + "loss": 0.2836, + "step": 26777 + }, + { + "epoch": 0.78, + "grad_norm": 1.4498975920394435, + "learning_rate": 1.2517255178758659e-06, + "loss": 0.2724, + "step": 26778 + }, + { + "epoch": 0.78, + "grad_norm": 1.346574181526849, + "learning_rate": 1.2514146683021621e-06, + "loss": 0.2907, + "step": 26779 + }, + { + "epoch": 0.78, + "grad_norm": 1.2858937745554526, + "learning_rate": 1.2511038518097646e-06, + "loss": 0.2718, + "step": 26780 + }, + { + "epoch": 0.78, + "grad_norm": 1.4910366204441952, + "learning_rate": 1.250793068401417e-06, + "loss": 0.287, + "step": 26781 + }, + { + "epoch": 0.78, + "grad_norm": 1.3821064299368828, + "learning_rate": 1.2504823180798614e-06, + "loss": 0.2736, + "step": 26782 + }, + { + "epoch": 0.78, + "grad_norm": 1.243464177018255, + "learning_rate": 1.250171600847842e-06, + "loss": 0.2723, + "step": 26783 + }, + { + "epoch": 0.78, + "grad_norm": 1.4042478153182478, + "learning_rate": 1.2498609167080994e-06, + "loss": 0.2759, + "step": 26784 + }, + { + "epoch": 0.78, + "grad_norm": 1.2412479992433567, + "learning_rate": 1.2495502656633756e-06, + "loss": 0.2743, + "step": 26785 + }, + { + "epoch": 0.78, + "grad_norm": 1.566604924897376, + "learning_rate": 1.2492396477164126e-06, + "loss": 0.2803, + "step": 26786 + }, + { + "epoch": 0.78, + "grad_norm": 1.3920398408200265, + "learning_rate": 1.2489290628699513e-06, + "loss": 0.2679, + "step": 26787 + }, + { + "epoch": 0.78, + "grad_norm": 1.43305256181562, + "learning_rate": 1.2486185111267334e-06, + "loss": 0.2783, + "step": 26788 + }, + { + "epoch": 0.78, + "grad_norm": 1.3063942194162148, + "learning_rate": 1.2483079924894982e-06, + "loss": 0.2693, + "step": 26789 + }, + { + "epoch": 0.78, + "grad_norm": 1.2936053184762952, + "learning_rate": 1.2479975069609862e-06, + "loss": 0.2726, + "step": 26790 + }, + { + "epoch": 0.78, + "grad_norm": 1.2955166821827728, + "learning_rate": 1.2476870545439395e-06, + "loss": 0.2542, + "step": 26791 + }, + { + "epoch": 0.78, + "grad_norm": 1.3340529287387124, + "learning_rate": 1.2473766352410943e-06, + "loss": 0.2581, + "step": 26792 + }, + { + "epoch": 0.78, + "grad_norm": 1.1653931072698052, + "learning_rate": 1.2470662490551926e-06, + "loss": 0.2636, + "step": 26793 + }, + { + "epoch": 0.78, + "grad_norm": 1.2752607542948167, + "learning_rate": 1.2467558959889724e-06, + "loss": 0.2476, + "step": 26794 + }, + { + "epoch": 0.78, + "grad_norm": 1.2004947682982412, + "learning_rate": 1.2464455760451733e-06, + "loss": 0.2743, + "step": 26795 + }, + { + "epoch": 0.78, + "grad_norm": 1.2902569189755135, + "learning_rate": 1.2461352892265333e-06, + "loss": 0.2961, + "step": 26796 + }, + { + "epoch": 0.78, + "grad_norm": 1.4525931385593123, + "learning_rate": 1.245825035535791e-06, + "loss": 0.2792, + "step": 26797 + }, + { + "epoch": 0.78, + "grad_norm": 1.6430962179315602, + "learning_rate": 1.2455148149756862e-06, + "loss": 0.2927, + "step": 26798 + }, + { + "epoch": 0.78, + "grad_norm": 1.3078236634164677, + "learning_rate": 1.2452046275489533e-06, + "loss": 0.2737, + "step": 26799 + }, + { + "epoch": 0.78, + "grad_norm": 1.2712242712188038, + "learning_rate": 1.2448944732583308e-06, + "loss": 0.2717, + "step": 26800 + }, + { + "epoch": 0.78, + "grad_norm": 1.3973580832364296, + "learning_rate": 1.2445843521065564e-06, + "loss": 0.3111, + "step": 26801 + }, + { + "epoch": 0.78, + "grad_norm": 1.501383566411583, + "learning_rate": 1.2442742640963673e-06, + "loss": 0.2866, + "step": 26802 + }, + { + "epoch": 0.78, + "grad_norm": 1.2401407810975154, + "learning_rate": 1.2439642092304988e-06, + "loss": 0.2889, + "step": 26803 + }, + { + "epoch": 0.78, + "grad_norm": 1.329836038918549, + "learning_rate": 1.2436541875116881e-06, + "loss": 0.2978, + "step": 26804 + }, + { + "epoch": 0.78, + "grad_norm": 1.2628371947900714, + "learning_rate": 1.2433441989426709e-06, + "loss": 0.2838, + "step": 26805 + }, + { + "epoch": 0.78, + "grad_norm": 1.5089970687063117, + "learning_rate": 1.2430342435261828e-06, + "loss": 0.2738, + "step": 26806 + }, + { + "epoch": 0.78, + "grad_norm": 1.253108657284057, + "learning_rate": 1.2427243212649608e-06, + "loss": 0.2779, + "step": 26807 + }, + { + "epoch": 0.78, + "grad_norm": 1.3918284333970552, + "learning_rate": 1.2424144321617365e-06, + "loss": 0.291, + "step": 26808 + }, + { + "epoch": 0.78, + "grad_norm": 1.2431015905261427, + "learning_rate": 1.2421045762192469e-06, + "loss": 0.287, + "step": 26809 + }, + { + "epoch": 0.78, + "grad_norm": 1.4801278865888208, + "learning_rate": 1.2417947534402257e-06, + "loss": 0.2673, + "step": 26810 + }, + { + "epoch": 0.78, + "grad_norm": 1.3396271622288924, + "learning_rate": 1.241484963827408e-06, + "loss": 0.2671, + "step": 26811 + }, + { + "epoch": 0.78, + "grad_norm": 1.3409276585177017, + "learning_rate": 1.241175207383527e-06, + "loss": 0.2698, + "step": 26812 + }, + { + "epoch": 0.78, + "grad_norm": 1.3126063167019326, + "learning_rate": 1.240865484111316e-06, + "loss": 0.3405, + "step": 26813 + }, + { + "epoch": 0.78, + "grad_norm": 1.3141555978199342, + "learning_rate": 1.2405557940135093e-06, + "loss": 0.2721, + "step": 26814 + }, + { + "epoch": 0.78, + "grad_norm": 1.4119063924154578, + "learning_rate": 1.240246137092841e-06, + "loss": 0.2617, + "step": 26815 + }, + { + "epoch": 0.78, + "grad_norm": 1.327513387439551, + "learning_rate": 1.2399365133520408e-06, + "loss": 0.2669, + "step": 26816 + }, + { + "epoch": 0.78, + "grad_norm": 1.374381394971144, + "learning_rate": 1.2396269227938422e-06, + "loss": 0.2675, + "step": 26817 + }, + { + "epoch": 0.78, + "grad_norm": 1.467124280810515, + "learning_rate": 1.2393173654209778e-06, + "loss": 0.3064, + "step": 26818 + }, + { + "epoch": 0.78, + "grad_norm": 1.3011620434884015, + "learning_rate": 1.2390078412361796e-06, + "loss": 0.2609, + "step": 26819 + }, + { + "epoch": 0.78, + "grad_norm": 1.3487464199012138, + "learning_rate": 1.23869835024218e-06, + "loss": 0.2801, + "step": 26820 + }, + { + "epoch": 0.78, + "grad_norm": 1.3826769698339934, + "learning_rate": 1.2383888924417082e-06, + "loss": 0.2581, + "step": 26821 + }, + { + "epoch": 0.78, + "grad_norm": 1.3355386448641855, + "learning_rate": 1.2380794678374958e-06, + "loss": 0.2589, + "step": 26822 + }, + { + "epoch": 0.78, + "grad_norm": 1.3560913600229025, + "learning_rate": 1.2377700764322736e-06, + "loss": 0.2671, + "step": 26823 + }, + { + "epoch": 0.78, + "grad_norm": 2.753482893889406, + "learning_rate": 1.2374607182287728e-06, + "loss": 0.2994, + "step": 26824 + }, + { + "epoch": 0.78, + "grad_norm": 1.3498874960430214, + "learning_rate": 1.237151393229723e-06, + "loss": 0.2688, + "step": 26825 + }, + { + "epoch": 0.78, + "grad_norm": 1.3279331327110557, + "learning_rate": 1.2368421014378545e-06, + "loss": 0.2862, + "step": 26826 + }, + { + "epoch": 0.78, + "grad_norm": 1.434622420934388, + "learning_rate": 1.2365328428558953e-06, + "loss": 0.2868, + "step": 26827 + }, + { + "epoch": 0.78, + "grad_norm": 1.3579524698610517, + "learning_rate": 1.2362236174865755e-06, + "loss": 0.2847, + "step": 26828 + }, + { + "epoch": 0.78, + "grad_norm": 1.7995849991324622, + "learning_rate": 1.2359144253326244e-06, + "loss": 0.2727, + "step": 26829 + }, + { + "epoch": 0.78, + "grad_norm": 1.3782243358801283, + "learning_rate": 1.2356052663967694e-06, + "loss": 0.2861, + "step": 26830 + }, + { + "epoch": 0.78, + "grad_norm": 1.3065822778490555, + "learning_rate": 1.2352961406817404e-06, + "loss": 0.2654, + "step": 26831 + }, + { + "epoch": 0.78, + "grad_norm": 1.2996149231659613, + "learning_rate": 1.2349870481902642e-06, + "loss": 0.2765, + "step": 26832 + }, + { + "epoch": 0.78, + "grad_norm": 1.2667038128974684, + "learning_rate": 1.2346779889250692e-06, + "loss": 0.2765, + "step": 26833 + }, + { + "epoch": 0.78, + "grad_norm": 1.2874336971372733, + "learning_rate": 1.2343689628888845e-06, + "loss": 0.269, + "step": 26834 + }, + { + "epoch": 0.78, + "grad_norm": 1.3855533767234227, + "learning_rate": 1.2340599700844342e-06, + "loss": 0.2846, + "step": 26835 + }, + { + "epoch": 0.78, + "grad_norm": 1.258218619383033, + "learning_rate": 1.2337510105144458e-06, + "loss": 0.2646, + "step": 26836 + }, + { + "epoch": 0.78, + "grad_norm": 1.280607715249234, + "learning_rate": 1.233442084181647e-06, + "loss": 0.2923, + "step": 26837 + }, + { + "epoch": 0.78, + "grad_norm": 1.2823091802721511, + "learning_rate": 1.2331331910887635e-06, + "loss": 0.2763, + "step": 26838 + }, + { + "epoch": 0.78, + "grad_norm": 1.4839274218833316, + "learning_rate": 1.2328243312385208e-06, + "loss": 0.2677, + "step": 26839 + }, + { + "epoch": 0.78, + "grad_norm": 2.60551400892082, + "learning_rate": 1.2325155046336456e-06, + "loss": 0.2525, + "step": 26840 + }, + { + "epoch": 0.78, + "grad_norm": 1.2245518697221547, + "learning_rate": 1.2322067112768632e-06, + "loss": 0.2643, + "step": 26841 + }, + { + "epoch": 0.78, + "grad_norm": 1.21231871622409, + "learning_rate": 1.2318979511708979e-06, + "loss": 0.2568, + "step": 26842 + }, + { + "epoch": 0.78, + "grad_norm": 1.1896067564353354, + "learning_rate": 1.2315892243184762e-06, + "loss": 0.2607, + "step": 26843 + }, + { + "epoch": 0.78, + "grad_norm": 1.4338928541293459, + "learning_rate": 1.2312805307223208e-06, + "loss": 0.2561, + "step": 26844 + }, + { + "epoch": 0.78, + "grad_norm": 1.6076604645614665, + "learning_rate": 1.2309718703851558e-06, + "loss": 0.2783, + "step": 26845 + }, + { + "epoch": 0.78, + "grad_norm": 1.492320357501363, + "learning_rate": 1.230663243309706e-06, + "loss": 0.2978, + "step": 26846 + }, + { + "epoch": 0.78, + "grad_norm": 1.5621568983887093, + "learning_rate": 1.2303546494986951e-06, + "loss": 0.2613, + "step": 26847 + }, + { + "epoch": 0.78, + "grad_norm": 1.312971304121399, + "learning_rate": 1.2300460889548477e-06, + "loss": 0.2772, + "step": 26848 + }, + { + "epoch": 0.78, + "grad_norm": 1.3747308004759062, + "learning_rate": 1.2297375616808837e-06, + "loss": 0.2753, + "step": 26849 + }, + { + "epoch": 0.78, + "grad_norm": 1.3993354277552943, + "learning_rate": 1.2294290676795279e-06, + "loss": 0.2679, + "step": 26850 + }, + { + "epoch": 0.78, + "grad_norm": 1.2871428021728712, + "learning_rate": 1.2291206069535022e-06, + "loss": 0.2832, + "step": 26851 + }, + { + "epoch": 0.78, + "grad_norm": 1.3365789095898053, + "learning_rate": 1.2288121795055286e-06, + "loss": 0.2717, + "step": 26852 + }, + { + "epoch": 0.78, + "grad_norm": 1.442751977483601, + "learning_rate": 1.2285037853383315e-06, + "loss": 0.3252, + "step": 26853 + }, + { + "epoch": 0.78, + "grad_norm": 1.3363210101195198, + "learning_rate": 1.2281954244546285e-06, + "loss": 0.2616, + "step": 26854 + }, + { + "epoch": 0.78, + "grad_norm": 1.5946687488009696, + "learning_rate": 1.2278870968571426e-06, + "loss": 0.2792, + "step": 26855 + }, + { + "epoch": 0.78, + "grad_norm": 1.2597423394280411, + "learning_rate": 1.2275788025485952e-06, + "loss": 0.2781, + "step": 26856 + }, + { + "epoch": 0.78, + "grad_norm": 1.5476537952664284, + "learning_rate": 1.2272705415317066e-06, + "loss": 0.2683, + "step": 26857 + }, + { + "epoch": 0.78, + "grad_norm": 1.7492851248903476, + "learning_rate": 1.2269623138091973e-06, + "loss": 0.2729, + "step": 26858 + }, + { + "epoch": 0.78, + "grad_norm": 1.2311862172186407, + "learning_rate": 1.2266541193837872e-06, + "loss": 0.2699, + "step": 26859 + }, + { + "epoch": 0.78, + "grad_norm": 1.3210777672995555, + "learning_rate": 1.2263459582581967e-06, + "loss": 0.2681, + "step": 26860 + }, + { + "epoch": 0.78, + "grad_norm": 1.4144149109270867, + "learning_rate": 1.226037830435145e-06, + "loss": 0.2662, + "step": 26861 + }, + { + "epoch": 0.78, + "grad_norm": 1.2621475411901997, + "learning_rate": 1.2257297359173525e-06, + "loss": 0.281, + "step": 26862 + }, + { + "epoch": 0.78, + "grad_norm": 1.3652806607857926, + "learning_rate": 1.225421674707536e-06, + "loss": 0.2816, + "step": 26863 + }, + { + "epoch": 0.78, + "grad_norm": 1.6447314686590935, + "learning_rate": 1.2251136468084147e-06, + "loss": 0.2867, + "step": 26864 + }, + { + "epoch": 0.78, + "grad_norm": 1.3026032427864296, + "learning_rate": 1.224805652222707e-06, + "loss": 0.2895, + "step": 26865 + }, + { + "epoch": 0.78, + "grad_norm": 3.293319764275659, + "learning_rate": 1.224497690953132e-06, + "loss": 0.2798, + "step": 26866 + }, + { + "epoch": 0.78, + "grad_norm": 1.2612129740133189, + "learning_rate": 1.2241897630024063e-06, + "loss": 0.2889, + "step": 26867 + }, + { + "epoch": 0.78, + "grad_norm": 1.5562484681201298, + "learning_rate": 1.223881868373248e-06, + "loss": 0.2864, + "step": 26868 + }, + { + "epoch": 0.78, + "grad_norm": 1.2089589522888107, + "learning_rate": 1.2235740070683738e-06, + "loss": 0.2615, + "step": 26869 + }, + { + "epoch": 0.78, + "grad_norm": 1.5921463667139248, + "learning_rate": 1.2232661790905025e-06, + "loss": 0.279, + "step": 26870 + }, + { + "epoch": 0.78, + "grad_norm": 1.2406640449830832, + "learning_rate": 1.2229583844423481e-06, + "loss": 0.2781, + "step": 26871 + }, + { + "epoch": 0.78, + "grad_norm": 1.3705193888964124, + "learning_rate": 1.2226506231266273e-06, + "loss": 0.2577, + "step": 26872 + }, + { + "epoch": 0.78, + "grad_norm": 1.549722653043424, + "learning_rate": 1.222342895146057e-06, + "loss": 0.2786, + "step": 26873 + }, + { + "epoch": 0.78, + "grad_norm": 1.264922486738276, + "learning_rate": 1.2220352005033526e-06, + "loss": 0.2906, + "step": 26874 + }, + { + "epoch": 0.78, + "grad_norm": 1.4012654004942875, + "learning_rate": 1.221727539201229e-06, + "loss": 0.2616, + "step": 26875 + }, + { + "epoch": 0.78, + "grad_norm": 1.50003709736143, + "learning_rate": 1.2214199112424037e-06, + "loss": 0.2829, + "step": 26876 + }, + { + "epoch": 0.78, + "grad_norm": 3.150997429234501, + "learning_rate": 1.221112316629588e-06, + "loss": 0.2886, + "step": 26877 + }, + { + "epoch": 0.78, + "grad_norm": 1.3193145931356605, + "learning_rate": 1.220804755365498e-06, + "loss": 0.2797, + "step": 26878 + }, + { + "epoch": 0.78, + "grad_norm": 1.5667059394796188, + "learning_rate": 1.2204972274528475e-06, + "loss": 0.2843, + "step": 26879 + }, + { + "epoch": 0.78, + "grad_norm": 1.5080211993978148, + "learning_rate": 1.2201897328943512e-06, + "loss": 0.2737, + "step": 26880 + }, + { + "epoch": 0.78, + "grad_norm": 1.9549928443394367, + "learning_rate": 1.2198822716927244e-06, + "loss": 0.285, + "step": 26881 + }, + { + "epoch": 0.78, + "grad_norm": 1.2823610907487084, + "learning_rate": 1.2195748438506765e-06, + "loss": 0.2789, + "step": 26882 + }, + { + "epoch": 0.78, + "grad_norm": 1.5016189503376824, + "learning_rate": 1.2192674493709228e-06, + "loss": 0.2694, + "step": 26883 + }, + { + "epoch": 0.78, + "grad_norm": 1.5285554860545265, + "learning_rate": 1.2189600882561753e-06, + "loss": 0.2543, + "step": 26884 + }, + { + "epoch": 0.78, + "grad_norm": 1.4873887052972294, + "learning_rate": 1.2186527605091475e-06, + "loss": 0.2915, + "step": 26885 + }, + { + "epoch": 0.78, + "grad_norm": 1.5184193012529943, + "learning_rate": 1.2183454661325511e-06, + "loss": 0.2824, + "step": 26886 + }, + { + "epoch": 0.78, + "grad_norm": 1.2149020980745102, + "learning_rate": 1.2180382051290974e-06, + "loss": 0.2679, + "step": 26887 + }, + { + "epoch": 0.78, + "grad_norm": 7.360984470017626, + "learning_rate": 1.217730977501499e-06, + "loss": 0.29, + "step": 26888 + }, + { + "epoch": 0.78, + "grad_norm": 3.533360508339315, + "learning_rate": 1.2174237832524677e-06, + "loss": 0.2765, + "step": 26889 + }, + { + "epoch": 0.78, + "grad_norm": 2.092519850391731, + "learning_rate": 1.2171166223847125e-06, + "loss": 0.2626, + "step": 26890 + }, + { + "epoch": 0.78, + "grad_norm": 1.4977506279311847, + "learning_rate": 1.2168094949009445e-06, + "loss": 0.2556, + "step": 26891 + }, + { + "epoch": 0.78, + "grad_norm": 1.8440890950861646, + "learning_rate": 1.216502400803875e-06, + "loss": 0.2978, + "step": 26892 + }, + { + "epoch": 0.78, + "grad_norm": 1.938854660624656, + "learning_rate": 1.216195340096214e-06, + "loss": 0.2817, + "step": 26893 + }, + { + "epoch": 0.78, + "grad_norm": 1.3681243099881624, + "learning_rate": 1.215888312780671e-06, + "loss": 0.2969, + "step": 26894 + }, + { + "epoch": 0.78, + "grad_norm": 1.227132752281576, + "learning_rate": 1.2155813188599551e-06, + "loss": 0.2652, + "step": 26895 + }, + { + "epoch": 0.78, + "grad_norm": 1.1991716044190996, + "learning_rate": 1.2152743583367765e-06, + "loss": 0.2704, + "step": 26896 + }, + { + "epoch": 0.78, + "grad_norm": 1.434001153596438, + "learning_rate": 1.2149674312138438e-06, + "loss": 0.2965, + "step": 26897 + }, + { + "epoch": 0.78, + "grad_norm": 1.955980813217563, + "learning_rate": 1.2146605374938669e-06, + "loss": 0.2791, + "step": 26898 + }, + { + "epoch": 0.78, + "grad_norm": 1.6999364525160114, + "learning_rate": 1.214353677179551e-06, + "loss": 0.2942, + "step": 26899 + }, + { + "epoch": 0.78, + "grad_norm": 1.239579212663828, + "learning_rate": 1.2140468502736063e-06, + "loss": 0.2894, + "step": 26900 + }, + { + "epoch": 0.78, + "grad_norm": 1.440129807265268, + "learning_rate": 1.21374005677874e-06, + "loss": 0.2805, + "step": 26901 + }, + { + "epoch": 0.78, + "grad_norm": 1.5542545585867384, + "learning_rate": 1.2134332966976593e-06, + "loss": 0.2898, + "step": 26902 + }, + { + "epoch": 0.78, + "grad_norm": 1.2727471208709658, + "learning_rate": 1.2131265700330725e-06, + "loss": 0.2803, + "step": 26903 + }, + { + "epoch": 0.78, + "grad_norm": 1.4169340540921822, + "learning_rate": 1.2128198767876864e-06, + "loss": 0.2769, + "step": 26904 + }, + { + "epoch": 0.78, + "grad_norm": 1.797338547787574, + "learning_rate": 1.2125132169642057e-06, + "loss": 0.278, + "step": 26905 + }, + { + "epoch": 0.78, + "grad_norm": 1.2863127031359716, + "learning_rate": 1.212206590565338e-06, + "loss": 0.2896, + "step": 26906 + }, + { + "epoch": 0.78, + "grad_norm": 1.4700384787617973, + "learning_rate": 1.2118999975937895e-06, + "loss": 0.2749, + "step": 26907 + }, + { + "epoch": 0.78, + "grad_norm": 1.2854898790349454, + "learning_rate": 1.2115934380522665e-06, + "loss": 0.2813, + "step": 26908 + }, + { + "epoch": 0.78, + "grad_norm": 1.3560410834679804, + "learning_rate": 1.2112869119434723e-06, + "loss": 0.2803, + "step": 26909 + }, + { + "epoch": 0.78, + "grad_norm": 1.3440008287075416, + "learning_rate": 1.210980419270113e-06, + "loss": 0.2752, + "step": 26910 + }, + { + "epoch": 0.78, + "grad_norm": 1.3090738579885752, + "learning_rate": 1.2106739600348932e-06, + "loss": 0.2647, + "step": 26911 + }, + { + "epoch": 0.78, + "grad_norm": 1.2809186984072052, + "learning_rate": 1.210367534240518e-06, + "loss": 0.2782, + "step": 26912 + }, + { + "epoch": 0.78, + "grad_norm": 1.26225772480524, + "learning_rate": 1.2100611418896913e-06, + "loss": 0.2906, + "step": 26913 + }, + { + "epoch": 0.78, + "grad_norm": 1.3140648842320337, + "learning_rate": 1.2097547829851174e-06, + "loss": 0.2821, + "step": 26914 + }, + { + "epoch": 0.78, + "grad_norm": 0.926978291666052, + "learning_rate": 1.209448457529499e-06, + "loss": 0.5831, + "step": 26915 + }, + { + "epoch": 0.78, + "grad_norm": 1.2764650402360698, + "learning_rate": 1.2091421655255403e-06, + "loss": 0.2591, + "step": 26916 + }, + { + "epoch": 0.78, + "grad_norm": 1.6079950381620514, + "learning_rate": 1.2088359069759458e-06, + "loss": 0.2687, + "step": 26917 + }, + { + "epoch": 0.78, + "grad_norm": 1.4122069270178492, + "learning_rate": 1.2085296818834146e-06, + "loss": 0.2825, + "step": 26918 + }, + { + "epoch": 0.78, + "grad_norm": 1.3702042166314994, + "learning_rate": 1.2082234902506512e-06, + "loss": 0.2844, + "step": 26919 + }, + { + "epoch": 0.78, + "grad_norm": 1.3500434385403834, + "learning_rate": 1.2079173320803577e-06, + "loss": 0.2883, + "step": 26920 + }, + { + "epoch": 0.78, + "grad_norm": 1.2446620077701083, + "learning_rate": 1.2076112073752356e-06, + "loss": 0.2849, + "step": 26921 + }, + { + "epoch": 0.78, + "grad_norm": 1.3769735702224135, + "learning_rate": 1.2073051161379867e-06, + "loss": 0.2754, + "step": 26922 + }, + { + "epoch": 0.78, + "grad_norm": 1.4227735578834284, + "learning_rate": 1.2069990583713126e-06, + "loss": 0.2655, + "step": 26923 + }, + { + "epoch": 0.78, + "grad_norm": 1.5483731539289411, + "learning_rate": 1.2066930340779132e-06, + "loss": 0.2716, + "step": 26924 + }, + { + "epoch": 0.78, + "grad_norm": 1.44630170842318, + "learning_rate": 1.2063870432604918e-06, + "loss": 0.2898, + "step": 26925 + }, + { + "epoch": 0.78, + "grad_norm": 1.2742220353871752, + "learning_rate": 1.2060810859217454e-06, + "loss": 0.2609, + "step": 26926 + }, + { + "epoch": 0.78, + "grad_norm": 1.3287610130084675, + "learning_rate": 1.2057751620643748e-06, + "loss": 0.2827, + "step": 26927 + }, + { + "epoch": 0.78, + "grad_norm": 1.259876831797028, + "learning_rate": 1.205469271691081e-06, + "loss": 0.3167, + "step": 26928 + }, + { + "epoch": 0.78, + "grad_norm": 1.460687720753371, + "learning_rate": 1.2051634148045633e-06, + "loss": 0.2777, + "step": 26929 + }, + { + "epoch": 0.78, + "grad_norm": 1.4207040281249677, + "learning_rate": 1.20485759140752e-06, + "loss": 0.2876, + "step": 26930 + }, + { + "epoch": 0.78, + "grad_norm": 1.2517750032602493, + "learning_rate": 1.2045518015026508e-06, + "loss": 0.282, + "step": 26931 + }, + { + "epoch": 0.78, + "grad_norm": 1.6865396515393787, + "learning_rate": 1.2042460450926558e-06, + "loss": 0.3193, + "step": 26932 + }, + { + "epoch": 0.78, + "grad_norm": 1.7875942266972131, + "learning_rate": 1.2039403221802297e-06, + "loss": 0.2679, + "step": 26933 + }, + { + "epoch": 0.78, + "grad_norm": 1.4931516735616122, + "learning_rate": 1.2036346327680726e-06, + "loss": 0.3154, + "step": 26934 + }, + { + "epoch": 0.78, + "grad_norm": 1.3537034561237098, + "learning_rate": 1.203328976858882e-06, + "loss": 0.2645, + "step": 26935 + }, + { + "epoch": 0.78, + "grad_norm": 1.3913815966359204, + "learning_rate": 1.2030233544553566e-06, + "loss": 0.2943, + "step": 26936 + }, + { + "epoch": 0.78, + "grad_norm": 1.6068696588819458, + "learning_rate": 1.2027177655601906e-06, + "loss": 0.2783, + "step": 26937 + }, + { + "epoch": 0.78, + "grad_norm": 1.275535182487264, + "learning_rate": 1.2024122101760826e-06, + "loss": 0.2833, + "step": 26938 + }, + { + "epoch": 0.78, + "grad_norm": 1.2960095883308187, + "learning_rate": 1.2021066883057291e-06, + "loss": 0.2842, + "step": 26939 + }, + { + "epoch": 0.78, + "grad_norm": 1.4916806962334535, + "learning_rate": 1.2018011999518258e-06, + "loss": 0.2822, + "step": 26940 + }, + { + "epoch": 0.78, + "grad_norm": 2.793138569426439, + "learning_rate": 1.2014957451170695e-06, + "loss": 0.2696, + "step": 26941 + }, + { + "epoch": 0.78, + "grad_norm": 1.446718534398195, + "learning_rate": 1.201190323804155e-06, + "loss": 0.2806, + "step": 26942 + }, + { + "epoch": 0.78, + "grad_norm": 1.2784572915082484, + "learning_rate": 1.2008849360157787e-06, + "loss": 0.2545, + "step": 26943 + }, + { + "epoch": 0.78, + "grad_norm": 1.3541653793206863, + "learning_rate": 1.2005795817546356e-06, + "loss": 0.2681, + "step": 26944 + }, + { + "epoch": 0.78, + "grad_norm": 1.725174704168746, + "learning_rate": 1.200274261023419e-06, + "loss": 0.2677, + "step": 26945 + }, + { + "epoch": 0.78, + "grad_norm": 1.8798693102210526, + "learning_rate": 1.199968973824824e-06, + "loss": 0.2771, + "step": 26946 + }, + { + "epoch": 0.78, + "grad_norm": 1.2982647719783407, + "learning_rate": 1.1996637201615447e-06, + "loss": 0.2604, + "step": 26947 + }, + { + "epoch": 0.78, + "grad_norm": 1.2803454230748395, + "learning_rate": 1.1993585000362756e-06, + "loss": 0.309, + "step": 26948 + }, + { + "epoch": 0.78, + "grad_norm": 1.5486357663895487, + "learning_rate": 1.1990533134517096e-06, + "loss": 0.2701, + "step": 26949 + }, + { + "epoch": 0.78, + "grad_norm": 1.3875656179537392, + "learning_rate": 1.1987481604105406e-06, + "loss": 0.2847, + "step": 26950 + }, + { + "epoch": 0.78, + "grad_norm": 1.286231231708185, + "learning_rate": 1.1984430409154612e-06, + "loss": 0.2853, + "step": 26951 + }, + { + "epoch": 0.78, + "grad_norm": 1.4072909359240569, + "learning_rate": 1.1981379549691641e-06, + "loss": 0.2791, + "step": 26952 + }, + { + "epoch": 0.78, + "grad_norm": 1.2221989659492505, + "learning_rate": 1.1978329025743429e-06, + "loss": 0.2749, + "step": 26953 + }, + { + "epoch": 0.78, + "grad_norm": 2.103276831376991, + "learning_rate": 1.1975278837336872e-06, + "loss": 0.2707, + "step": 26954 + }, + { + "epoch": 0.78, + "grad_norm": 1.2492053933184253, + "learning_rate": 1.1972228984498908e-06, + "loss": 0.2838, + "step": 26955 + }, + { + "epoch": 0.78, + "grad_norm": 1.358130110556376, + "learning_rate": 1.1969179467256436e-06, + "loss": 0.2777, + "step": 26956 + }, + { + "epoch": 0.78, + "grad_norm": 1.2176085690238105, + "learning_rate": 1.196613028563638e-06, + "loss": 0.2634, + "step": 26957 + }, + { + "epoch": 0.78, + "grad_norm": 1.4072899963351375, + "learning_rate": 1.1963081439665646e-06, + "loss": 0.2636, + "step": 26958 + }, + { + "epoch": 0.78, + "grad_norm": 1.5971967965077818, + "learning_rate": 1.196003292937114e-06, + "loss": 0.2696, + "step": 26959 + }, + { + "epoch": 0.78, + "grad_norm": 1.6653646126968906, + "learning_rate": 1.1956984754779783e-06, + "loss": 0.2849, + "step": 26960 + }, + { + "epoch": 0.78, + "grad_norm": 1.2610109196505408, + "learning_rate": 1.195393691591844e-06, + "loss": 0.2648, + "step": 26961 + }, + { + "epoch": 0.78, + "grad_norm": 1.651791101684331, + "learning_rate": 1.1950889412814026e-06, + "loss": 0.2733, + "step": 26962 + }, + { + "epoch": 0.78, + "grad_norm": 1.528153341406491, + "learning_rate": 1.1947842245493446e-06, + "loss": 0.2699, + "step": 26963 + }, + { + "epoch": 0.78, + "grad_norm": 1.557600864419238, + "learning_rate": 1.1944795413983572e-06, + "loss": 0.276, + "step": 26964 + }, + { + "epoch": 0.78, + "grad_norm": 1.654425401134337, + "learning_rate": 1.1941748918311297e-06, + "loss": 0.2786, + "step": 26965 + }, + { + "epoch": 0.78, + "grad_norm": 1.2235099384956885, + "learning_rate": 1.1938702758503512e-06, + "loss": 0.2768, + "step": 26966 + }, + { + "epoch": 0.78, + "grad_norm": 1.5411113096116675, + "learning_rate": 1.1935656934587097e-06, + "loss": 0.2767, + "step": 26967 + }, + { + "epoch": 0.78, + "grad_norm": 1.1876072321324418, + "learning_rate": 1.1932611446588931e-06, + "loss": 0.2714, + "step": 26968 + }, + { + "epoch": 0.78, + "grad_norm": 1.766407206236602, + "learning_rate": 1.1929566294535893e-06, + "loss": 0.2748, + "step": 26969 + }, + { + "epoch": 0.78, + "grad_norm": 1.69971443247284, + "learning_rate": 1.192652147845485e-06, + "loss": 0.2667, + "step": 26970 + }, + { + "epoch": 0.78, + "grad_norm": 1.5845203783659099, + "learning_rate": 1.1923476998372685e-06, + "loss": 0.2828, + "step": 26971 + }, + { + "epoch": 0.78, + "grad_norm": 1.3628618295325798, + "learning_rate": 1.1920432854316266e-06, + "loss": 0.25, + "step": 26972 + }, + { + "epoch": 0.78, + "grad_norm": 1.3025758034036525, + "learning_rate": 1.191738904631244e-06, + "loss": 0.2646, + "step": 26973 + }, + { + "epoch": 0.78, + "grad_norm": 1.2661202481880187, + "learning_rate": 1.1914345574388076e-06, + "loss": 0.2691, + "step": 26974 + }, + { + "epoch": 0.78, + "grad_norm": 2.0465068583200705, + "learning_rate": 1.1911302438570032e-06, + "loss": 0.2608, + "step": 26975 + }, + { + "epoch": 0.78, + "grad_norm": 1.4945236268613316, + "learning_rate": 1.1908259638885168e-06, + "loss": 0.2843, + "step": 26976 + }, + { + "epoch": 0.78, + "grad_norm": 1.4376365946155232, + "learning_rate": 1.1905217175360333e-06, + "loss": 0.261, + "step": 26977 + }, + { + "epoch": 0.78, + "grad_norm": 1.2681616131653486, + "learning_rate": 1.1902175048022385e-06, + "loss": 0.2638, + "step": 26978 + }, + { + "epoch": 0.78, + "grad_norm": 1.3687981607942377, + "learning_rate": 1.189913325689816e-06, + "loss": 0.2678, + "step": 26979 + }, + { + "epoch": 0.78, + "grad_norm": 1.3179492008909814, + "learning_rate": 1.1896091802014525e-06, + "loss": 0.2899, + "step": 26980 + }, + { + "epoch": 0.78, + "grad_norm": 1.3005920385042717, + "learning_rate": 1.1893050683398283e-06, + "loss": 0.2676, + "step": 26981 + }, + { + "epoch": 0.78, + "grad_norm": 1.615653332766301, + "learning_rate": 1.1890009901076295e-06, + "loss": 0.3061, + "step": 26982 + }, + { + "epoch": 0.78, + "grad_norm": 1.245556067144229, + "learning_rate": 1.1886969455075392e-06, + "loss": 0.2808, + "step": 26983 + }, + { + "epoch": 0.78, + "grad_norm": 1.5579795865600439, + "learning_rate": 1.1883929345422408e-06, + "loss": 0.2713, + "step": 26984 + }, + { + "epoch": 0.78, + "grad_norm": 1.4506606834194236, + "learning_rate": 1.1880889572144166e-06, + "loss": 0.2657, + "step": 26985 + }, + { + "epoch": 0.78, + "grad_norm": 2.7750588785833923, + "learning_rate": 1.18778501352675e-06, + "loss": 0.2751, + "step": 26986 + }, + { + "epoch": 0.78, + "grad_norm": 1.3722993143755964, + "learning_rate": 1.1874811034819228e-06, + "loss": 0.2748, + "step": 26987 + }, + { + "epoch": 0.78, + "grad_norm": 1.7550284360316413, + "learning_rate": 1.1871772270826187e-06, + "loss": 0.2948, + "step": 26988 + }, + { + "epoch": 0.78, + "grad_norm": 1.6114221908616164, + "learning_rate": 1.1868733843315167e-06, + "loss": 0.2614, + "step": 26989 + }, + { + "epoch": 0.78, + "grad_norm": 1.2453544601372517, + "learning_rate": 1.1865695752313e-06, + "loss": 0.2685, + "step": 26990 + }, + { + "epoch": 0.78, + "grad_norm": 1.4486965734406514, + "learning_rate": 1.1862657997846482e-06, + "loss": 0.2704, + "step": 26991 + }, + { + "epoch": 0.78, + "grad_norm": 1.4792354061775739, + "learning_rate": 1.1859620579942433e-06, + "loss": 0.2693, + "step": 26992 + }, + { + "epoch": 0.78, + "grad_norm": 1.3019514951084432, + "learning_rate": 1.185658349862765e-06, + "loss": 0.2704, + "step": 26993 + }, + { + "epoch": 0.78, + "grad_norm": 1.4991379764954116, + "learning_rate": 1.1853546753928947e-06, + "loss": 0.2627, + "step": 26994 + }, + { + "epoch": 0.78, + "grad_norm": 1.441200579577504, + "learning_rate": 1.1850510345873112e-06, + "loss": 0.3088, + "step": 26995 + }, + { + "epoch": 0.78, + "grad_norm": 0.9742005352437239, + "learning_rate": 1.1847474274486948e-06, + "loss": 0.5794, + "step": 26996 + }, + { + "epoch": 0.78, + "grad_norm": 1.6729879116905138, + "learning_rate": 1.1844438539797242e-06, + "loss": 0.2868, + "step": 26997 + }, + { + "epoch": 0.78, + "grad_norm": 1.363513215221934, + "learning_rate": 1.1841403141830793e-06, + "loss": 0.2749, + "step": 26998 + }, + { + "epoch": 0.78, + "grad_norm": 1.2673773248653621, + "learning_rate": 1.1838368080614405e-06, + "loss": 0.2746, + "step": 26999 + }, + { + "epoch": 0.78, + "grad_norm": 1.37432081416825, + "learning_rate": 1.1835333356174816e-06, + "loss": 0.2783, + "step": 27000 + }, + { + "epoch": 0.78, + "grad_norm": 2.827856434196856, + "learning_rate": 1.1832298968538842e-06, + "loss": 0.2808, + "step": 27001 + }, + { + "epoch": 0.78, + "grad_norm": 1.3058843209651916, + "learning_rate": 1.1829264917733252e-06, + "loss": 0.2643, + "step": 27002 + }, + { + "epoch": 0.78, + "grad_norm": 1.3187743939152046, + "learning_rate": 1.1826231203784822e-06, + "loss": 0.255, + "step": 27003 + }, + { + "epoch": 0.78, + "grad_norm": 1.2362166321497732, + "learning_rate": 1.1823197826720322e-06, + "loss": 0.2627, + "step": 27004 + }, + { + "epoch": 0.78, + "grad_norm": 1.2544243035222875, + "learning_rate": 1.182016478656653e-06, + "loss": 0.2748, + "step": 27005 + }, + { + "epoch": 0.78, + "grad_norm": 1.3721893685365445, + "learning_rate": 1.1817132083350203e-06, + "loss": 0.2923, + "step": 27006 + }, + { + "epoch": 0.78, + "grad_norm": 1.4270012970639419, + "learning_rate": 1.181409971709811e-06, + "loss": 0.262, + "step": 27007 + }, + { + "epoch": 0.78, + "grad_norm": 1.2503659358995325, + "learning_rate": 1.1811067687837024e-06, + "loss": 0.2694, + "step": 27008 + }, + { + "epoch": 0.78, + "grad_norm": 1.713619478866005, + "learning_rate": 1.1808035995593674e-06, + "loss": 0.2635, + "step": 27009 + }, + { + "epoch": 0.78, + "grad_norm": 1.8742979609053763, + "learning_rate": 1.1805004640394835e-06, + "loss": 0.2903, + "step": 27010 + }, + { + "epoch": 0.78, + "grad_norm": 1.2612425986850888, + "learning_rate": 1.1801973622267248e-06, + "loss": 0.2585, + "step": 27011 + }, + { + "epoch": 0.78, + "grad_norm": 1.3271013789534, + "learning_rate": 1.1798942941237674e-06, + "loss": 0.2919, + "step": 27012 + }, + { + "epoch": 0.78, + "grad_norm": 1.1679698631020392, + "learning_rate": 1.1795912597332847e-06, + "loss": 0.2587, + "step": 27013 + }, + { + "epoch": 0.78, + "grad_norm": 1.299611495950018, + "learning_rate": 1.1792882590579518e-06, + "loss": 0.2632, + "step": 27014 + }, + { + "epoch": 0.78, + "grad_norm": 1.3776445348733695, + "learning_rate": 1.1789852921004419e-06, + "loss": 0.2979, + "step": 27015 + }, + { + "epoch": 0.78, + "grad_norm": 1.429654801247419, + "learning_rate": 1.178682358863431e-06, + "loss": 0.2679, + "step": 27016 + }, + { + "epoch": 0.78, + "grad_norm": 1.3486781926473763, + "learning_rate": 1.1783794593495895e-06, + "loss": 0.2606, + "step": 27017 + }, + { + "epoch": 0.78, + "grad_norm": 1.538025270726056, + "learning_rate": 1.178076593561593e-06, + "loss": 0.2847, + "step": 27018 + }, + { + "epoch": 0.78, + "grad_norm": 1.3971574770267043, + "learning_rate": 1.1777737615021112e-06, + "loss": 0.2705, + "step": 27019 + }, + { + "epoch": 0.78, + "grad_norm": 1.458677248338973, + "learning_rate": 1.1774709631738185e-06, + "loss": 0.2827, + "step": 27020 + }, + { + "epoch": 0.78, + "grad_norm": 1.3568833566035146, + "learning_rate": 1.177168198579387e-06, + "loss": 0.2679, + "step": 27021 + }, + { + "epoch": 0.78, + "grad_norm": 1.768511670232079, + "learning_rate": 1.1768654677214886e-06, + "loss": 0.2988, + "step": 27022 + }, + { + "epoch": 0.78, + "grad_norm": 1.613075138991063, + "learning_rate": 1.176562770602795e-06, + "loss": 0.2871, + "step": 27023 + }, + { + "epoch": 0.78, + "grad_norm": 3.8744044092455847, + "learning_rate": 1.1762601072259766e-06, + "loss": 0.275, + "step": 27024 + }, + { + "epoch": 0.78, + "grad_norm": 1.7347951145716125, + "learning_rate": 1.175957477593706e-06, + "loss": 0.2619, + "step": 27025 + }, + { + "epoch": 0.78, + "grad_norm": 1.6123607781428295, + "learning_rate": 1.1756548817086544e-06, + "loss": 0.257, + "step": 27026 + }, + { + "epoch": 0.78, + "grad_norm": 1.3749602435898103, + "learning_rate": 1.1753523195734889e-06, + "loss": 0.2917, + "step": 27027 + }, + { + "epoch": 0.78, + "grad_norm": 1.3704621191114112, + "learning_rate": 1.1750497911908821e-06, + "loss": 0.2582, + "step": 27028 + }, + { + "epoch": 0.78, + "grad_norm": 0.9426243895839025, + "learning_rate": 1.1747472965635033e-06, + "loss": 0.5618, + "step": 27029 + }, + { + "epoch": 0.78, + "grad_norm": 1.4443746936989776, + "learning_rate": 1.174444835694022e-06, + "loss": 0.2843, + "step": 27030 + }, + { + "epoch": 0.78, + "grad_norm": 1.1585966063667952, + "learning_rate": 1.1741424085851077e-06, + "loss": 0.2578, + "step": 27031 + }, + { + "epoch": 0.78, + "grad_norm": 1.4210559143923134, + "learning_rate": 1.1738400152394286e-06, + "loss": 0.2773, + "step": 27032 + }, + { + "epoch": 0.78, + "grad_norm": 1.2282979122727582, + "learning_rate": 1.173537655659654e-06, + "loss": 0.2757, + "step": 27033 + }, + { + "epoch": 0.78, + "grad_norm": 1.2674781112126532, + "learning_rate": 1.1732353298484521e-06, + "loss": 0.2663, + "step": 27034 + }, + { + "epoch": 0.78, + "grad_norm": 1.3856393422099853, + "learning_rate": 1.1729330378084924e-06, + "loss": 0.289, + "step": 27035 + }, + { + "epoch": 0.78, + "grad_norm": 1.9833078962866861, + "learning_rate": 1.1726307795424396e-06, + "loss": 0.2623, + "step": 27036 + }, + { + "epoch": 0.78, + "grad_norm": 1.4718691906160961, + "learning_rate": 1.1723285550529623e-06, + "loss": 0.2986, + "step": 27037 + }, + { + "epoch": 0.78, + "grad_norm": 1.281303571127593, + "learning_rate": 1.1720263643427283e-06, + "loss": 0.2686, + "step": 27038 + }, + { + "epoch": 0.78, + "grad_norm": 1.3531074189463297, + "learning_rate": 1.1717242074144043e-06, + "loss": 0.2701, + "step": 27039 + }, + { + "epoch": 0.78, + "grad_norm": 1.229361626049812, + "learning_rate": 1.1714220842706564e-06, + "loss": 0.2569, + "step": 27040 + }, + { + "epoch": 0.78, + "grad_norm": 1.3466020928277564, + "learning_rate": 1.171119994914151e-06, + "loss": 0.2743, + "step": 27041 + }, + { + "epoch": 0.78, + "grad_norm": 2.3466180423867904, + "learning_rate": 1.1708179393475538e-06, + "loss": 0.2719, + "step": 27042 + }, + { + "epoch": 0.78, + "grad_norm": 1.4144000188715873, + "learning_rate": 1.1705159175735314e-06, + "loss": 0.2754, + "step": 27043 + }, + { + "epoch": 0.78, + "grad_norm": 1.3202695846270167, + "learning_rate": 1.1702139295947495e-06, + "loss": 0.2769, + "step": 27044 + }, + { + "epoch": 0.78, + "grad_norm": 1.2859194153767841, + "learning_rate": 1.1699119754138727e-06, + "loss": 0.2816, + "step": 27045 + }, + { + "epoch": 0.78, + "grad_norm": 1.5786727650047714, + "learning_rate": 1.1696100550335632e-06, + "loss": 0.2657, + "step": 27046 + }, + { + "epoch": 0.78, + "grad_norm": 1.282187666938471, + "learning_rate": 1.1693081684564878e-06, + "loss": 0.2611, + "step": 27047 + }, + { + "epoch": 0.78, + "grad_norm": 1.3330223054591268, + "learning_rate": 1.1690063156853099e-06, + "loss": 0.2799, + "step": 27048 + }, + { + "epoch": 0.78, + "grad_norm": 1.451513796434246, + "learning_rate": 1.168704496722694e-06, + "loss": 0.3284, + "step": 27049 + }, + { + "epoch": 0.78, + "grad_norm": 1.2891929297448446, + "learning_rate": 1.1684027115713037e-06, + "loss": 0.2757, + "step": 27050 + }, + { + "epoch": 0.78, + "grad_norm": 1.654281938761663, + "learning_rate": 1.1681009602338016e-06, + "loss": 0.2733, + "step": 27051 + }, + { + "epoch": 0.78, + "grad_norm": 1.4426675277837098, + "learning_rate": 1.1677992427128516e-06, + "loss": 0.3048, + "step": 27052 + }, + { + "epoch": 0.78, + "grad_norm": 1.2348092097833596, + "learning_rate": 1.1674975590111147e-06, + "loss": 0.2588, + "step": 27053 + }, + { + "epoch": 0.78, + "grad_norm": 1.2803633359460298, + "learning_rate": 1.1671959091312568e-06, + "loss": 0.2646, + "step": 27054 + }, + { + "epoch": 0.78, + "grad_norm": 1.3382073992086716, + "learning_rate": 1.1668942930759358e-06, + "loss": 0.2609, + "step": 27055 + }, + { + "epoch": 0.78, + "grad_norm": 1.186393898780037, + "learning_rate": 1.1665927108478154e-06, + "loss": 0.258, + "step": 27056 + }, + { + "epoch": 0.78, + "grad_norm": 1.7180194183861843, + "learning_rate": 1.166291162449556e-06, + "loss": 0.2864, + "step": 27057 + }, + { + "epoch": 0.78, + "grad_norm": 1.480081419672199, + "learning_rate": 1.1659896478838206e-06, + "loss": 0.2881, + "step": 27058 + }, + { + "epoch": 0.78, + "grad_norm": 1.5096621950141347, + "learning_rate": 1.1656881671532688e-06, + "loss": 0.2731, + "step": 27059 + }, + { + "epoch": 0.78, + "grad_norm": 1.4081409565560465, + "learning_rate": 1.1653867202605613e-06, + "loss": 0.3001, + "step": 27060 + }, + { + "epoch": 0.78, + "grad_norm": 1.8997868305116186, + "learning_rate": 1.1650853072083585e-06, + "loss": 0.2836, + "step": 27061 + }, + { + "epoch": 0.78, + "grad_norm": 1.4466682396890809, + "learning_rate": 1.1647839279993217e-06, + "loss": 0.3011, + "step": 27062 + }, + { + "epoch": 0.78, + "grad_norm": 1.7191268744582255, + "learning_rate": 1.1644825826361084e-06, + "loss": 0.3304, + "step": 27063 + }, + { + "epoch": 0.78, + "grad_norm": 1.2806765290622757, + "learning_rate": 1.1641812711213785e-06, + "loss": 0.2705, + "step": 27064 + }, + { + "epoch": 0.79, + "grad_norm": 1.6858207517252704, + "learning_rate": 1.163879993457791e-06, + "loss": 0.2585, + "step": 27065 + }, + { + "epoch": 0.79, + "grad_norm": 1.2612285771900205, + "learning_rate": 1.163578749648006e-06, + "loss": 0.2772, + "step": 27066 + }, + { + "epoch": 0.79, + "grad_norm": 1.4351950609481854, + "learning_rate": 1.1632775396946804e-06, + "loss": 0.2647, + "step": 27067 + }, + { + "epoch": 0.79, + "grad_norm": 1.5569811969546128, + "learning_rate": 1.1629763636004738e-06, + "loss": 0.2905, + "step": 27068 + }, + { + "epoch": 0.79, + "grad_norm": 1.5138211116491058, + "learning_rate": 1.1626752213680425e-06, + "loss": 0.2711, + "step": 27069 + }, + { + "epoch": 0.79, + "grad_norm": 2.193973308838916, + "learning_rate": 1.1623741130000454e-06, + "loss": 0.2678, + "step": 27070 + }, + { + "epoch": 0.79, + "grad_norm": 1.3636835579912212, + "learning_rate": 1.1620730384991407e-06, + "loss": 0.2684, + "step": 27071 + }, + { + "epoch": 0.79, + "grad_norm": 1.8579268562814097, + "learning_rate": 1.1617719978679826e-06, + "loss": 0.2876, + "step": 27072 + }, + { + "epoch": 0.79, + "grad_norm": 1.2401733598578508, + "learning_rate": 1.1614709911092303e-06, + "loss": 0.2574, + "step": 27073 + }, + { + "epoch": 0.79, + "grad_norm": 1.2554373472339897, + "learning_rate": 1.161170018225538e-06, + "loss": 0.2693, + "step": 27074 + }, + { + "epoch": 0.79, + "grad_norm": 1.187829708905643, + "learning_rate": 1.1608690792195632e-06, + "loss": 0.2545, + "step": 27075 + }, + { + "epoch": 0.79, + "grad_norm": 1.3912238730979083, + "learning_rate": 1.160568174093961e-06, + "loss": 0.2763, + "step": 27076 + }, + { + "epoch": 0.79, + "grad_norm": 1.2383082526011129, + "learning_rate": 1.1602673028513872e-06, + "loss": 0.2618, + "step": 27077 + }, + { + "epoch": 0.79, + "grad_norm": 1.362599304102163, + "learning_rate": 1.159966465494497e-06, + "loss": 0.2915, + "step": 27078 + }, + { + "epoch": 0.79, + "grad_norm": 1.5870662206312032, + "learning_rate": 1.1596656620259455e-06, + "loss": 0.2891, + "step": 27079 + }, + { + "epoch": 0.79, + "grad_norm": 1.3405543148955343, + "learning_rate": 1.1593648924483868e-06, + "loss": 0.2681, + "step": 27080 + }, + { + "epoch": 0.79, + "grad_norm": 1.4525363557424775, + "learning_rate": 1.1590641567644773e-06, + "loss": 0.2925, + "step": 27081 + }, + { + "epoch": 0.79, + "grad_norm": 1.3794377419267378, + "learning_rate": 1.1587634549768679e-06, + "loss": 0.2576, + "step": 27082 + }, + { + "epoch": 0.79, + "grad_norm": 1.2877852303660307, + "learning_rate": 1.1584627870882136e-06, + "loss": 0.2841, + "step": 27083 + }, + { + "epoch": 0.79, + "grad_norm": 1.9620422880120438, + "learning_rate": 1.1581621531011677e-06, + "loss": 0.2823, + "step": 27084 + }, + { + "epoch": 0.79, + "grad_norm": 1.2551000355678377, + "learning_rate": 1.1578615530183835e-06, + "loss": 0.2718, + "step": 27085 + }, + { + "epoch": 0.79, + "grad_norm": 1.4478239529055072, + "learning_rate": 1.1575609868425137e-06, + "loss": 0.2781, + "step": 27086 + }, + { + "epoch": 0.79, + "grad_norm": 1.467904728175353, + "learning_rate": 1.1572604545762107e-06, + "loss": 0.2697, + "step": 27087 + }, + { + "epoch": 0.79, + "grad_norm": 1.4340740626032544, + "learning_rate": 1.1569599562221267e-06, + "loss": 0.2698, + "step": 27088 + }, + { + "epoch": 0.79, + "grad_norm": 1.9697881841918423, + "learning_rate": 1.1566594917829138e-06, + "loss": 0.268, + "step": 27089 + }, + { + "epoch": 0.79, + "grad_norm": 1.2200286546606915, + "learning_rate": 1.1563590612612253e-06, + "loss": 0.277, + "step": 27090 + }, + { + "epoch": 0.79, + "grad_norm": 1.327994120712868, + "learning_rate": 1.1560586646597089e-06, + "loss": 0.2698, + "step": 27091 + }, + { + "epoch": 0.79, + "grad_norm": 0.9556608875125149, + "learning_rate": 1.1557583019810175e-06, + "loss": 0.6004, + "step": 27092 + }, + { + "epoch": 0.79, + "grad_norm": 1.5554102362039546, + "learning_rate": 1.1554579732278016e-06, + "loss": 0.2956, + "step": 27093 + }, + { + "epoch": 0.79, + "grad_norm": 1.446998738417904, + "learning_rate": 1.1551576784027118e-06, + "loss": 0.2848, + "step": 27094 + }, + { + "epoch": 0.79, + "grad_norm": 1.4547733457644616, + "learning_rate": 1.1548574175083982e-06, + "loss": 0.2654, + "step": 27095 + }, + { + "epoch": 0.79, + "grad_norm": 1.7113998679022149, + "learning_rate": 1.1545571905475105e-06, + "loss": 0.2935, + "step": 27096 + }, + { + "epoch": 0.79, + "grad_norm": 1.3605507623659236, + "learning_rate": 1.1542569975226986e-06, + "loss": 0.2717, + "step": 27097 + }, + { + "epoch": 0.79, + "grad_norm": 1.3500902943849729, + "learning_rate": 1.1539568384366122e-06, + "loss": 0.2894, + "step": 27098 + }, + { + "epoch": 0.79, + "grad_norm": 1.3493567999340879, + "learning_rate": 1.1536567132918986e-06, + "loss": 0.2713, + "step": 27099 + }, + { + "epoch": 0.79, + "grad_norm": 1.219337093847915, + "learning_rate": 1.1533566220912068e-06, + "loss": 0.2524, + "step": 27100 + }, + { + "epoch": 0.79, + "grad_norm": 1.2503165690614155, + "learning_rate": 1.1530565648371873e-06, + "loss": 0.2618, + "step": 27101 + }, + { + "epoch": 0.79, + "grad_norm": 1.3809464025202065, + "learning_rate": 1.1527565415324843e-06, + "loss": 0.2815, + "step": 27102 + }, + { + "epoch": 0.79, + "grad_norm": 0.9864306317019453, + "learning_rate": 1.1524565521797475e-06, + "loss": 0.5938, + "step": 27103 + }, + { + "epoch": 0.79, + "grad_norm": 1.3376899307243555, + "learning_rate": 1.1521565967816245e-06, + "loss": 0.2462, + "step": 27104 + }, + { + "epoch": 0.79, + "grad_norm": 1.2934488925399321, + "learning_rate": 1.1518566753407622e-06, + "loss": 0.2672, + "step": 27105 + }, + { + "epoch": 0.79, + "grad_norm": 1.5428185685861942, + "learning_rate": 1.1515567878598076e-06, + "loss": 0.2836, + "step": 27106 + }, + { + "epoch": 0.79, + "grad_norm": 1.3752201513971325, + "learning_rate": 1.1512569343414072e-06, + "loss": 0.285, + "step": 27107 + }, + { + "epoch": 0.79, + "grad_norm": 1.4430840426390794, + "learning_rate": 1.1509571147882064e-06, + "loss": 0.2862, + "step": 27108 + }, + { + "epoch": 0.79, + "grad_norm": 1.6910850877748762, + "learning_rate": 1.150657329202854e-06, + "loss": 0.275, + "step": 27109 + }, + { + "epoch": 0.79, + "grad_norm": 1.3201662296355776, + "learning_rate": 1.150357577587991e-06, + "loss": 0.2479, + "step": 27110 + }, + { + "epoch": 0.79, + "grad_norm": 1.3029060741159422, + "learning_rate": 1.1500578599462657e-06, + "loss": 0.2657, + "step": 27111 + }, + { + "epoch": 0.79, + "grad_norm": 1.7472906636046803, + "learning_rate": 1.1497581762803223e-06, + "loss": 0.2823, + "step": 27112 + }, + { + "epoch": 0.79, + "grad_norm": 1.4474992368914072, + "learning_rate": 1.149458526592806e-06, + "loss": 0.2809, + "step": 27113 + }, + { + "epoch": 0.79, + "grad_norm": 1.5602325137903448, + "learning_rate": 1.149158910886361e-06, + "loss": 0.2544, + "step": 27114 + }, + { + "epoch": 0.79, + "grad_norm": 1.7060946900459089, + "learning_rate": 1.1488593291636308e-06, + "loss": 0.3366, + "step": 27115 + }, + { + "epoch": 0.79, + "grad_norm": 1.250846787808407, + "learning_rate": 1.1485597814272597e-06, + "loss": 0.259, + "step": 27116 + }, + { + "epoch": 0.79, + "grad_norm": 1.3308672509435095, + "learning_rate": 1.1482602676798933e-06, + "loss": 0.2667, + "step": 27117 + }, + { + "epoch": 0.79, + "grad_norm": 1.4187187494583167, + "learning_rate": 1.1479607879241712e-06, + "loss": 0.2978, + "step": 27118 + }, + { + "epoch": 0.79, + "grad_norm": 1.2446505656994586, + "learning_rate": 1.1476613421627375e-06, + "loss": 0.269, + "step": 27119 + }, + { + "epoch": 0.79, + "grad_norm": 1.300375153254336, + "learning_rate": 1.1473619303982353e-06, + "loss": 0.2713, + "step": 27120 + }, + { + "epoch": 0.79, + "grad_norm": 1.4570803409300457, + "learning_rate": 1.1470625526333067e-06, + "loss": 0.2887, + "step": 27121 + }, + { + "epoch": 0.79, + "grad_norm": 1.432103915624705, + "learning_rate": 1.1467632088705944e-06, + "loss": 0.2805, + "step": 27122 + }, + { + "epoch": 0.79, + "grad_norm": 1.386368605476145, + "learning_rate": 1.1464638991127386e-06, + "loss": 0.2705, + "step": 27123 + }, + { + "epoch": 0.79, + "grad_norm": 0.9739617557419897, + "learning_rate": 1.1461646233623825e-06, + "loss": 0.577, + "step": 27124 + }, + { + "epoch": 0.79, + "grad_norm": 1.3500871415713673, + "learning_rate": 1.145865381622166e-06, + "loss": 0.2979, + "step": 27125 + }, + { + "epoch": 0.79, + "grad_norm": 1.5656908772576954, + "learning_rate": 1.1455661738947316e-06, + "loss": 0.2713, + "step": 27126 + }, + { + "epoch": 0.79, + "grad_norm": 1.311414828187581, + "learning_rate": 1.145267000182717e-06, + "loss": 0.2936, + "step": 27127 + }, + { + "epoch": 0.79, + "grad_norm": 1.3727375787307623, + "learning_rate": 1.1449678604887642e-06, + "loss": 0.2671, + "step": 27128 + }, + { + "epoch": 0.79, + "grad_norm": 1.423505635850593, + "learning_rate": 1.144668754815514e-06, + "loss": 0.2683, + "step": 27129 + }, + { + "epoch": 0.79, + "grad_norm": 0.9584256986117201, + "learning_rate": 1.1443696831656037e-06, + "loss": 0.5492, + "step": 27130 + }, + { + "epoch": 0.79, + "grad_norm": 1.5172391437023005, + "learning_rate": 1.1440706455416735e-06, + "loss": 0.2587, + "step": 27131 + }, + { + "epoch": 0.79, + "grad_norm": 1.2819279559285501, + "learning_rate": 1.1437716419463623e-06, + "loss": 0.2772, + "step": 27132 + }, + { + "epoch": 0.79, + "grad_norm": 1.4110376942027993, + "learning_rate": 1.1434726723823093e-06, + "loss": 0.2845, + "step": 27133 + }, + { + "epoch": 0.79, + "grad_norm": 1.337928936230706, + "learning_rate": 1.1431737368521528e-06, + "loss": 0.2679, + "step": 27134 + }, + { + "epoch": 0.79, + "grad_norm": 2.193588012533649, + "learning_rate": 1.1428748353585312e-06, + "loss": 0.2833, + "step": 27135 + }, + { + "epoch": 0.79, + "grad_norm": 0.8797929807327994, + "learning_rate": 1.1425759679040832e-06, + "loss": 0.5176, + "step": 27136 + }, + { + "epoch": 0.79, + "grad_norm": 1.6822981377610473, + "learning_rate": 1.1422771344914436e-06, + "loss": 0.2636, + "step": 27137 + }, + { + "epoch": 0.79, + "grad_norm": 1.8658470581201594, + "learning_rate": 1.141978335123251e-06, + "loss": 0.2755, + "step": 27138 + }, + { + "epoch": 0.79, + "grad_norm": 0.9486877631453224, + "learning_rate": 1.141679569802142e-06, + "loss": 0.5801, + "step": 27139 + }, + { + "epoch": 0.79, + "grad_norm": 1.501233191167378, + "learning_rate": 1.1413808385307546e-06, + "loss": 0.3043, + "step": 27140 + }, + { + "epoch": 0.79, + "grad_norm": 1.26896308295583, + "learning_rate": 1.1410821413117235e-06, + "loss": 0.2885, + "step": 27141 + }, + { + "epoch": 0.79, + "grad_norm": 1.4676475095346424, + "learning_rate": 1.140783478147685e-06, + "loss": 0.2918, + "step": 27142 + }, + { + "epoch": 0.79, + "grad_norm": 1.6691844932528237, + "learning_rate": 1.1404848490412756e-06, + "loss": 0.3132, + "step": 27143 + }, + { + "epoch": 0.79, + "grad_norm": 1.2642105813750935, + "learning_rate": 1.1401862539951302e-06, + "loss": 0.2783, + "step": 27144 + }, + { + "epoch": 0.79, + "grad_norm": 0.9892893563665986, + "learning_rate": 1.1398876930118852e-06, + "loss": 0.6002, + "step": 27145 + }, + { + "epoch": 0.79, + "grad_norm": 1.2531539228974207, + "learning_rate": 1.1395891660941726e-06, + "loss": 0.2869, + "step": 27146 + }, + { + "epoch": 0.79, + "grad_norm": 1.4281482987979488, + "learning_rate": 1.1392906732446284e-06, + "loss": 0.2789, + "step": 27147 + }, + { + "epoch": 0.79, + "grad_norm": 1.4852141526805895, + "learning_rate": 1.1389922144658871e-06, + "loss": 0.2784, + "step": 27148 + }, + { + "epoch": 0.79, + "grad_norm": 1.2794585137044865, + "learning_rate": 1.1386937897605827e-06, + "loss": 0.2518, + "step": 27149 + }, + { + "epoch": 0.79, + "grad_norm": 1.48760400012061, + "learning_rate": 1.1383953991313478e-06, + "loss": 0.2581, + "step": 27150 + }, + { + "epoch": 0.79, + "grad_norm": 1.5775947197262667, + "learning_rate": 1.1380970425808162e-06, + "loss": 0.278, + "step": 27151 + }, + { + "epoch": 0.79, + "grad_norm": 1.3215100458782245, + "learning_rate": 1.137798720111622e-06, + "loss": 0.2813, + "step": 27152 + }, + { + "epoch": 0.79, + "grad_norm": 1.380962250369441, + "learning_rate": 1.1375004317263978e-06, + "loss": 0.2531, + "step": 27153 + }, + { + "epoch": 0.79, + "grad_norm": 1.3310119026665044, + "learning_rate": 1.137202177427773e-06, + "loss": 0.2604, + "step": 27154 + }, + { + "epoch": 0.79, + "grad_norm": 1.4270030227232715, + "learning_rate": 1.1369039572183826e-06, + "loss": 0.2595, + "step": 27155 + }, + { + "epoch": 0.79, + "grad_norm": 1.3133547273052606, + "learning_rate": 1.1366057711008576e-06, + "loss": 0.2721, + "step": 27156 + }, + { + "epoch": 0.79, + "grad_norm": 1.2999071070656714, + "learning_rate": 1.1363076190778311e-06, + "loss": 0.2707, + "step": 27157 + }, + { + "epoch": 0.79, + "grad_norm": 1.4689920756076649, + "learning_rate": 1.1360095011519313e-06, + "loss": 0.2688, + "step": 27158 + }, + { + "epoch": 0.79, + "grad_norm": 1.868667429560068, + "learning_rate": 1.1357114173257898e-06, + "loss": 0.2723, + "step": 27159 + }, + { + "epoch": 0.79, + "grad_norm": 3.1818968706546444, + "learning_rate": 1.1354133676020385e-06, + "loss": 0.294, + "step": 27160 + }, + { + "epoch": 0.79, + "grad_norm": 1.3500720452886343, + "learning_rate": 1.1351153519833068e-06, + "loss": 0.2799, + "step": 27161 + }, + { + "epoch": 0.79, + "grad_norm": 2.105119824506094, + "learning_rate": 1.1348173704722254e-06, + "loss": 0.2467, + "step": 27162 + }, + { + "epoch": 0.79, + "grad_norm": 1.3819509344547234, + "learning_rate": 1.1345194230714235e-06, + "loss": 0.2847, + "step": 27163 + }, + { + "epoch": 0.79, + "grad_norm": 1.5493407455956563, + "learning_rate": 1.1342215097835318e-06, + "loss": 0.2736, + "step": 27164 + }, + { + "epoch": 0.79, + "grad_norm": 1.418889958931029, + "learning_rate": 1.1339236306111766e-06, + "loss": 0.2691, + "step": 27165 + }, + { + "epoch": 0.79, + "grad_norm": 1.225198552281091, + "learning_rate": 1.1336257855569888e-06, + "loss": 0.2591, + "step": 27166 + }, + { + "epoch": 0.79, + "grad_norm": 1.2675428950518468, + "learning_rate": 1.1333279746235959e-06, + "loss": 0.2585, + "step": 27167 + }, + { + "epoch": 0.79, + "grad_norm": 1.2231653468300303, + "learning_rate": 1.1330301978136266e-06, + "loss": 0.2657, + "step": 27168 + }, + { + "epoch": 0.79, + "grad_norm": 1.1987435958664745, + "learning_rate": 1.1327324551297092e-06, + "loss": 0.2719, + "step": 27169 + }, + { + "epoch": 0.79, + "grad_norm": 1.195273525928924, + "learning_rate": 1.13243474657447e-06, + "loss": 0.2589, + "step": 27170 + }, + { + "epoch": 0.79, + "grad_norm": 1.543947135068237, + "learning_rate": 1.1321370721505376e-06, + "loss": 0.3334, + "step": 27171 + }, + { + "epoch": 0.79, + "grad_norm": 1.233833729116381, + "learning_rate": 1.1318394318605396e-06, + "loss": 0.2764, + "step": 27172 + }, + { + "epoch": 0.79, + "grad_norm": 1.275437121334856, + "learning_rate": 1.1315418257070998e-06, + "loss": 0.27, + "step": 27173 + }, + { + "epoch": 0.79, + "grad_norm": 1.4676165883848464, + "learning_rate": 1.1312442536928469e-06, + "loss": 0.2633, + "step": 27174 + }, + { + "epoch": 0.79, + "grad_norm": 1.2760820110504307, + "learning_rate": 1.1309467158204062e-06, + "loss": 0.2891, + "step": 27175 + }, + { + "epoch": 0.79, + "grad_norm": 1.4307000360221915, + "learning_rate": 1.1306492120924035e-06, + "loss": 0.2893, + "step": 27176 + }, + { + "epoch": 0.79, + "grad_norm": 1.5807763937699575, + "learning_rate": 1.1303517425114646e-06, + "loss": 0.282, + "step": 27177 + }, + { + "epoch": 0.79, + "grad_norm": 1.3304863057376632, + "learning_rate": 1.1300543070802144e-06, + "loss": 0.2846, + "step": 27178 + }, + { + "epoch": 0.79, + "grad_norm": 1.4792685359358386, + "learning_rate": 1.1297569058012782e-06, + "loss": 0.2805, + "step": 27179 + }, + { + "epoch": 0.79, + "grad_norm": 1.357925575442153, + "learning_rate": 1.1294595386772794e-06, + "loss": 0.3027, + "step": 27180 + }, + { + "epoch": 0.79, + "grad_norm": 1.3027136186399428, + "learning_rate": 1.1291622057108454e-06, + "loss": 0.2623, + "step": 27181 + }, + { + "epoch": 0.79, + "grad_norm": 1.2452758422598313, + "learning_rate": 1.128864906904596e-06, + "loss": 0.2544, + "step": 27182 + }, + { + "epoch": 0.79, + "grad_norm": 1.4623702964625913, + "learning_rate": 1.1285676422611568e-06, + "loss": 0.2591, + "step": 27183 + }, + { + "epoch": 0.79, + "grad_norm": 2.1176959178815995, + "learning_rate": 1.1282704117831512e-06, + "loss": 0.3142, + "step": 27184 + }, + { + "epoch": 0.79, + "grad_norm": 1.2910486575188154, + "learning_rate": 1.1279732154732036e-06, + "loss": 0.2704, + "step": 27185 + }, + { + "epoch": 0.79, + "grad_norm": 1.284197667926715, + "learning_rate": 1.1276760533339337e-06, + "loss": 0.251, + "step": 27186 + }, + { + "epoch": 0.79, + "grad_norm": 1.710976167554745, + "learning_rate": 1.1273789253679656e-06, + "loss": 0.283, + "step": 27187 + }, + { + "epoch": 0.79, + "grad_norm": 0.9465281013307738, + "learning_rate": 1.127081831577922e-06, + "loss": 0.5237, + "step": 27188 + }, + { + "epoch": 0.79, + "grad_norm": 1.6327135966837922, + "learning_rate": 1.1267847719664238e-06, + "loss": 0.2733, + "step": 27189 + }, + { + "epoch": 0.79, + "grad_norm": 1.465325126769973, + "learning_rate": 1.126487746536093e-06, + "loss": 0.2675, + "step": 27190 + }, + { + "epoch": 0.79, + "grad_norm": 1.81863137750857, + "learning_rate": 1.1261907552895518e-06, + "loss": 0.2603, + "step": 27191 + }, + { + "epoch": 0.79, + "grad_norm": 1.5091224619266257, + "learning_rate": 1.1258937982294193e-06, + "loss": 0.2787, + "step": 27192 + }, + { + "epoch": 0.79, + "grad_norm": 1.4017471407065172, + "learning_rate": 1.1255968753583168e-06, + "loss": 0.2769, + "step": 27193 + }, + { + "epoch": 0.79, + "grad_norm": 1.2577960057377982, + "learning_rate": 1.125299986678865e-06, + "loss": 0.2879, + "step": 27194 + }, + { + "epoch": 0.79, + "grad_norm": 2.2756616066564352, + "learning_rate": 1.1250031321936834e-06, + "loss": 0.2936, + "step": 27195 + }, + { + "epoch": 0.79, + "grad_norm": 1.456957773491221, + "learning_rate": 1.1247063119053924e-06, + "loss": 0.2835, + "step": 27196 + }, + { + "epoch": 0.79, + "grad_norm": 1.2740395637666173, + "learning_rate": 1.124409525816611e-06, + "loss": 0.271, + "step": 27197 + }, + { + "epoch": 0.79, + "grad_norm": 1.50305552537776, + "learning_rate": 1.124112773929959e-06, + "loss": 0.2611, + "step": 27198 + }, + { + "epoch": 0.79, + "grad_norm": 1.350981491747363, + "learning_rate": 1.1238160562480544e-06, + "loss": 0.2692, + "step": 27199 + }, + { + "epoch": 0.79, + "grad_norm": 1.8051090685344606, + "learning_rate": 1.1235193727735178e-06, + "loss": 0.2748, + "step": 27200 + }, + { + "epoch": 0.79, + "grad_norm": 1.4342693947858263, + "learning_rate": 1.1232227235089637e-06, + "loss": 0.2659, + "step": 27201 + }, + { + "epoch": 0.79, + "grad_norm": 1.3137666824612682, + "learning_rate": 1.1229261084570125e-06, + "loss": 0.2704, + "step": 27202 + }, + { + "epoch": 0.79, + "grad_norm": 1.5061491256942545, + "learning_rate": 1.1226295276202815e-06, + "loss": 0.282, + "step": 27203 + }, + { + "epoch": 0.79, + "grad_norm": 0.9380178197797584, + "learning_rate": 1.1223329810013884e-06, + "loss": 0.5447, + "step": 27204 + }, + { + "epoch": 0.79, + "grad_norm": 1.6173337556365388, + "learning_rate": 1.122036468602949e-06, + "loss": 0.2732, + "step": 27205 + }, + { + "epoch": 0.79, + "grad_norm": 1.5469458037596553, + "learning_rate": 1.1217399904275812e-06, + "loss": 0.2897, + "step": 27206 + }, + { + "epoch": 0.79, + "grad_norm": 1.8975746441747583, + "learning_rate": 1.1214435464779006e-06, + "loss": 0.2687, + "step": 27207 + }, + { + "epoch": 0.79, + "grad_norm": 1.227579977443954, + "learning_rate": 1.1211471367565252e-06, + "loss": 0.2716, + "step": 27208 + }, + { + "epoch": 0.79, + "grad_norm": 1.6113691388012195, + "learning_rate": 1.120850761266068e-06, + "loss": 0.2682, + "step": 27209 + }, + { + "epoch": 0.79, + "grad_norm": 1.696331271629583, + "learning_rate": 1.1205544200091462e-06, + "loss": 0.2749, + "step": 27210 + }, + { + "epoch": 0.79, + "grad_norm": 1.296403686365535, + "learning_rate": 1.1202581129883744e-06, + "loss": 0.2798, + "step": 27211 + }, + { + "epoch": 0.79, + "grad_norm": 1.3887539844174146, + "learning_rate": 1.1199618402063679e-06, + "loss": 0.2671, + "step": 27212 + }, + { + "epoch": 0.79, + "grad_norm": 1.247001146593195, + "learning_rate": 1.119665601665742e-06, + "loss": 0.2721, + "step": 27213 + }, + { + "epoch": 0.79, + "grad_norm": 1.3526266692481002, + "learning_rate": 1.1193693973691095e-06, + "loss": 0.2863, + "step": 27214 + }, + { + "epoch": 0.79, + "grad_norm": 0.9422453102090699, + "learning_rate": 1.1190732273190851e-06, + "loss": 0.5596, + "step": 27215 + }, + { + "epoch": 0.79, + "grad_norm": 1.4848805825403883, + "learning_rate": 1.118777091518282e-06, + "loss": 0.277, + "step": 27216 + }, + { + "epoch": 0.79, + "grad_norm": 1.3112037257265194, + "learning_rate": 1.118480989969315e-06, + "loss": 0.2785, + "step": 27217 + }, + { + "epoch": 0.79, + "grad_norm": 1.3245832285674393, + "learning_rate": 1.1181849226747955e-06, + "loss": 0.2672, + "step": 27218 + }, + { + "epoch": 0.79, + "grad_norm": 1.2862101519655604, + "learning_rate": 1.117888889637339e-06, + "loss": 0.2527, + "step": 27219 + }, + { + "epoch": 0.79, + "grad_norm": 1.4983064986735983, + "learning_rate": 1.1175928908595546e-06, + "loss": 0.284, + "step": 27220 + }, + { + "epoch": 0.79, + "grad_norm": 1.3829627110344143, + "learning_rate": 1.1172969263440558e-06, + "loss": 0.2889, + "step": 27221 + }, + { + "epoch": 0.79, + "grad_norm": 1.3540552019034446, + "learning_rate": 1.1170009960934547e-06, + "loss": 0.2541, + "step": 27222 + }, + { + "epoch": 0.79, + "grad_norm": 1.4130364149615164, + "learning_rate": 1.1167051001103634e-06, + "loss": 0.254, + "step": 27223 + }, + { + "epoch": 0.79, + "grad_norm": 2.1847780511708117, + "learning_rate": 1.1164092383973924e-06, + "loss": 0.254, + "step": 27224 + }, + { + "epoch": 0.79, + "grad_norm": 1.457614652530625, + "learning_rate": 1.1161134109571525e-06, + "loss": 0.2751, + "step": 27225 + }, + { + "epoch": 0.79, + "grad_norm": 1.3105631391618977, + "learning_rate": 1.1158176177922554e-06, + "loss": 0.2636, + "step": 27226 + }, + { + "epoch": 0.79, + "grad_norm": 1.6570886591681848, + "learning_rate": 1.1155218589053118e-06, + "loss": 0.2685, + "step": 27227 + }, + { + "epoch": 0.79, + "grad_norm": 1.3684480168462572, + "learning_rate": 1.11522613429893e-06, + "loss": 0.2552, + "step": 27228 + }, + { + "epoch": 0.79, + "grad_norm": 1.3458850993725608, + "learning_rate": 1.1149304439757202e-06, + "loss": 0.2898, + "step": 27229 + }, + { + "epoch": 0.79, + "grad_norm": 0.9689539702464283, + "learning_rate": 1.1146347879382929e-06, + "loss": 0.5682, + "step": 27230 + }, + { + "epoch": 0.79, + "grad_norm": 1.838375585828739, + "learning_rate": 1.114339166189256e-06, + "loss": 0.2664, + "step": 27231 + }, + { + "epoch": 0.79, + "grad_norm": 1.398841250209539, + "learning_rate": 1.1140435787312198e-06, + "loss": 0.2746, + "step": 27232 + }, + { + "epoch": 0.79, + "grad_norm": 1.1939373216600264, + "learning_rate": 1.1137480255667915e-06, + "loss": 0.2528, + "step": 27233 + }, + { + "epoch": 0.79, + "grad_norm": 1.9044766165691511, + "learning_rate": 1.1134525066985808e-06, + "loss": 0.28, + "step": 27234 + }, + { + "epoch": 0.79, + "grad_norm": 2.2730285738079945, + "learning_rate": 1.1131570221291936e-06, + "loss": 0.2915, + "step": 27235 + }, + { + "epoch": 0.79, + "grad_norm": 1.2900657793509496, + "learning_rate": 1.1128615718612413e-06, + "loss": 0.2792, + "step": 27236 + }, + { + "epoch": 0.79, + "grad_norm": 1.2970161330027434, + "learning_rate": 1.1125661558973271e-06, + "loss": 0.2534, + "step": 27237 + }, + { + "epoch": 0.79, + "grad_norm": 1.2430898628368534, + "learning_rate": 1.1122707742400595e-06, + "loss": 0.2639, + "step": 27238 + }, + { + "epoch": 0.79, + "grad_norm": 1.6357446403060716, + "learning_rate": 1.1119754268920452e-06, + "loss": 0.2872, + "step": 27239 + }, + { + "epoch": 0.79, + "grad_norm": 1.501254683807957, + "learning_rate": 1.1116801138558913e-06, + "loss": 0.2868, + "step": 27240 + }, + { + "epoch": 0.79, + "grad_norm": 1.2607334015706098, + "learning_rate": 1.111384835134205e-06, + "loss": 0.2778, + "step": 27241 + }, + { + "epoch": 0.79, + "grad_norm": 1.4039592602336286, + "learning_rate": 1.1110895907295888e-06, + "loss": 0.2735, + "step": 27242 + }, + { + "epoch": 0.79, + "grad_norm": 1.4331931650461407, + "learning_rate": 1.110794380644651e-06, + "loss": 0.2751, + "step": 27243 + }, + { + "epoch": 0.79, + "grad_norm": 1.4045011389463102, + "learning_rate": 1.1104992048819952e-06, + "loss": 0.2731, + "step": 27244 + }, + { + "epoch": 0.79, + "grad_norm": 1.3435535717299798, + "learning_rate": 1.1102040634442268e-06, + "loss": 0.2797, + "step": 27245 + }, + { + "epoch": 0.79, + "grad_norm": 1.2227727537444875, + "learning_rate": 1.1099089563339527e-06, + "loss": 0.2587, + "step": 27246 + }, + { + "epoch": 0.79, + "grad_norm": 1.6439661549369655, + "learning_rate": 1.1096138835537734e-06, + "loss": 0.2781, + "step": 27247 + }, + { + "epoch": 0.79, + "grad_norm": 1.2198489008760722, + "learning_rate": 1.1093188451062952e-06, + "loss": 0.2667, + "step": 27248 + }, + { + "epoch": 0.79, + "grad_norm": 1.2596299965010576, + "learning_rate": 1.1090238409941206e-06, + "loss": 0.2843, + "step": 27249 + }, + { + "epoch": 0.79, + "grad_norm": 1.2302357233297296, + "learning_rate": 1.1087288712198547e-06, + "loss": 0.2774, + "step": 27250 + }, + { + "epoch": 0.79, + "grad_norm": 1.9879765118678898, + "learning_rate": 1.108433935786099e-06, + "loss": 0.2767, + "step": 27251 + }, + { + "epoch": 0.79, + "grad_norm": 1.4127891263783574, + "learning_rate": 1.1081390346954574e-06, + "loss": 0.2637, + "step": 27252 + }, + { + "epoch": 0.79, + "grad_norm": 1.3415500735353911, + "learning_rate": 1.107844167950532e-06, + "loss": 0.2929, + "step": 27253 + }, + { + "epoch": 0.79, + "grad_norm": 1.2997232727022368, + "learning_rate": 1.1075493355539247e-06, + "loss": 0.2758, + "step": 27254 + }, + { + "epoch": 0.79, + "grad_norm": 1.2407945387756276, + "learning_rate": 1.107254537508239e-06, + "loss": 0.267, + "step": 27255 + }, + { + "epoch": 0.79, + "grad_norm": 1.3766380296083487, + "learning_rate": 1.1069597738160741e-06, + "loss": 0.2586, + "step": 27256 + }, + { + "epoch": 0.79, + "grad_norm": 1.8986418862072725, + "learning_rate": 1.106665044480032e-06, + "loss": 0.3037, + "step": 27257 + }, + { + "epoch": 0.79, + "grad_norm": 2.2052184166798807, + "learning_rate": 1.1063703495027145e-06, + "loss": 0.2778, + "step": 27258 + }, + { + "epoch": 0.79, + "grad_norm": 1.246583976437282, + "learning_rate": 1.106075688886722e-06, + "loss": 0.2795, + "step": 27259 + }, + { + "epoch": 0.79, + "grad_norm": 1.2553725250661696, + "learning_rate": 1.1057810626346545e-06, + "loss": 0.2667, + "step": 27260 + }, + { + "epoch": 0.79, + "grad_norm": 1.370636385939726, + "learning_rate": 1.105486470749112e-06, + "loss": 0.2694, + "step": 27261 + }, + { + "epoch": 0.79, + "grad_norm": 1.4723636978352685, + "learning_rate": 1.105191913232695e-06, + "loss": 0.2845, + "step": 27262 + }, + { + "epoch": 0.79, + "grad_norm": 1.4282344722678786, + "learning_rate": 1.1048973900880038e-06, + "loss": 0.2687, + "step": 27263 + }, + { + "epoch": 0.79, + "grad_norm": 1.277868773898735, + "learning_rate": 1.1046029013176346e-06, + "loss": 0.2795, + "step": 27264 + }, + { + "epoch": 0.79, + "grad_norm": 1.268845911180936, + "learning_rate": 1.1043084469241889e-06, + "loss": 0.2609, + "step": 27265 + }, + { + "epoch": 0.79, + "grad_norm": 1.466768456418082, + "learning_rate": 1.1040140269102633e-06, + "loss": 0.2868, + "step": 27266 + }, + { + "epoch": 0.79, + "grad_norm": 1.4097407309194196, + "learning_rate": 1.103719641278458e-06, + "loss": 0.2754, + "step": 27267 + }, + { + "epoch": 0.79, + "grad_norm": 1.3899270366979573, + "learning_rate": 1.1034252900313696e-06, + "loss": 0.2732, + "step": 27268 + }, + { + "epoch": 0.79, + "grad_norm": 1.3193739428816078, + "learning_rate": 1.1031309731715977e-06, + "loss": 0.2686, + "step": 27269 + }, + { + "epoch": 0.79, + "grad_norm": 1.4274785633234637, + "learning_rate": 1.1028366907017373e-06, + "loss": 0.2648, + "step": 27270 + }, + { + "epoch": 0.79, + "grad_norm": 1.4151082122257124, + "learning_rate": 1.1025424426243857e-06, + "loss": 0.2844, + "step": 27271 + }, + { + "epoch": 0.79, + "grad_norm": 1.3759697164699476, + "learning_rate": 1.102248228942141e-06, + "loss": 0.2943, + "step": 27272 + }, + { + "epoch": 0.79, + "grad_norm": 1.5164381639543818, + "learning_rate": 1.1019540496575981e-06, + "loss": 0.2915, + "step": 27273 + }, + { + "epoch": 0.79, + "grad_norm": 1.311201773945014, + "learning_rate": 1.1016599047733557e-06, + "loss": 0.2677, + "step": 27274 + }, + { + "epoch": 0.79, + "grad_norm": 1.2997852828821366, + "learning_rate": 1.1013657942920063e-06, + "loss": 0.2728, + "step": 27275 + }, + { + "epoch": 0.79, + "grad_norm": 1.3117693414196112, + "learning_rate": 1.1010717182161474e-06, + "loss": 0.2964, + "step": 27276 + }, + { + "epoch": 0.79, + "grad_norm": 1.317751805942006, + "learning_rate": 1.1007776765483736e-06, + "loss": 0.2747, + "step": 27277 + }, + { + "epoch": 0.79, + "grad_norm": 1.2895708418802325, + "learning_rate": 1.10048366929128e-06, + "loss": 0.2588, + "step": 27278 + }, + { + "epoch": 0.79, + "grad_norm": 1.5452191694774284, + "learning_rate": 1.1001896964474618e-06, + "loss": 0.3025, + "step": 27279 + }, + { + "epoch": 0.79, + "grad_norm": 1.1989325669627067, + "learning_rate": 1.0998957580195124e-06, + "loss": 0.2622, + "step": 27280 + }, + { + "epoch": 0.79, + "grad_norm": 1.585703183228928, + "learning_rate": 1.099601854010026e-06, + "loss": 0.2724, + "step": 27281 + }, + { + "epoch": 0.79, + "grad_norm": 1.7297473391054976, + "learning_rate": 1.0993079844215981e-06, + "loss": 0.2624, + "step": 27282 + }, + { + "epoch": 0.79, + "grad_norm": 1.3734949124253566, + "learning_rate": 1.099014149256819e-06, + "loss": 0.2786, + "step": 27283 + }, + { + "epoch": 0.79, + "grad_norm": 2.495597268270153, + "learning_rate": 1.0987203485182834e-06, + "loss": 0.2795, + "step": 27284 + }, + { + "epoch": 0.79, + "grad_norm": 1.4460604737340206, + "learning_rate": 1.0984265822085844e-06, + "loss": 0.2616, + "step": 27285 + }, + { + "epoch": 0.79, + "grad_norm": 1.222893166692475, + "learning_rate": 1.0981328503303135e-06, + "loss": 0.2706, + "step": 27286 + }, + { + "epoch": 0.79, + "grad_norm": 1.3635083634563678, + "learning_rate": 1.0978391528860638e-06, + "loss": 0.2788, + "step": 27287 + }, + { + "epoch": 0.79, + "grad_norm": 1.394084989996095, + "learning_rate": 1.0975454898784266e-06, + "loss": 0.2831, + "step": 27288 + }, + { + "epoch": 0.79, + "grad_norm": 1.3057667632221832, + "learning_rate": 1.0972518613099936e-06, + "loss": 0.2811, + "step": 27289 + }, + { + "epoch": 0.79, + "grad_norm": 1.291536291604936, + "learning_rate": 1.0969582671833568e-06, + "loss": 0.2994, + "step": 27290 + }, + { + "epoch": 0.79, + "grad_norm": 1.3795173182790608, + "learning_rate": 1.0966647075011077e-06, + "loss": 0.2616, + "step": 27291 + }, + { + "epoch": 0.79, + "grad_norm": 1.3224437757853256, + "learning_rate": 1.0963711822658346e-06, + "loss": 0.2658, + "step": 27292 + }, + { + "epoch": 0.79, + "grad_norm": 1.2785283772561908, + "learning_rate": 1.096077691480129e-06, + "loss": 0.2771, + "step": 27293 + }, + { + "epoch": 0.79, + "grad_norm": 1.2092369878205482, + "learning_rate": 1.0957842351465808e-06, + "loss": 0.2836, + "step": 27294 + }, + { + "epoch": 0.79, + "grad_norm": 1.2854238097317512, + "learning_rate": 1.0954908132677805e-06, + "loss": 0.2772, + "step": 27295 + }, + { + "epoch": 0.79, + "grad_norm": 1.777320789461905, + "learning_rate": 1.0951974258463166e-06, + "loss": 0.2706, + "step": 27296 + }, + { + "epoch": 0.79, + "grad_norm": 1.9299940644669318, + "learning_rate": 1.0949040728847804e-06, + "loss": 0.2881, + "step": 27297 + }, + { + "epoch": 0.79, + "grad_norm": 1.499452613565349, + "learning_rate": 1.0946107543857582e-06, + "loss": 0.2681, + "step": 27298 + }, + { + "epoch": 0.79, + "grad_norm": 1.2242974420796793, + "learning_rate": 1.0943174703518383e-06, + "loss": 0.2551, + "step": 27299 + }, + { + "epoch": 0.79, + "grad_norm": 1.255138494479253, + "learning_rate": 1.0940242207856112e-06, + "loss": 0.2709, + "step": 27300 + }, + { + "epoch": 0.79, + "grad_norm": 1.2993549494348156, + "learning_rate": 1.0937310056896643e-06, + "loss": 0.2759, + "step": 27301 + }, + { + "epoch": 0.79, + "grad_norm": 1.3506586975455368, + "learning_rate": 1.0934378250665834e-06, + "loss": 0.2755, + "step": 27302 + }, + { + "epoch": 0.79, + "grad_norm": 2.0743240631825284, + "learning_rate": 1.093144678918957e-06, + "loss": 0.2795, + "step": 27303 + }, + { + "epoch": 0.79, + "grad_norm": 1.569481430284, + "learning_rate": 1.0928515672493723e-06, + "loss": 0.2669, + "step": 27304 + }, + { + "epoch": 0.79, + "grad_norm": 1.525693702414495, + "learning_rate": 1.0925584900604158e-06, + "loss": 0.2634, + "step": 27305 + }, + { + "epoch": 0.79, + "grad_norm": 1.341579961104714, + "learning_rate": 1.092265447354674e-06, + "loss": 0.2767, + "step": 27306 + }, + { + "epoch": 0.79, + "grad_norm": 1.3322096792833034, + "learning_rate": 1.091972439134733e-06, + "loss": 0.2906, + "step": 27307 + }, + { + "epoch": 0.79, + "grad_norm": 1.2371818008148079, + "learning_rate": 1.0916794654031788e-06, + "loss": 0.2814, + "step": 27308 + }, + { + "epoch": 0.79, + "grad_norm": 1.4707628601921126, + "learning_rate": 1.091386526162596e-06, + "loss": 0.2633, + "step": 27309 + }, + { + "epoch": 0.79, + "grad_norm": 2.0711498624343183, + "learning_rate": 1.0910936214155726e-06, + "loss": 0.2832, + "step": 27310 + }, + { + "epoch": 0.79, + "grad_norm": 1.3527439611425753, + "learning_rate": 1.0908007511646895e-06, + "loss": 0.2699, + "step": 27311 + }, + { + "epoch": 0.79, + "grad_norm": 1.5485289812411223, + "learning_rate": 1.090507915412533e-06, + "loss": 0.3043, + "step": 27312 + }, + { + "epoch": 0.79, + "grad_norm": 1.4153504169209026, + "learning_rate": 1.090215114161688e-06, + "loss": 0.3019, + "step": 27313 + }, + { + "epoch": 0.79, + "grad_norm": 1.3142592176014047, + "learning_rate": 1.0899223474147374e-06, + "loss": 0.2802, + "step": 27314 + }, + { + "epoch": 0.79, + "grad_norm": 1.3350262689708194, + "learning_rate": 1.0896296151742663e-06, + "loss": 0.2864, + "step": 27315 + }, + { + "epoch": 0.79, + "grad_norm": 1.3326947031666614, + "learning_rate": 1.0893369174428564e-06, + "loss": 0.26, + "step": 27316 + }, + { + "epoch": 0.79, + "grad_norm": 1.4307371561541113, + "learning_rate": 1.0890442542230923e-06, + "loss": 0.276, + "step": 27317 + }, + { + "epoch": 0.79, + "grad_norm": 1.423955233949179, + "learning_rate": 1.0887516255175568e-06, + "loss": 0.2663, + "step": 27318 + }, + { + "epoch": 0.79, + "grad_norm": 2.4238289778864868, + "learning_rate": 1.0884590313288307e-06, + "loss": 0.2869, + "step": 27319 + }, + { + "epoch": 0.79, + "grad_norm": 1.3071308615924182, + "learning_rate": 1.088166471659497e-06, + "loss": 0.2572, + "step": 27320 + }, + { + "epoch": 0.79, + "grad_norm": 1.3852624219844565, + "learning_rate": 1.0878739465121368e-06, + "loss": 0.2656, + "step": 27321 + }, + { + "epoch": 0.79, + "grad_norm": 1.8037326084257121, + "learning_rate": 1.0875814558893333e-06, + "loss": 0.2658, + "step": 27322 + }, + { + "epoch": 0.79, + "grad_norm": 1.2301299679017415, + "learning_rate": 1.0872889997936664e-06, + "loss": 0.2734, + "step": 27323 + }, + { + "epoch": 0.79, + "grad_norm": 1.416212554585002, + "learning_rate": 1.086996578227718e-06, + "loss": 0.2853, + "step": 27324 + }, + { + "epoch": 0.79, + "grad_norm": 1.4660928981333188, + "learning_rate": 1.086704191194069e-06, + "loss": 0.2922, + "step": 27325 + }, + { + "epoch": 0.79, + "grad_norm": 1.6115419538930789, + "learning_rate": 1.0864118386952976e-06, + "loss": 0.2839, + "step": 27326 + }, + { + "epoch": 0.79, + "grad_norm": 1.5845148807139808, + "learning_rate": 1.0861195207339858e-06, + "loss": 0.3117, + "step": 27327 + }, + { + "epoch": 0.79, + "grad_norm": 1.6033156927774979, + "learning_rate": 1.0858272373127132e-06, + "loss": 0.2648, + "step": 27328 + }, + { + "epoch": 0.79, + "grad_norm": 1.4386031291560826, + "learning_rate": 1.0855349884340577e-06, + "loss": 0.2711, + "step": 27329 + }, + { + "epoch": 0.79, + "grad_norm": 1.254570021580822, + "learning_rate": 1.0852427741005995e-06, + "loss": 0.2623, + "step": 27330 + }, + { + "epoch": 0.79, + "grad_norm": 1.385070964351979, + "learning_rate": 1.0849505943149168e-06, + "loss": 0.274, + "step": 27331 + }, + { + "epoch": 0.79, + "grad_norm": 1.343362576445443, + "learning_rate": 1.0846584490795886e-06, + "loss": 0.2717, + "step": 27332 + }, + { + "epoch": 0.79, + "grad_norm": 1.3162541843251656, + "learning_rate": 1.0843663383971937e-06, + "loss": 0.2726, + "step": 27333 + }, + { + "epoch": 0.79, + "grad_norm": 1.3504570411230752, + "learning_rate": 1.0840742622703087e-06, + "loss": 0.2609, + "step": 27334 + }, + { + "epoch": 0.79, + "grad_norm": 1.921049035923983, + "learning_rate": 1.0837822207015114e-06, + "loss": 0.268, + "step": 27335 + }, + { + "epoch": 0.79, + "grad_norm": 1.4767616615876387, + "learning_rate": 1.08349021369338e-06, + "loss": 0.2751, + "step": 27336 + }, + { + "epoch": 0.79, + "grad_norm": 1.522084906902276, + "learning_rate": 1.0831982412484925e-06, + "loss": 0.2832, + "step": 27337 + }, + { + "epoch": 0.79, + "grad_norm": 1.4562143407829247, + "learning_rate": 1.0829063033694221e-06, + "loss": 0.2883, + "step": 27338 + }, + { + "epoch": 0.79, + "grad_norm": 1.6016035753737723, + "learning_rate": 1.0826144000587474e-06, + "loss": 0.2969, + "step": 27339 + }, + { + "epoch": 0.79, + "grad_norm": 1.2440324720747877, + "learning_rate": 1.082322531319044e-06, + "loss": 0.2706, + "step": 27340 + }, + { + "epoch": 0.79, + "grad_norm": 1.2105710300925177, + "learning_rate": 1.0820306971528876e-06, + "loss": 0.2592, + "step": 27341 + }, + { + "epoch": 0.79, + "grad_norm": 1.3584797877644668, + "learning_rate": 1.0817388975628535e-06, + "loss": 0.2588, + "step": 27342 + }, + { + "epoch": 0.79, + "grad_norm": 1.273953398109103, + "learning_rate": 1.0814471325515174e-06, + "loss": 0.2676, + "step": 27343 + }, + { + "epoch": 0.79, + "grad_norm": 1.3048822606933166, + "learning_rate": 1.0811554021214539e-06, + "loss": 0.2453, + "step": 27344 + }, + { + "epoch": 0.79, + "grad_norm": 1.3672915120143037, + "learning_rate": 1.0808637062752376e-06, + "loss": 0.2701, + "step": 27345 + }, + { + "epoch": 0.79, + "grad_norm": 1.4906823161041496, + "learning_rate": 1.0805720450154433e-06, + "loss": 0.2727, + "step": 27346 + }, + { + "epoch": 0.79, + "grad_norm": 1.3754993539855802, + "learning_rate": 1.080280418344643e-06, + "loss": 0.2752, + "step": 27347 + }, + { + "epoch": 0.79, + "grad_norm": 1.3264715516750754, + "learning_rate": 1.0799888262654118e-06, + "loss": 0.2808, + "step": 27348 + }, + { + "epoch": 0.79, + "grad_norm": 1.388473208619688, + "learning_rate": 1.0796972687803226e-06, + "loss": 0.2811, + "step": 27349 + }, + { + "epoch": 0.79, + "grad_norm": 1.37927166718243, + "learning_rate": 1.0794057458919487e-06, + "loss": 0.2983, + "step": 27350 + }, + { + "epoch": 0.79, + "grad_norm": 1.3582325754141917, + "learning_rate": 1.0791142576028623e-06, + "loss": 0.2926, + "step": 27351 + }, + { + "epoch": 0.79, + "grad_norm": 1.3415570026906836, + "learning_rate": 1.0788228039156362e-06, + "loss": 0.2876, + "step": 27352 + }, + { + "epoch": 0.79, + "grad_norm": 1.4741747090409605, + "learning_rate": 1.0785313848328422e-06, + "loss": 0.2803, + "step": 27353 + }, + { + "epoch": 0.79, + "grad_norm": 1.5300370857260872, + "learning_rate": 1.0782400003570538e-06, + "loss": 0.2663, + "step": 27354 + }, + { + "epoch": 0.79, + "grad_norm": 1.3403789345509165, + "learning_rate": 1.0779486504908392e-06, + "loss": 0.2797, + "step": 27355 + }, + { + "epoch": 0.79, + "grad_norm": 2.9320970914881817, + "learning_rate": 1.077657335236773e-06, + "loss": 0.2534, + "step": 27356 + }, + { + "epoch": 0.79, + "grad_norm": 1.6431705452843532, + "learning_rate": 1.0773660545974229e-06, + "loss": 0.2516, + "step": 27357 + }, + { + "epoch": 0.79, + "grad_norm": 1.1938041313484449, + "learning_rate": 1.0770748085753608e-06, + "loss": 0.2487, + "step": 27358 + }, + { + "epoch": 0.79, + "grad_norm": 1.6089086394825072, + "learning_rate": 1.0767835971731572e-06, + "loss": 0.273, + "step": 27359 + }, + { + "epoch": 0.79, + "grad_norm": 1.2479756586272448, + "learning_rate": 1.0764924203933814e-06, + "loss": 0.2893, + "step": 27360 + }, + { + "epoch": 0.79, + "grad_norm": 1.232448932858792, + "learning_rate": 1.0762012782386038e-06, + "loss": 0.2699, + "step": 27361 + }, + { + "epoch": 0.79, + "grad_norm": 1.246250670625487, + "learning_rate": 1.0759101707113933e-06, + "loss": 0.2625, + "step": 27362 + }, + { + "epoch": 0.79, + "grad_norm": 1.2470619722604184, + "learning_rate": 1.0756190978143193e-06, + "loss": 0.2653, + "step": 27363 + }, + { + "epoch": 0.79, + "grad_norm": 1.3944823755937426, + "learning_rate": 1.0753280595499522e-06, + "loss": 0.2841, + "step": 27364 + }, + { + "epoch": 0.79, + "grad_norm": 1.2434480072478948, + "learning_rate": 1.0750370559208562e-06, + "loss": 0.2768, + "step": 27365 + }, + { + "epoch": 0.79, + "grad_norm": 1.2007701526783388, + "learning_rate": 1.0747460869296023e-06, + "loss": 0.2542, + "step": 27366 + }, + { + "epoch": 0.79, + "grad_norm": 1.4002842467742014, + "learning_rate": 1.0744551525787573e-06, + "loss": 0.2539, + "step": 27367 + }, + { + "epoch": 0.79, + "grad_norm": 1.3027667881519711, + "learning_rate": 1.0741642528708896e-06, + "loss": 0.282, + "step": 27368 + }, + { + "epoch": 0.79, + "grad_norm": 1.4395704812219552, + "learning_rate": 1.0738733878085655e-06, + "loss": 0.2678, + "step": 27369 + }, + { + "epoch": 0.79, + "grad_norm": 1.564808962827795, + "learning_rate": 1.0735825573943526e-06, + "loss": 0.2842, + "step": 27370 + }, + { + "epoch": 0.79, + "grad_norm": 1.5329114993451793, + "learning_rate": 1.073291761630817e-06, + "loss": 0.2931, + "step": 27371 + }, + { + "epoch": 0.79, + "grad_norm": 1.4554410404805411, + "learning_rate": 1.073001000520525e-06, + "loss": 0.3057, + "step": 27372 + }, + { + "epoch": 0.79, + "grad_norm": 1.3710497400195638, + "learning_rate": 1.0727102740660443e-06, + "loss": 0.2886, + "step": 27373 + }, + { + "epoch": 0.79, + "grad_norm": 1.4042729308170256, + "learning_rate": 1.072419582269938e-06, + "loss": 0.2829, + "step": 27374 + }, + { + "epoch": 0.79, + "grad_norm": 1.6296074422219688, + "learning_rate": 1.0721289251347721e-06, + "loss": 0.2709, + "step": 27375 + }, + { + "epoch": 0.79, + "grad_norm": 1.273696109681736, + "learning_rate": 1.071838302663112e-06, + "loss": 0.2845, + "step": 27376 + }, + { + "epoch": 0.79, + "grad_norm": 1.3174380003812662, + "learning_rate": 1.071547714857522e-06, + "loss": 0.2833, + "step": 27377 + }, + { + "epoch": 0.79, + "grad_norm": 2.6776295440106677, + "learning_rate": 1.0712571617205681e-06, + "loss": 0.2693, + "step": 27378 + }, + { + "epoch": 0.79, + "grad_norm": 1.352849498614018, + "learning_rate": 1.070966643254812e-06, + "loss": 0.2793, + "step": 27379 + }, + { + "epoch": 0.79, + "grad_norm": 1.493509359992807, + "learning_rate": 1.07067615946282e-06, + "loss": 0.2823, + "step": 27380 + }, + { + "epoch": 0.79, + "grad_norm": 1.338146435947829, + "learning_rate": 1.0703857103471537e-06, + "loss": 0.277, + "step": 27381 + }, + { + "epoch": 0.79, + "grad_norm": 1.3172417853524239, + "learning_rate": 1.0700952959103788e-06, + "loss": 0.2705, + "step": 27382 + }, + { + "epoch": 0.79, + "grad_norm": 1.5561881162132432, + "learning_rate": 1.0698049161550567e-06, + "loss": 0.2779, + "step": 27383 + }, + { + "epoch": 0.79, + "grad_norm": 1.3342820888525297, + "learning_rate": 1.0695145710837484e-06, + "loss": 0.2687, + "step": 27384 + }, + { + "epoch": 0.79, + "grad_norm": 1.422385778285871, + "learning_rate": 1.0692242606990177e-06, + "loss": 0.2645, + "step": 27385 + }, + { + "epoch": 0.79, + "grad_norm": 1.1902845787076957, + "learning_rate": 1.0689339850034258e-06, + "loss": 0.2664, + "step": 27386 + }, + { + "epoch": 0.79, + "grad_norm": 1.4753074761344978, + "learning_rate": 1.0686437439995357e-06, + "loss": 0.2613, + "step": 27387 + }, + { + "epoch": 0.79, + "grad_norm": 0.9730678802695807, + "learning_rate": 1.0683535376899085e-06, + "loss": 0.5628, + "step": 27388 + }, + { + "epoch": 0.79, + "grad_norm": 2.341394059516323, + "learning_rate": 1.068063366077104e-06, + "loss": 0.2871, + "step": 27389 + }, + { + "epoch": 0.79, + "grad_norm": 1.3457709252610335, + "learning_rate": 1.0677732291636844e-06, + "loss": 0.2678, + "step": 27390 + }, + { + "epoch": 0.79, + "grad_norm": 1.3109215647642998, + "learning_rate": 1.0674831269522102e-06, + "loss": 0.2891, + "step": 27391 + }, + { + "epoch": 0.79, + "grad_norm": 1.291474072670058, + "learning_rate": 1.0671930594452419e-06, + "loss": 0.274, + "step": 27392 + }, + { + "epoch": 0.79, + "grad_norm": 1.8083923702615534, + "learning_rate": 1.0669030266453367e-06, + "loss": 0.3621, + "step": 27393 + }, + { + "epoch": 0.79, + "grad_norm": 1.385766433383911, + "learning_rate": 1.0666130285550564e-06, + "loss": 0.2878, + "step": 27394 + }, + { + "epoch": 0.79, + "grad_norm": 1.2812466393385435, + "learning_rate": 1.0663230651769596e-06, + "loss": 0.2698, + "step": 27395 + }, + { + "epoch": 0.79, + "grad_norm": 1.5682794729817415, + "learning_rate": 1.0660331365136056e-06, + "loss": 0.2767, + "step": 27396 + }, + { + "epoch": 0.79, + "grad_norm": 1.4306004648699602, + "learning_rate": 1.0657432425675523e-06, + "loss": 0.2703, + "step": 27397 + }, + { + "epoch": 0.79, + "grad_norm": 1.3657814426567731, + "learning_rate": 1.0654533833413594e-06, + "loss": 0.271, + "step": 27398 + }, + { + "epoch": 0.79, + "grad_norm": 1.3708122107104375, + "learning_rate": 1.065163558837583e-06, + "loss": 0.274, + "step": 27399 + }, + { + "epoch": 0.79, + "grad_norm": 1.720550862625774, + "learning_rate": 1.064873769058784e-06, + "loss": 0.2929, + "step": 27400 + }, + { + "epoch": 0.79, + "grad_norm": 1.4592792180644447, + "learning_rate": 1.064584014007516e-06, + "loss": 0.2838, + "step": 27401 + }, + { + "epoch": 0.79, + "grad_norm": 1.2616716266246493, + "learning_rate": 1.0642942936863375e-06, + "loss": 0.2843, + "step": 27402 + }, + { + "epoch": 0.79, + "grad_norm": 1.4286357507485243, + "learning_rate": 1.0640046080978061e-06, + "loss": 0.289, + "step": 27403 + }, + { + "epoch": 0.79, + "grad_norm": 1.4048337147701808, + "learning_rate": 1.0637149572444772e-06, + "loss": 0.274, + "step": 27404 + }, + { + "epoch": 0.79, + "grad_norm": 1.373675519613654, + "learning_rate": 1.063425341128908e-06, + "loss": 0.2833, + "step": 27405 + }, + { + "epoch": 0.79, + "grad_norm": 1.417828133865759, + "learning_rate": 1.0631357597536535e-06, + "loss": 0.2609, + "step": 27406 + }, + { + "epoch": 0.79, + "grad_norm": 1.2075135109608999, + "learning_rate": 1.0628462131212691e-06, + "loss": 0.2795, + "step": 27407 + }, + { + "epoch": 0.79, + "grad_norm": 1.2940338980909105, + "learning_rate": 1.0625567012343113e-06, + "loss": 0.2738, + "step": 27408 + }, + { + "epoch": 0.79, + "grad_norm": 1.2942431899277147, + "learning_rate": 1.0622672240953351e-06, + "loss": 0.2601, + "step": 27409 + }, + { + "epoch": 0.8, + "grad_norm": 1.2917045338357274, + "learning_rate": 1.0619777817068932e-06, + "loss": 0.2813, + "step": 27410 + }, + { + "epoch": 0.8, + "grad_norm": 1.5757224394883156, + "learning_rate": 1.0616883740715428e-06, + "loss": 0.2879, + "step": 27411 + }, + { + "epoch": 0.8, + "grad_norm": 3.6451331083192033, + "learning_rate": 1.0613990011918345e-06, + "loss": 0.2846, + "step": 27412 + }, + { + "epoch": 0.8, + "grad_norm": 1.4314197716095005, + "learning_rate": 1.0611096630703237e-06, + "loss": 0.2857, + "step": 27413 + }, + { + "epoch": 0.8, + "grad_norm": 1.3951697765563187, + "learning_rate": 1.060820359709564e-06, + "loss": 0.3055, + "step": 27414 + }, + { + "epoch": 0.8, + "grad_norm": 1.496981152344416, + "learning_rate": 1.060531091112108e-06, + "loss": 0.2784, + "step": 27415 + }, + { + "epoch": 0.8, + "grad_norm": 1.3426793820140377, + "learning_rate": 1.060241857280509e-06, + "loss": 0.2615, + "step": 27416 + }, + { + "epoch": 0.8, + "grad_norm": 1.3259943903815907, + "learning_rate": 1.0599526582173192e-06, + "loss": 0.2799, + "step": 27417 + }, + { + "epoch": 0.8, + "grad_norm": 1.4802445275988503, + "learning_rate": 1.059663493925091e-06, + "loss": 0.2795, + "step": 27418 + }, + { + "epoch": 0.8, + "grad_norm": 1.8096286186161612, + "learning_rate": 1.0593743644063775e-06, + "loss": 0.2635, + "step": 27419 + }, + { + "epoch": 0.8, + "grad_norm": 1.4467030718871237, + "learning_rate": 1.0590852696637278e-06, + "loss": 0.281, + "step": 27420 + }, + { + "epoch": 0.8, + "grad_norm": 1.5674016407393656, + "learning_rate": 1.058796209699694e-06, + "loss": 0.2815, + "step": 27421 + }, + { + "epoch": 0.8, + "grad_norm": 1.2653294373645372, + "learning_rate": 1.058507184516827e-06, + "loss": 0.2847, + "step": 27422 + }, + { + "epoch": 0.8, + "grad_norm": 1.2996987923231451, + "learning_rate": 1.058218194117679e-06, + "loss": 0.2621, + "step": 27423 + }, + { + "epoch": 0.8, + "grad_norm": 1.6200255120685771, + "learning_rate": 1.0579292385047978e-06, + "loss": 0.2639, + "step": 27424 + }, + { + "epoch": 0.8, + "grad_norm": 1.202495493862016, + "learning_rate": 1.0576403176807359e-06, + "loss": 0.3163, + "step": 27425 + }, + { + "epoch": 0.8, + "grad_norm": 1.4762589485460302, + "learning_rate": 1.0573514316480415e-06, + "loss": 0.2551, + "step": 27426 + }, + { + "epoch": 0.8, + "grad_norm": 1.3410160370001853, + "learning_rate": 1.0570625804092643e-06, + "loss": 0.2792, + "step": 27427 + }, + { + "epoch": 0.8, + "grad_norm": 1.324739346286493, + "learning_rate": 1.056773763966955e-06, + "loss": 0.2865, + "step": 27428 + }, + { + "epoch": 0.8, + "grad_norm": 1.2430499071902816, + "learning_rate": 1.05648498232366e-06, + "loss": 0.255, + "step": 27429 + }, + { + "epoch": 0.8, + "grad_norm": 1.3484327963614802, + "learning_rate": 1.0561962354819282e-06, + "loss": 0.2627, + "step": 27430 + }, + { + "epoch": 0.8, + "grad_norm": 1.4456272776131909, + "learning_rate": 1.055907523444309e-06, + "loss": 0.2802, + "step": 27431 + }, + { + "epoch": 0.8, + "grad_norm": 1.2704246965453159, + "learning_rate": 1.0556188462133493e-06, + "loss": 0.2769, + "step": 27432 + }, + { + "epoch": 0.8, + "grad_norm": 1.3906715524851279, + "learning_rate": 1.0553302037915968e-06, + "loss": 0.3201, + "step": 27433 + }, + { + "epoch": 0.8, + "grad_norm": 1.4948008573531184, + "learning_rate": 1.0550415961815992e-06, + "loss": 0.2943, + "step": 27434 + }, + { + "epoch": 0.8, + "grad_norm": 1.3105148466579237, + "learning_rate": 1.0547530233859033e-06, + "loss": 0.2433, + "step": 27435 + }, + { + "epoch": 0.8, + "grad_norm": 1.3057809936809743, + "learning_rate": 1.0544644854070574e-06, + "loss": 0.2717, + "step": 27436 + }, + { + "epoch": 0.8, + "grad_norm": 1.512253767800445, + "learning_rate": 1.0541759822476044e-06, + "loss": 0.2636, + "step": 27437 + }, + { + "epoch": 0.8, + "grad_norm": 1.3173237518308367, + "learning_rate": 1.0538875139100934e-06, + "loss": 0.2754, + "step": 27438 + }, + { + "epoch": 0.8, + "grad_norm": 1.3707209178545658, + "learning_rate": 1.053599080397068e-06, + "loss": 0.2815, + "step": 27439 + }, + { + "epoch": 0.8, + "grad_norm": 1.219832800031881, + "learning_rate": 1.0533106817110745e-06, + "loss": 0.269, + "step": 27440 + }, + { + "epoch": 0.8, + "grad_norm": 1.5507939626612712, + "learning_rate": 1.0530223178546578e-06, + "loss": 0.2731, + "step": 27441 + }, + { + "epoch": 0.8, + "grad_norm": 1.2797332245786035, + "learning_rate": 1.0527339888303629e-06, + "loss": 0.2891, + "step": 27442 + }, + { + "epoch": 0.8, + "grad_norm": 1.3899106161150434, + "learning_rate": 1.052445694640734e-06, + "loss": 0.266, + "step": 27443 + }, + { + "epoch": 0.8, + "grad_norm": 1.3725835351068354, + "learning_rate": 1.0521574352883162e-06, + "loss": 0.2782, + "step": 27444 + }, + { + "epoch": 0.8, + "grad_norm": 2.216898774449747, + "learning_rate": 1.0518692107756524e-06, + "loss": 0.2626, + "step": 27445 + }, + { + "epoch": 0.8, + "grad_norm": 1.430855121191521, + "learning_rate": 1.0515810211052867e-06, + "loss": 0.3029, + "step": 27446 + }, + { + "epoch": 0.8, + "grad_norm": 1.4633894084935724, + "learning_rate": 1.051292866279764e-06, + "loss": 0.2571, + "step": 27447 + }, + { + "epoch": 0.8, + "grad_norm": 1.3609215652033657, + "learning_rate": 1.0510047463016238e-06, + "loss": 0.2797, + "step": 27448 + }, + { + "epoch": 0.8, + "grad_norm": 1.2870090155170628, + "learning_rate": 1.050716661173411e-06, + "loss": 0.2866, + "step": 27449 + }, + { + "epoch": 0.8, + "grad_norm": 1.3634297886074467, + "learning_rate": 1.0504286108976668e-06, + "loss": 0.2629, + "step": 27450 + }, + { + "epoch": 0.8, + "grad_norm": 1.292411224759929, + "learning_rate": 1.0501405954769345e-06, + "loss": 0.2691, + "step": 27451 + }, + { + "epoch": 0.8, + "grad_norm": 1.3620608397318996, + "learning_rate": 1.049852614913755e-06, + "loss": 0.2875, + "step": 27452 + }, + { + "epoch": 0.8, + "grad_norm": 1.5461888917642592, + "learning_rate": 1.0495646692106704e-06, + "loss": 0.286, + "step": 27453 + }, + { + "epoch": 0.8, + "grad_norm": 2.405305084202979, + "learning_rate": 1.0492767583702208e-06, + "loss": 0.2666, + "step": 27454 + }, + { + "epoch": 0.8, + "grad_norm": 1.3623488061571316, + "learning_rate": 1.048988882394949e-06, + "loss": 0.273, + "step": 27455 + }, + { + "epoch": 0.8, + "grad_norm": 1.3124736031780515, + "learning_rate": 1.0487010412873933e-06, + "loss": 0.2547, + "step": 27456 + }, + { + "epoch": 0.8, + "grad_norm": 1.4167696307109303, + "learning_rate": 1.0484132350500942e-06, + "loss": 0.2596, + "step": 27457 + }, + { + "epoch": 0.8, + "grad_norm": 1.344030406561979, + "learning_rate": 1.0481254636855926e-06, + "loss": 0.2892, + "step": 27458 + }, + { + "epoch": 0.8, + "grad_norm": 1.5205129136232522, + "learning_rate": 1.0478377271964275e-06, + "loss": 0.2769, + "step": 27459 + }, + { + "epoch": 0.8, + "grad_norm": 1.9104693163810798, + "learning_rate": 1.0475500255851384e-06, + "loss": 0.2711, + "step": 27460 + }, + { + "epoch": 0.8, + "grad_norm": 2.0069764890953765, + "learning_rate": 1.047262358854264e-06, + "loss": 0.2749, + "step": 27461 + }, + { + "epoch": 0.8, + "grad_norm": 1.3758462552488298, + "learning_rate": 1.046974727006343e-06, + "loss": 0.2721, + "step": 27462 + }, + { + "epoch": 0.8, + "grad_norm": 1.3583891215265806, + "learning_rate": 1.0466871300439142e-06, + "loss": 0.2661, + "step": 27463 + }, + { + "epoch": 0.8, + "grad_norm": 1.4001390963212244, + "learning_rate": 1.0463995679695165e-06, + "loss": 0.2774, + "step": 27464 + }, + { + "epoch": 0.8, + "grad_norm": 1.3065717611052448, + "learning_rate": 1.046112040785685e-06, + "loss": 0.2675, + "step": 27465 + }, + { + "epoch": 0.8, + "grad_norm": 1.300498201544318, + "learning_rate": 1.0458245484949602e-06, + "loss": 0.2577, + "step": 27466 + }, + { + "epoch": 0.8, + "grad_norm": 1.2833246412158403, + "learning_rate": 1.0455370910998764e-06, + "loss": 0.2689, + "step": 27467 + }, + { + "epoch": 0.8, + "grad_norm": 1.3794832283333716, + "learning_rate": 1.0452496686029711e-06, + "loss": 0.2723, + "step": 27468 + }, + { + "epoch": 0.8, + "grad_norm": 2.0428240488990483, + "learning_rate": 1.0449622810067816e-06, + "loss": 0.2844, + "step": 27469 + }, + { + "epoch": 0.8, + "grad_norm": 1.6250739418336615, + "learning_rate": 1.0446749283138442e-06, + "loss": 0.2805, + "step": 27470 + }, + { + "epoch": 0.8, + "grad_norm": 1.5415687654663912, + "learning_rate": 1.0443876105266936e-06, + "loss": 0.281, + "step": 27471 + }, + { + "epoch": 0.8, + "grad_norm": 1.309796355073765, + "learning_rate": 1.0441003276478672e-06, + "loss": 0.2549, + "step": 27472 + }, + { + "epoch": 0.8, + "grad_norm": 1.701443064127207, + "learning_rate": 1.0438130796798985e-06, + "loss": 0.2802, + "step": 27473 + }, + { + "epoch": 0.8, + "grad_norm": 1.298644505558661, + "learning_rate": 1.043525866625325e-06, + "loss": 0.2901, + "step": 27474 + }, + { + "epoch": 0.8, + "grad_norm": 0.9772022231009491, + "learning_rate": 1.0432386884866786e-06, + "loss": 0.594, + "step": 27475 + }, + { + "epoch": 0.8, + "grad_norm": 1.2305586025975794, + "learning_rate": 1.042951545266494e-06, + "loss": 0.276, + "step": 27476 + }, + { + "epoch": 0.8, + "grad_norm": 1.3074378957503014, + "learning_rate": 1.042664436967306e-06, + "loss": 0.2849, + "step": 27477 + }, + { + "epoch": 0.8, + "grad_norm": 1.3685133292552891, + "learning_rate": 1.0423773635916484e-06, + "loss": 0.2607, + "step": 27478 + }, + { + "epoch": 0.8, + "grad_norm": 1.3432931763243783, + "learning_rate": 1.042090325142055e-06, + "loss": 0.2808, + "step": 27479 + }, + { + "epoch": 0.8, + "grad_norm": 1.4861431644488683, + "learning_rate": 1.0418033216210577e-06, + "loss": 0.2672, + "step": 27480 + }, + { + "epoch": 0.8, + "grad_norm": 1.3244256113351018, + "learning_rate": 1.0415163530311901e-06, + "loss": 0.2778, + "step": 27481 + }, + { + "epoch": 0.8, + "grad_norm": 1.2657742545725166, + "learning_rate": 1.041229419374985e-06, + "loss": 0.2641, + "step": 27482 + }, + { + "epoch": 0.8, + "grad_norm": 1.4165331848085054, + "learning_rate": 1.040942520654975e-06, + "loss": 0.2693, + "step": 27483 + }, + { + "epoch": 0.8, + "grad_norm": 1.2650466415818726, + "learning_rate": 1.0406556568736903e-06, + "loss": 0.253, + "step": 27484 + }, + { + "epoch": 0.8, + "grad_norm": 1.2219314005907076, + "learning_rate": 1.0403688280336626e-06, + "loss": 0.2646, + "step": 27485 + }, + { + "epoch": 0.8, + "grad_norm": 1.3244989612344547, + "learning_rate": 1.0400820341374246e-06, + "loss": 0.2891, + "step": 27486 + }, + { + "epoch": 0.8, + "grad_norm": 1.244387362457406, + "learning_rate": 1.0397952751875067e-06, + "loss": 0.2547, + "step": 27487 + }, + { + "epoch": 0.8, + "grad_norm": 1.307247118433011, + "learning_rate": 1.0395085511864389e-06, + "loss": 0.2765, + "step": 27488 + }, + { + "epoch": 0.8, + "grad_norm": 1.3733546504309297, + "learning_rate": 1.0392218621367522e-06, + "loss": 0.2919, + "step": 27489 + }, + { + "epoch": 0.8, + "grad_norm": 1.2636257819105319, + "learning_rate": 1.038935208040977e-06, + "loss": 0.2779, + "step": 27490 + }, + { + "epoch": 0.8, + "grad_norm": 1.9772020101868142, + "learning_rate": 1.0386485889016435e-06, + "loss": 0.2568, + "step": 27491 + }, + { + "epoch": 0.8, + "grad_norm": 1.3403088278173116, + "learning_rate": 1.0383620047212783e-06, + "loss": 0.3018, + "step": 27492 + }, + { + "epoch": 0.8, + "grad_norm": 12.097849336392986, + "learning_rate": 1.038075455502413e-06, + "loss": 0.2792, + "step": 27493 + }, + { + "epoch": 0.8, + "grad_norm": 1.459772224562416, + "learning_rate": 1.0377889412475772e-06, + "loss": 0.2675, + "step": 27494 + }, + { + "epoch": 0.8, + "grad_norm": 1.4826768846593108, + "learning_rate": 1.037502461959296e-06, + "loss": 0.2783, + "step": 27495 + }, + { + "epoch": 0.8, + "grad_norm": 1.324484266138753, + "learning_rate": 1.0372160176401002e-06, + "loss": 0.2681, + "step": 27496 + }, + { + "epoch": 0.8, + "grad_norm": 1.3176660376445504, + "learning_rate": 1.0369296082925167e-06, + "loss": 0.2994, + "step": 27497 + }, + { + "epoch": 0.8, + "grad_norm": 1.6151981213081057, + "learning_rate": 1.0366432339190735e-06, + "loss": 0.2768, + "step": 27498 + }, + { + "epoch": 0.8, + "grad_norm": 1.2617356823788417, + "learning_rate": 1.0363568945222973e-06, + "loss": 0.2776, + "step": 27499 + }, + { + "epoch": 0.8, + "grad_norm": 1.280242625846359, + "learning_rate": 1.0360705901047157e-06, + "loss": 0.2901, + "step": 27500 + }, + { + "epoch": 0.8, + "grad_norm": 1.3317871828038566, + "learning_rate": 1.0357843206688555e-06, + "loss": 0.2466, + "step": 27501 + }, + { + "epoch": 0.8, + "grad_norm": 3.491717199883999, + "learning_rate": 1.0354980862172437e-06, + "loss": 0.2825, + "step": 27502 + }, + { + "epoch": 0.8, + "grad_norm": 3.1203317476488843, + "learning_rate": 1.0352118867524037e-06, + "loss": 0.2558, + "step": 27503 + }, + { + "epoch": 0.8, + "grad_norm": 1.4048942785762748, + "learning_rate": 1.0349257222768633e-06, + "loss": 0.258, + "step": 27504 + }, + { + "epoch": 0.8, + "grad_norm": 1.4694649128435064, + "learning_rate": 1.0346395927931469e-06, + "loss": 0.2751, + "step": 27505 + }, + { + "epoch": 0.8, + "grad_norm": 1.3826640790073206, + "learning_rate": 1.03435349830378e-06, + "loss": 0.2724, + "step": 27506 + }, + { + "epoch": 0.8, + "grad_norm": 1.7346421228432396, + "learning_rate": 1.0340674388112875e-06, + "loss": 0.29, + "step": 27507 + }, + { + "epoch": 0.8, + "grad_norm": 1.3932669816244845, + "learning_rate": 1.0337814143181946e-06, + "loss": 0.2604, + "step": 27508 + }, + { + "epoch": 0.8, + "grad_norm": 1.6940534748973415, + "learning_rate": 1.033495424827024e-06, + "loss": 0.2628, + "step": 27509 + }, + { + "epoch": 0.8, + "grad_norm": 1.3724434431345258, + "learning_rate": 1.0332094703403018e-06, + "loss": 0.2633, + "step": 27510 + }, + { + "epoch": 0.8, + "grad_norm": 2.0898053293342334, + "learning_rate": 1.032923550860549e-06, + "loss": 0.2724, + "step": 27511 + }, + { + "epoch": 0.8, + "grad_norm": 1.689988038648721, + "learning_rate": 1.0326376663902892e-06, + "loss": 0.2811, + "step": 27512 + }, + { + "epoch": 0.8, + "grad_norm": 1.3249729743965544, + "learning_rate": 1.0323518169320462e-06, + "loss": 0.3311, + "step": 27513 + }, + { + "epoch": 0.8, + "grad_norm": 1.3559617851141532, + "learning_rate": 1.0320660024883428e-06, + "loss": 0.2639, + "step": 27514 + }, + { + "epoch": 0.8, + "grad_norm": 3.240565587268304, + "learning_rate": 1.0317802230617008e-06, + "loss": 0.2682, + "step": 27515 + }, + { + "epoch": 0.8, + "grad_norm": 1.2368824548076431, + "learning_rate": 1.0314944786546422e-06, + "loss": 0.2599, + "step": 27516 + }, + { + "epoch": 0.8, + "grad_norm": 1.2435058952548625, + "learning_rate": 1.0312087692696893e-06, + "loss": 0.2761, + "step": 27517 + }, + { + "epoch": 0.8, + "grad_norm": 1.3406774020970318, + "learning_rate": 1.0309230949093624e-06, + "loss": 0.2733, + "step": 27518 + }, + { + "epoch": 0.8, + "grad_norm": 1.2992735733880443, + "learning_rate": 1.030637455576185e-06, + "loss": 0.2469, + "step": 27519 + }, + { + "epoch": 0.8, + "grad_norm": 1.3241899250503903, + "learning_rate": 1.0303518512726746e-06, + "loss": 0.2612, + "step": 27520 + }, + { + "epoch": 0.8, + "grad_norm": 1.4505648296824625, + "learning_rate": 1.030066282001353e-06, + "loss": 0.2457, + "step": 27521 + }, + { + "epoch": 0.8, + "grad_norm": 1.5990144891210016, + "learning_rate": 1.029780747764742e-06, + "loss": 0.2842, + "step": 27522 + }, + { + "epoch": 0.8, + "grad_norm": 1.3522325587110322, + "learning_rate": 1.0294952485653587e-06, + "loss": 0.2776, + "step": 27523 + }, + { + "epoch": 0.8, + "grad_norm": 1.392060986279289, + "learning_rate": 1.0292097844057236e-06, + "loss": 0.2847, + "step": 27524 + }, + { + "epoch": 0.8, + "grad_norm": 1.9444989223821092, + "learning_rate": 1.0289243552883566e-06, + "loss": 0.262, + "step": 27525 + }, + { + "epoch": 0.8, + "grad_norm": 1.6495275159953873, + "learning_rate": 1.0286389612157759e-06, + "loss": 0.2763, + "step": 27526 + }, + { + "epoch": 0.8, + "grad_norm": 1.507182372576568, + "learning_rate": 1.0283536021905005e-06, + "loss": 0.2608, + "step": 27527 + }, + { + "epoch": 0.8, + "grad_norm": 1.3773136326622186, + "learning_rate": 1.0280682782150486e-06, + "loss": 0.2703, + "step": 27528 + }, + { + "epoch": 0.8, + "grad_norm": 1.3263913223731925, + "learning_rate": 1.0277829892919394e-06, + "loss": 0.2747, + "step": 27529 + }, + { + "epoch": 0.8, + "grad_norm": 1.456146505509853, + "learning_rate": 1.027497735423688e-06, + "loss": 0.2817, + "step": 27530 + }, + { + "epoch": 0.8, + "grad_norm": 1.2846329021186302, + "learning_rate": 1.027212516612814e-06, + "loss": 0.2834, + "step": 27531 + }, + { + "epoch": 0.8, + "grad_norm": 1.4459754645672476, + "learning_rate": 1.0269273328618328e-06, + "loss": 0.2497, + "step": 27532 + }, + { + "epoch": 0.8, + "grad_norm": 0.9806682430080828, + "learning_rate": 1.026642184173262e-06, + "loss": 0.5965, + "step": 27533 + }, + { + "epoch": 0.8, + "grad_norm": 1.3232138104474374, + "learning_rate": 1.0263570705496184e-06, + "loss": 0.2701, + "step": 27534 + }, + { + "epoch": 0.8, + "grad_norm": 1.2762334032967775, + "learning_rate": 1.0260719919934176e-06, + "loss": 0.2944, + "step": 27535 + }, + { + "epoch": 0.8, + "grad_norm": 1.285338939429056, + "learning_rate": 1.0257869485071752e-06, + "loss": 0.2646, + "step": 27536 + }, + { + "epoch": 0.8, + "grad_norm": 1.2944003680770106, + "learning_rate": 1.0255019400934074e-06, + "loss": 0.2952, + "step": 27537 + }, + { + "epoch": 0.8, + "grad_norm": 1.4909379737707942, + "learning_rate": 1.0252169667546302e-06, + "loss": 0.2643, + "step": 27538 + }, + { + "epoch": 0.8, + "grad_norm": 1.3046924015801566, + "learning_rate": 1.024932028493356e-06, + "loss": 0.2777, + "step": 27539 + }, + { + "epoch": 0.8, + "grad_norm": 1.3951867016615747, + "learning_rate": 1.024647125312101e-06, + "loss": 0.2671, + "step": 27540 + }, + { + "epoch": 0.8, + "grad_norm": 1.2875264115742242, + "learning_rate": 1.0243622572133793e-06, + "loss": 0.277, + "step": 27541 + }, + { + "epoch": 0.8, + "grad_norm": 1.3424317640096655, + "learning_rate": 1.0240774241997042e-06, + "loss": 0.2698, + "step": 27542 + }, + { + "epoch": 0.8, + "grad_norm": 1.0595632300424926, + "learning_rate": 1.023792626273591e-06, + "loss": 0.6377, + "step": 27543 + }, + { + "epoch": 0.8, + "grad_norm": 1.5288098032795019, + "learning_rate": 1.0235078634375512e-06, + "loss": 0.2614, + "step": 27544 + }, + { + "epoch": 0.8, + "grad_norm": 1.459661889952442, + "learning_rate": 1.0232231356940986e-06, + "loss": 0.2746, + "step": 27545 + }, + { + "epoch": 0.8, + "grad_norm": 1.281939983059331, + "learning_rate": 1.0229384430457474e-06, + "loss": 0.2899, + "step": 27546 + }, + { + "epoch": 0.8, + "grad_norm": 1.2428543531790304, + "learning_rate": 1.0226537854950068e-06, + "loss": 0.2716, + "step": 27547 + }, + { + "epoch": 0.8, + "grad_norm": 1.227728525542587, + "learning_rate": 1.0223691630443915e-06, + "loss": 0.2694, + "step": 27548 + }, + { + "epoch": 0.8, + "grad_norm": 1.9267075838487366, + "learning_rate": 1.022084575696412e-06, + "loss": 0.2805, + "step": 27549 + }, + { + "epoch": 0.8, + "grad_norm": 1.4050191036654618, + "learning_rate": 1.0218000234535818e-06, + "loss": 0.2875, + "step": 27550 + }, + { + "epoch": 0.8, + "grad_norm": 1.2675731728280024, + "learning_rate": 1.021515506318409e-06, + "loss": 0.2715, + "step": 27551 + }, + { + "epoch": 0.8, + "grad_norm": 1.4133926569494797, + "learning_rate": 1.021231024293406e-06, + "loss": 0.2599, + "step": 27552 + }, + { + "epoch": 0.8, + "grad_norm": 1.3921777206133816, + "learning_rate": 1.0209465773810834e-06, + "loss": 0.2745, + "step": 27553 + }, + { + "epoch": 0.8, + "grad_norm": 1.377938951133609, + "learning_rate": 1.020662165583951e-06, + "loss": 0.2785, + "step": 27554 + }, + { + "epoch": 0.8, + "grad_norm": 1.2358557152628504, + "learning_rate": 1.0203777889045197e-06, + "loss": 0.2714, + "step": 27555 + }, + { + "epoch": 0.8, + "grad_norm": 1.4000919695867113, + "learning_rate": 1.0200934473452983e-06, + "loss": 0.292, + "step": 27556 + }, + { + "epoch": 0.8, + "grad_norm": 1.5386951936650155, + "learning_rate": 1.0198091409087978e-06, + "loss": 0.2827, + "step": 27557 + }, + { + "epoch": 0.8, + "grad_norm": 1.5316858354326082, + "learning_rate": 1.0195248695975246e-06, + "loss": 0.275, + "step": 27558 + }, + { + "epoch": 0.8, + "grad_norm": 1.252665925569011, + "learning_rate": 1.0192406334139882e-06, + "loss": 0.2669, + "step": 27559 + }, + { + "epoch": 0.8, + "grad_norm": 1.302161597747127, + "learning_rate": 1.0189564323606976e-06, + "loss": 0.2788, + "step": 27560 + }, + { + "epoch": 0.8, + "grad_norm": 1.242054221866519, + "learning_rate": 1.018672266440161e-06, + "loss": 0.2965, + "step": 27561 + }, + { + "epoch": 0.8, + "grad_norm": 1.4244204412411718, + "learning_rate": 1.0183881356548853e-06, + "loss": 0.2756, + "step": 27562 + }, + { + "epoch": 0.8, + "grad_norm": 1.3108038590598592, + "learning_rate": 1.0181040400073784e-06, + "loss": 0.3016, + "step": 27563 + }, + { + "epoch": 0.8, + "grad_norm": 1.4892032447264023, + "learning_rate": 1.0178199795001481e-06, + "loss": 0.3062, + "step": 27564 + }, + { + "epoch": 0.8, + "grad_norm": 1.3908153430939996, + "learning_rate": 1.0175359541357016e-06, + "loss": 0.277, + "step": 27565 + }, + { + "epoch": 0.8, + "grad_norm": 1.257886143021464, + "learning_rate": 1.017251963916543e-06, + "loss": 0.2791, + "step": 27566 + }, + { + "epoch": 0.8, + "grad_norm": 1.5326924259153965, + "learning_rate": 1.01696800884518e-06, + "loss": 0.281, + "step": 27567 + }, + { + "epoch": 0.8, + "grad_norm": 1.4227701026919688, + "learning_rate": 1.0166840889241187e-06, + "loss": 0.2766, + "step": 27568 + }, + { + "epoch": 0.8, + "grad_norm": 1.5607040736490274, + "learning_rate": 1.0164002041558647e-06, + "loss": 0.2731, + "step": 27569 + }, + { + "epoch": 0.8, + "grad_norm": 1.8494146605512798, + "learning_rate": 1.016116354542923e-06, + "loss": 0.2825, + "step": 27570 + }, + { + "epoch": 0.8, + "grad_norm": 1.4078740738840707, + "learning_rate": 1.0158325400877983e-06, + "loss": 0.2725, + "step": 27571 + }, + { + "epoch": 0.8, + "grad_norm": 1.2915043594752915, + "learning_rate": 1.015548760792996e-06, + "loss": 0.259, + "step": 27572 + }, + { + "epoch": 0.8, + "grad_norm": 2.4313403819147625, + "learning_rate": 1.01526501666102e-06, + "loss": 0.2818, + "step": 27573 + }, + { + "epoch": 0.8, + "grad_norm": 1.6419168999551017, + "learning_rate": 1.0149813076943755e-06, + "loss": 0.2623, + "step": 27574 + }, + { + "epoch": 0.8, + "grad_norm": 1.7966623150907277, + "learning_rate": 1.0146976338955643e-06, + "loss": 0.2813, + "step": 27575 + }, + { + "epoch": 0.8, + "grad_norm": 1.4070889860646314, + "learning_rate": 1.0144139952670907e-06, + "loss": 0.2673, + "step": 27576 + }, + { + "epoch": 0.8, + "grad_norm": 2.8574115779292057, + "learning_rate": 1.014130391811457e-06, + "loss": 0.2656, + "step": 27577 + }, + { + "epoch": 0.8, + "grad_norm": 1.5727345089132556, + "learning_rate": 1.013846823531169e-06, + "loss": 0.2805, + "step": 27578 + }, + { + "epoch": 0.8, + "grad_norm": 1.3795636084304799, + "learning_rate": 1.013563290428725e-06, + "loss": 0.279, + "step": 27579 + }, + { + "epoch": 0.8, + "grad_norm": 1.5098253191465845, + "learning_rate": 1.0132797925066302e-06, + "loss": 0.2713, + "step": 27580 + }, + { + "epoch": 0.8, + "grad_norm": 1.2231028501637986, + "learning_rate": 1.0129963297673845e-06, + "loss": 0.2681, + "step": 27581 + }, + { + "epoch": 0.8, + "grad_norm": 1.003541393458924, + "learning_rate": 1.0127129022134907e-06, + "loss": 0.5631, + "step": 27582 + }, + { + "epoch": 0.8, + "grad_norm": 1.3624355103423653, + "learning_rate": 1.0124295098474502e-06, + "loss": 0.2981, + "step": 27583 + }, + { + "epoch": 0.8, + "grad_norm": 1.219122443818042, + "learning_rate": 1.0121461526717642e-06, + "loss": 0.2675, + "step": 27584 + }, + { + "epoch": 0.8, + "grad_norm": 1.3810357082879925, + "learning_rate": 1.0118628306889316e-06, + "loss": 0.2975, + "step": 27585 + }, + { + "epoch": 0.8, + "grad_norm": 1.3807382277481934, + "learning_rate": 1.0115795439014535e-06, + "loss": 0.2648, + "step": 27586 + }, + { + "epoch": 0.8, + "grad_norm": 2.3577510504138934, + "learning_rate": 1.0112962923118308e-06, + "loss": 0.2787, + "step": 27587 + }, + { + "epoch": 0.8, + "grad_norm": 1.4824656731142276, + "learning_rate": 1.0110130759225616e-06, + "loss": 0.2618, + "step": 27588 + }, + { + "epoch": 0.8, + "grad_norm": 1.430991179074635, + "learning_rate": 1.0107298947361472e-06, + "loss": 0.288, + "step": 27589 + }, + { + "epoch": 0.8, + "grad_norm": 1.3497694940561358, + "learning_rate": 1.010446748755085e-06, + "loss": 0.2741, + "step": 27590 + }, + { + "epoch": 0.8, + "grad_norm": 1.6381182429686716, + "learning_rate": 1.0101636379818753e-06, + "loss": 0.2893, + "step": 27591 + }, + { + "epoch": 0.8, + "grad_norm": 1.2967009176664828, + "learning_rate": 1.009880562419015e-06, + "loss": 0.2816, + "step": 27592 + }, + { + "epoch": 0.8, + "grad_norm": 1.4824100158241431, + "learning_rate": 1.0095975220690045e-06, + "loss": 0.2846, + "step": 27593 + }, + { + "epoch": 0.8, + "grad_norm": 1.3206266177503094, + "learning_rate": 1.0093145169343393e-06, + "loss": 0.2784, + "step": 27594 + }, + { + "epoch": 0.8, + "grad_norm": 1.2649056421397749, + "learning_rate": 1.0090315470175173e-06, + "loss": 0.2689, + "step": 27595 + }, + { + "epoch": 0.8, + "grad_norm": 1.2511258348960437, + "learning_rate": 1.008748612321036e-06, + "loss": 0.2531, + "step": 27596 + }, + { + "epoch": 0.8, + "grad_norm": 1.2910456255701588, + "learning_rate": 1.0084657128473928e-06, + "loss": 0.2576, + "step": 27597 + }, + { + "epoch": 0.8, + "grad_norm": 1.289458290063662, + "learning_rate": 1.0081828485990841e-06, + "loss": 0.3006, + "step": 27598 + }, + { + "epoch": 0.8, + "grad_norm": 1.680222239554162, + "learning_rate": 1.007900019578606e-06, + "loss": 0.2702, + "step": 27599 + }, + { + "epoch": 0.8, + "grad_norm": 1.2634018358557566, + "learning_rate": 1.0076172257884543e-06, + "loss": 0.2543, + "step": 27600 + }, + { + "epoch": 0.8, + "grad_norm": 1.575648963123019, + "learning_rate": 1.0073344672311263e-06, + "loss": 0.2668, + "step": 27601 + }, + { + "epoch": 0.8, + "grad_norm": 1.2365731203295767, + "learning_rate": 1.0070517439091144e-06, + "loss": 0.2918, + "step": 27602 + }, + { + "epoch": 0.8, + "grad_norm": 1.3133157527166317, + "learning_rate": 1.0067690558249156e-06, + "loss": 0.2625, + "step": 27603 + }, + { + "epoch": 0.8, + "grad_norm": 1.3007782476339198, + "learning_rate": 1.006486402981024e-06, + "loss": 0.2772, + "step": 27604 + }, + { + "epoch": 0.8, + "grad_norm": 1.263419222436857, + "learning_rate": 1.006203785379934e-06, + "loss": 0.2547, + "step": 27605 + }, + { + "epoch": 0.8, + "grad_norm": 1.3862938858420184, + "learning_rate": 1.005921203024141e-06, + "loss": 0.2546, + "step": 27606 + }, + { + "epoch": 0.8, + "grad_norm": 1.2432355055360038, + "learning_rate": 1.005638655916137e-06, + "loss": 0.2643, + "step": 27607 + }, + { + "epoch": 0.8, + "grad_norm": 1.3171680308691942, + "learning_rate": 1.005356144058416e-06, + "loss": 0.2882, + "step": 27608 + }, + { + "epoch": 0.8, + "grad_norm": 1.672941066626764, + "learning_rate": 1.0050736674534712e-06, + "loss": 0.2904, + "step": 27609 + }, + { + "epoch": 0.8, + "grad_norm": 1.275732915711851, + "learning_rate": 1.004791226103795e-06, + "loss": 0.2601, + "step": 27610 + }, + { + "epoch": 0.8, + "grad_norm": 1.243887314788148, + "learning_rate": 1.0045088200118812e-06, + "loss": 0.2478, + "step": 27611 + }, + { + "epoch": 0.8, + "grad_norm": 1.3819977614625427, + "learning_rate": 1.0042264491802227e-06, + "loss": 0.2808, + "step": 27612 + }, + { + "epoch": 0.8, + "grad_norm": 1.924797583453217, + "learning_rate": 1.0039441136113086e-06, + "loss": 0.2695, + "step": 27613 + }, + { + "epoch": 0.8, + "grad_norm": 1.2080541143903925, + "learning_rate": 1.0036618133076325e-06, + "loss": 0.2368, + "step": 27614 + }, + { + "epoch": 0.8, + "grad_norm": 1.706899218170599, + "learning_rate": 1.0033795482716851e-06, + "loss": 0.2795, + "step": 27615 + }, + { + "epoch": 0.8, + "grad_norm": 1.3948954885232627, + "learning_rate": 1.0030973185059573e-06, + "loss": 0.2585, + "step": 27616 + }, + { + "epoch": 0.8, + "grad_norm": 1.280247632342219, + "learning_rate": 1.0028151240129397e-06, + "loss": 0.2604, + "step": 27617 + }, + { + "epoch": 0.8, + "grad_norm": 1.3914557370091538, + "learning_rate": 1.0025329647951237e-06, + "loss": 0.268, + "step": 27618 + }, + { + "epoch": 0.8, + "grad_norm": 1.5175526099577468, + "learning_rate": 1.0022508408549981e-06, + "loss": 0.2783, + "step": 27619 + }, + { + "epoch": 0.8, + "grad_norm": 1.4526594347246213, + "learning_rate": 1.0019687521950544e-06, + "loss": 0.2641, + "step": 27620 + }, + { + "epoch": 0.8, + "grad_norm": 1.348911120490458, + "learning_rate": 1.0016866988177797e-06, + "loss": 0.2797, + "step": 27621 + }, + { + "epoch": 0.8, + "grad_norm": 1.3506949870485894, + "learning_rate": 1.0014046807256644e-06, + "loss": 0.2862, + "step": 27622 + }, + { + "epoch": 0.8, + "grad_norm": 1.427128883372389, + "learning_rate": 1.001122697921197e-06, + "loss": 0.2857, + "step": 27623 + }, + { + "epoch": 0.8, + "grad_norm": 1.2413612228146715, + "learning_rate": 1.0008407504068662e-06, + "loss": 0.2582, + "step": 27624 + }, + { + "epoch": 0.8, + "grad_norm": 1.4113839758923155, + "learning_rate": 1.00055883818516e-06, + "loss": 0.2746, + "step": 27625 + }, + { + "epoch": 0.8, + "grad_norm": 1.2152694053652016, + "learning_rate": 1.0002769612585667e-06, + "loss": 0.2519, + "step": 27626 + }, + { + "epoch": 0.8, + "grad_norm": 1.986914964708537, + "learning_rate": 9.999951196295733e-07, + "loss": 0.275, + "step": 27627 + }, + { + "epoch": 0.8, + "grad_norm": 1.3908009886082977, + "learning_rate": 9.997133133006675e-07, + "loss": 0.2707, + "step": 27628 + }, + { + "epoch": 0.8, + "grad_norm": 1.3085092124675433, + "learning_rate": 9.994315422743372e-07, + "loss": 0.2814, + "step": 27629 + }, + { + "epoch": 0.8, + "grad_norm": 1.2510346887801056, + "learning_rate": 9.99149806553067e-07, + "loss": 0.2649, + "step": 27630 + }, + { + "epoch": 0.8, + "grad_norm": 1.3051326957251368, + "learning_rate": 9.988681061393441e-07, + "loss": 0.2859, + "step": 27631 + }, + { + "epoch": 0.8, + "grad_norm": 1.9537624633481248, + "learning_rate": 9.985864410356545e-07, + "loss": 0.2744, + "step": 27632 + }, + { + "epoch": 0.8, + "grad_norm": 1.4742793928592395, + "learning_rate": 9.983048112444838e-07, + "loss": 0.2731, + "step": 27633 + }, + { + "epoch": 0.8, + "grad_norm": 1.657118260255836, + "learning_rate": 9.980232167683173e-07, + "loss": 0.2472, + "step": 27634 + }, + { + "epoch": 0.8, + "grad_norm": 1.4578258694163642, + "learning_rate": 9.977416576096416e-07, + "loss": 0.2923, + "step": 27635 + }, + { + "epoch": 0.8, + "grad_norm": 1.5255156091858109, + "learning_rate": 9.974601337709394e-07, + "loss": 0.3014, + "step": 27636 + }, + { + "epoch": 0.8, + "grad_norm": 1.6096834421644872, + "learning_rate": 9.971786452546955e-07, + "loss": 0.3128, + "step": 27637 + }, + { + "epoch": 0.8, + "grad_norm": 0.9017440992828311, + "learning_rate": 9.968971920633946e-07, + "loss": 0.5431, + "step": 27638 + }, + { + "epoch": 0.8, + "grad_norm": 1.3717031510278292, + "learning_rate": 9.96615774199522e-07, + "loss": 0.265, + "step": 27639 + }, + { + "epoch": 0.8, + "grad_norm": 1.223624819634439, + "learning_rate": 9.963343916655576e-07, + "loss": 0.2667, + "step": 27640 + }, + { + "epoch": 0.8, + "grad_norm": 1.3518120874112731, + "learning_rate": 9.96053044463987e-07, + "loss": 0.2839, + "step": 27641 + }, + { + "epoch": 0.8, + "grad_norm": 1.409103932370236, + "learning_rate": 9.957717325972927e-07, + "loss": 0.2745, + "step": 27642 + }, + { + "epoch": 0.8, + "grad_norm": 1.8121998414241427, + "learning_rate": 9.954904560679574e-07, + "loss": 0.2728, + "step": 27643 + }, + { + "epoch": 0.8, + "grad_norm": 2.033267069306656, + "learning_rate": 9.952092148784631e-07, + "loss": 0.2931, + "step": 27644 + }, + { + "epoch": 0.8, + "grad_norm": 1.3271641346066063, + "learning_rate": 9.949280090312918e-07, + "loss": 0.3376, + "step": 27645 + }, + { + "epoch": 0.8, + "grad_norm": 1.647517275485927, + "learning_rate": 9.946468385289253e-07, + "loss": 0.2639, + "step": 27646 + }, + { + "epoch": 0.8, + "grad_norm": 1.1883892644176308, + "learning_rate": 9.943657033738451e-07, + "loss": 0.2543, + "step": 27647 + }, + { + "epoch": 0.8, + "grad_norm": 1.6490836540861251, + "learning_rate": 9.940846035685337e-07, + "loss": 0.2647, + "step": 27648 + }, + { + "epoch": 0.8, + "grad_norm": 1.749280083358577, + "learning_rate": 9.938035391154683e-07, + "loss": 0.2994, + "step": 27649 + }, + { + "epoch": 0.8, + "grad_norm": 1.2743042721755906, + "learning_rate": 9.935225100171308e-07, + "loss": 0.2663, + "step": 27650 + }, + { + "epoch": 0.8, + "grad_norm": 1.3615843759979545, + "learning_rate": 9.932415162760018e-07, + "loss": 0.2699, + "step": 27651 + }, + { + "epoch": 0.8, + "grad_norm": 1.4054228883740842, + "learning_rate": 9.92960557894561e-07, + "loss": 0.2782, + "step": 27652 + }, + { + "epoch": 0.8, + "grad_norm": 1.2264129525725989, + "learning_rate": 9.926796348752877e-07, + "loss": 0.2507, + "step": 27653 + }, + { + "epoch": 0.8, + "grad_norm": 1.45572752880519, + "learning_rate": 9.923987472206608e-07, + "loss": 0.2923, + "step": 27654 + }, + { + "epoch": 0.8, + "grad_norm": 1.30755071542964, + "learning_rate": 9.9211789493316e-07, + "loss": 0.2727, + "step": 27655 + }, + { + "epoch": 0.8, + "grad_norm": 1.4452079718078756, + "learning_rate": 9.918370780152642e-07, + "loss": 0.2594, + "step": 27656 + }, + { + "epoch": 0.8, + "grad_norm": 3.592410425985476, + "learning_rate": 9.915562964694492e-07, + "loss": 0.2984, + "step": 27657 + }, + { + "epoch": 0.8, + "grad_norm": 1.2987167141490226, + "learning_rate": 9.912755502981947e-07, + "loss": 0.2734, + "step": 27658 + }, + { + "epoch": 0.8, + "grad_norm": 1.8616051516631344, + "learning_rate": 9.909948395039775e-07, + "loss": 0.2613, + "step": 27659 + }, + { + "epoch": 0.8, + "grad_norm": 1.3648028122624998, + "learning_rate": 9.907141640892758e-07, + "loss": 0.2746, + "step": 27660 + }, + { + "epoch": 0.8, + "grad_norm": 2.0738398790995274, + "learning_rate": 9.90433524056566e-07, + "loss": 0.2852, + "step": 27661 + }, + { + "epoch": 0.8, + "grad_norm": 1.4003625811331237, + "learning_rate": 9.901529194083248e-07, + "loss": 0.2901, + "step": 27662 + }, + { + "epoch": 0.8, + "grad_norm": 1.5294321433177964, + "learning_rate": 9.898723501470298e-07, + "loss": 0.3008, + "step": 27663 + }, + { + "epoch": 0.8, + "grad_norm": 1.421510862632592, + "learning_rate": 9.895918162751544e-07, + "loss": 0.2696, + "step": 27664 + }, + { + "epoch": 0.8, + "grad_norm": 1.794122953433805, + "learning_rate": 9.893113177951758e-07, + "loss": 0.2723, + "step": 27665 + }, + { + "epoch": 0.8, + "grad_norm": 1.6725892059261698, + "learning_rate": 9.890308547095706e-07, + "loss": 0.2747, + "step": 27666 + }, + { + "epoch": 0.8, + "grad_norm": 1.4602325927990882, + "learning_rate": 9.88750427020811e-07, + "loss": 0.285, + "step": 27667 + }, + { + "epoch": 0.8, + "grad_norm": 1.5063069834570124, + "learning_rate": 9.884700347313736e-07, + "loss": 0.2596, + "step": 27668 + }, + { + "epoch": 0.8, + "grad_norm": 1.2705145017833317, + "learning_rate": 9.881896778437328e-07, + "loss": 0.291, + "step": 27669 + }, + { + "epoch": 0.8, + "grad_norm": 1.7063202330520766, + "learning_rate": 9.879093563603625e-07, + "loss": 0.2881, + "step": 27670 + }, + { + "epoch": 0.8, + "grad_norm": 1.3371128257873899, + "learning_rate": 9.876290702837366e-07, + "loss": 0.2735, + "step": 27671 + }, + { + "epoch": 0.8, + "grad_norm": 1.3766713446805463, + "learning_rate": 9.87348819616329e-07, + "loss": 0.3113, + "step": 27672 + }, + { + "epoch": 0.8, + "grad_norm": 3.9712293054854353, + "learning_rate": 9.87068604360612e-07, + "loss": 0.2505, + "step": 27673 + }, + { + "epoch": 0.8, + "grad_norm": 0.9773076444569972, + "learning_rate": 9.867884245190596e-07, + "loss": 0.5525, + "step": 27674 + }, + { + "epoch": 0.8, + "grad_norm": 1.273339242515668, + "learning_rate": 9.865082800941445e-07, + "loss": 0.2693, + "step": 27675 + }, + { + "epoch": 0.8, + "grad_norm": 1.7011725831587678, + "learning_rate": 9.862281710883375e-07, + "loss": 0.2975, + "step": 27676 + }, + { + "epoch": 0.8, + "grad_norm": 1.3879061156872436, + "learning_rate": 9.859480975041113e-07, + "loss": 0.2928, + "step": 27677 + }, + { + "epoch": 0.8, + "grad_norm": 1.3957008989182658, + "learning_rate": 9.856680593439384e-07, + "loss": 0.2613, + "step": 27678 + }, + { + "epoch": 0.8, + "grad_norm": 1.5255867834975563, + "learning_rate": 9.853880566102885e-07, + "loss": 0.2669, + "step": 27679 + }, + { + "epoch": 0.8, + "grad_norm": 1.3715600699907347, + "learning_rate": 9.851080893056335e-07, + "loss": 0.2944, + "step": 27680 + }, + { + "epoch": 0.8, + "grad_norm": 1.3325608616078073, + "learning_rate": 9.848281574324447e-07, + "loss": 0.2947, + "step": 27681 + }, + { + "epoch": 0.8, + "grad_norm": 1.350688095702406, + "learning_rate": 9.845482609931917e-07, + "loss": 0.2576, + "step": 27682 + }, + { + "epoch": 0.8, + "grad_norm": 1.9648098780683436, + "learning_rate": 9.842683999903453e-07, + "loss": 0.2796, + "step": 27683 + }, + { + "epoch": 0.8, + "grad_norm": 1.3990150907584102, + "learning_rate": 9.839885744263755e-07, + "loss": 0.2615, + "step": 27684 + }, + { + "epoch": 0.8, + "grad_norm": 1.6421471377422905, + "learning_rate": 9.837087843037495e-07, + "loss": 0.2813, + "step": 27685 + }, + { + "epoch": 0.8, + "grad_norm": 1.2870272188329752, + "learning_rate": 9.834290296249387e-07, + "loss": 0.2859, + "step": 27686 + }, + { + "epoch": 0.8, + "grad_norm": 1.2617112387406788, + "learning_rate": 9.831493103924112e-07, + "loss": 0.2611, + "step": 27687 + }, + { + "epoch": 0.8, + "grad_norm": 1.3475829599671194, + "learning_rate": 9.828696266086358e-07, + "loss": 0.2641, + "step": 27688 + }, + { + "epoch": 0.8, + "grad_norm": 1.2770276706874208, + "learning_rate": 9.8258997827608e-07, + "loss": 0.2694, + "step": 27689 + }, + { + "epoch": 0.8, + "grad_norm": 1.6730808532550363, + "learning_rate": 9.823103653972122e-07, + "loss": 0.28, + "step": 27690 + }, + { + "epoch": 0.8, + "grad_norm": 1.354478899410226, + "learning_rate": 9.820307879745017e-07, + "loss": 0.2545, + "step": 27691 + }, + { + "epoch": 0.8, + "grad_norm": 1.530815227092922, + "learning_rate": 9.817512460104122e-07, + "loss": 0.3012, + "step": 27692 + }, + { + "epoch": 0.8, + "grad_norm": 1.4536607009885782, + "learning_rate": 9.81471739507413e-07, + "loss": 0.2852, + "step": 27693 + }, + { + "epoch": 0.8, + "grad_norm": 1.3074181281547095, + "learning_rate": 9.811922684679714e-07, + "loss": 0.2853, + "step": 27694 + }, + { + "epoch": 0.8, + "grad_norm": 1.4282783170472164, + "learning_rate": 9.809128328945517e-07, + "loss": 0.2576, + "step": 27695 + }, + { + "epoch": 0.8, + "grad_norm": 1.4209599890107592, + "learning_rate": 9.806334327896205e-07, + "loss": 0.255, + "step": 27696 + }, + { + "epoch": 0.8, + "grad_norm": 1.263584552772495, + "learning_rate": 9.80354068155644e-07, + "loss": 0.26, + "step": 27697 + }, + { + "epoch": 0.8, + "grad_norm": 1.4644762557969009, + "learning_rate": 9.80074738995087e-07, + "loss": 0.3027, + "step": 27698 + }, + { + "epoch": 0.8, + "grad_norm": 3.5955338982722855, + "learning_rate": 9.797954453104154e-07, + "loss": 0.2718, + "step": 27699 + }, + { + "epoch": 0.8, + "grad_norm": 2.4959692106019817, + "learning_rate": 9.795161871040932e-07, + "loss": 0.2847, + "step": 27700 + }, + { + "epoch": 0.8, + "grad_norm": 1.3999046724166382, + "learning_rate": 9.79236964378586e-07, + "loss": 0.2773, + "step": 27701 + }, + { + "epoch": 0.8, + "grad_norm": 1.6218670732881868, + "learning_rate": 9.789577771363578e-07, + "loss": 0.2681, + "step": 27702 + }, + { + "epoch": 0.8, + "grad_norm": 1.3633532169671858, + "learning_rate": 9.786786253798709e-07, + "loss": 0.2582, + "step": 27703 + }, + { + "epoch": 0.8, + "grad_norm": 1.4601708645531786, + "learning_rate": 9.783995091115894e-07, + "loss": 0.2599, + "step": 27704 + }, + { + "epoch": 0.8, + "grad_norm": 1.442294885847211, + "learning_rate": 9.781204283339775e-07, + "loss": 0.2779, + "step": 27705 + }, + { + "epoch": 0.8, + "grad_norm": 1.255030939498362, + "learning_rate": 9.778413830494964e-07, + "loss": 0.2758, + "step": 27706 + }, + { + "epoch": 0.8, + "grad_norm": 1.93333279422506, + "learning_rate": 9.775623732606104e-07, + "loss": 0.3006, + "step": 27707 + }, + { + "epoch": 0.8, + "grad_norm": 1.3768776216010774, + "learning_rate": 9.77283398969781e-07, + "loss": 0.2535, + "step": 27708 + }, + { + "epoch": 0.8, + "grad_norm": 0.928019951094098, + "learning_rate": 9.770044601794704e-07, + "loss": 0.542, + "step": 27709 + }, + { + "epoch": 0.8, + "grad_norm": 1.4242038910589339, + "learning_rate": 9.767255568921396e-07, + "loss": 0.2738, + "step": 27710 + }, + { + "epoch": 0.8, + "grad_norm": 1.4026733297375784, + "learning_rate": 9.764466891102525e-07, + "loss": 0.2821, + "step": 27711 + }, + { + "epoch": 0.8, + "grad_norm": 1.3089860627775722, + "learning_rate": 9.761678568362658e-07, + "loss": 0.274, + "step": 27712 + }, + { + "epoch": 0.8, + "grad_norm": 1.7038571614174298, + "learning_rate": 9.758890600726428e-07, + "loss": 0.2701, + "step": 27713 + }, + { + "epoch": 0.8, + "grad_norm": 1.2605137769498587, + "learning_rate": 9.756102988218436e-07, + "loss": 0.2674, + "step": 27714 + }, + { + "epoch": 0.8, + "grad_norm": 1.4456796009577768, + "learning_rate": 9.753315730863284e-07, + "loss": 0.2669, + "step": 27715 + }, + { + "epoch": 0.8, + "grad_norm": 3.8459863677631523, + "learning_rate": 9.750528828685562e-07, + "loss": 0.2704, + "step": 27716 + }, + { + "epoch": 0.8, + "grad_norm": 1.4004870143293897, + "learning_rate": 9.74774228170987e-07, + "loss": 0.2652, + "step": 27717 + }, + { + "epoch": 0.8, + "grad_norm": 1.3924737202244235, + "learning_rate": 9.7449560899608e-07, + "loss": 0.2883, + "step": 27718 + }, + { + "epoch": 0.8, + "grad_norm": 1.3373163009044662, + "learning_rate": 9.742170253462952e-07, + "loss": 0.2996, + "step": 27719 + }, + { + "epoch": 0.8, + "grad_norm": 1.3016612541824983, + "learning_rate": 9.739384772240883e-07, + "loss": 0.2559, + "step": 27720 + }, + { + "epoch": 0.8, + "grad_norm": 1.2613591774484463, + "learning_rate": 9.736599646319206e-07, + "loss": 0.2794, + "step": 27721 + }, + { + "epoch": 0.8, + "grad_norm": 1.4098941004795087, + "learning_rate": 9.73381487572247e-07, + "loss": 0.2626, + "step": 27722 + }, + { + "epoch": 0.8, + "grad_norm": 1.276913146624905, + "learning_rate": 9.731030460475267e-07, + "loss": 0.2596, + "step": 27723 + }, + { + "epoch": 0.8, + "grad_norm": 1.361777510494186, + "learning_rate": 9.728246400602158e-07, + "loss": 0.2512, + "step": 27724 + }, + { + "epoch": 0.8, + "grad_norm": 1.4886704539626898, + "learning_rate": 9.725462696127729e-07, + "loss": 0.2742, + "step": 27725 + }, + { + "epoch": 0.8, + "grad_norm": 1.2342937418925823, + "learning_rate": 9.72267934707654e-07, + "loss": 0.2611, + "step": 27726 + }, + { + "epoch": 0.8, + "grad_norm": 1.4652150253472578, + "learning_rate": 9.719896353473152e-07, + "loss": 0.2615, + "step": 27727 + }, + { + "epoch": 0.8, + "grad_norm": 1.2830052404051555, + "learning_rate": 9.717113715342118e-07, + "loss": 0.2788, + "step": 27728 + }, + { + "epoch": 0.8, + "grad_norm": 2.0712688305731812, + "learning_rate": 9.714331432708006e-07, + "loss": 0.2829, + "step": 27729 + }, + { + "epoch": 0.8, + "grad_norm": 1.590064858124233, + "learning_rate": 9.711549505595385e-07, + "loss": 0.2752, + "step": 27730 + }, + { + "epoch": 0.8, + "grad_norm": 1.4123880407755547, + "learning_rate": 9.708767934028767e-07, + "loss": 0.2707, + "step": 27731 + }, + { + "epoch": 0.8, + "grad_norm": 1.2905237110579284, + "learning_rate": 9.705986718032723e-07, + "loss": 0.2888, + "step": 27732 + }, + { + "epoch": 0.8, + "grad_norm": 1.2638537467705435, + "learning_rate": 9.703205857631793e-07, + "loss": 0.2954, + "step": 27733 + }, + { + "epoch": 0.8, + "grad_norm": 2.3663088458538875, + "learning_rate": 9.700425352850512e-07, + "loss": 0.271, + "step": 27734 + }, + { + "epoch": 0.8, + "grad_norm": 1.5342342100796895, + "learning_rate": 9.69764520371343e-07, + "loss": 0.2756, + "step": 27735 + }, + { + "epoch": 0.8, + "grad_norm": 1.2677845667395578, + "learning_rate": 9.694865410245074e-07, + "loss": 0.2705, + "step": 27736 + }, + { + "epoch": 0.8, + "grad_norm": 1.912460817506249, + "learning_rate": 9.692085972469977e-07, + "loss": 0.2874, + "step": 27737 + }, + { + "epoch": 0.8, + "grad_norm": 1.4763973298166884, + "learning_rate": 9.689306890412687e-07, + "loss": 0.2642, + "step": 27738 + }, + { + "epoch": 0.8, + "grad_norm": 1.287374910888692, + "learning_rate": 9.686528164097692e-07, + "loss": 0.2743, + "step": 27739 + }, + { + "epoch": 0.8, + "grad_norm": 1.3344642303914236, + "learning_rate": 9.683749793549535e-07, + "loss": 0.2513, + "step": 27740 + }, + { + "epoch": 0.8, + "grad_norm": 2.0970408112055967, + "learning_rate": 9.680971778792737e-07, + "loss": 0.2694, + "step": 27741 + }, + { + "epoch": 0.8, + "grad_norm": 1.3020242653427225, + "learning_rate": 9.678194119851807e-07, + "loss": 0.2733, + "step": 27742 + }, + { + "epoch": 0.8, + "grad_norm": 1.4657164448035087, + "learning_rate": 9.675416816751259e-07, + "loss": 0.2771, + "step": 27743 + }, + { + "epoch": 0.8, + "grad_norm": 1.2475847162607712, + "learning_rate": 9.672639869515609e-07, + "loss": 0.2683, + "step": 27744 + }, + { + "epoch": 0.8, + "grad_norm": 0.9793296561212669, + "learning_rate": 9.669863278169356e-07, + "loss": 0.5793, + "step": 27745 + }, + { + "epoch": 0.8, + "grad_norm": 1.3242459359814334, + "learning_rate": 9.667087042737011e-07, + "loss": 0.2744, + "step": 27746 + }, + { + "epoch": 0.8, + "grad_norm": 1.264983558046257, + "learning_rate": 9.66431116324308e-07, + "loss": 0.2957, + "step": 27747 + }, + { + "epoch": 0.8, + "grad_norm": 1.8672556214896738, + "learning_rate": 9.66153563971204e-07, + "loss": 0.2769, + "step": 27748 + }, + { + "epoch": 0.8, + "grad_norm": 1.5936954719947742, + "learning_rate": 9.658760472168404e-07, + "loss": 0.2912, + "step": 27749 + }, + { + "epoch": 0.8, + "grad_norm": 1.22101759095651, + "learning_rate": 9.655985660636647e-07, + "loss": 0.2653, + "step": 27750 + }, + { + "epoch": 0.8, + "grad_norm": 1.2972597762218252, + "learning_rate": 9.65321120514126e-07, + "loss": 0.2805, + "step": 27751 + }, + { + "epoch": 0.8, + "grad_norm": 1.4667437580887939, + "learning_rate": 9.650437105706734e-07, + "loss": 0.277, + "step": 27752 + }, + { + "epoch": 0.8, + "grad_norm": 1.376419330897275, + "learning_rate": 9.647663362357546e-07, + "loss": 0.2849, + "step": 27753 + }, + { + "epoch": 0.81, + "grad_norm": 1.358134092749546, + "learning_rate": 9.644889975118176e-07, + "loss": 0.2764, + "step": 27754 + }, + { + "epoch": 0.81, + "grad_norm": 1.5783467757439291, + "learning_rate": 9.642116944013103e-07, + "loss": 0.2703, + "step": 27755 + }, + { + "epoch": 0.81, + "grad_norm": 1.2582495463268666, + "learning_rate": 9.639344269066793e-07, + "loss": 0.2554, + "step": 27756 + }, + { + "epoch": 0.81, + "grad_norm": 1.283869177588437, + "learning_rate": 9.63657195030373e-07, + "loss": 0.2733, + "step": 27757 + }, + { + "epoch": 0.81, + "grad_norm": 1.3655689022367716, + "learning_rate": 9.633799987748355e-07, + "loss": 0.2697, + "step": 27758 + }, + { + "epoch": 0.81, + "grad_norm": 1.4476854913439308, + "learning_rate": 9.631028381425144e-07, + "loss": 0.2706, + "step": 27759 + }, + { + "epoch": 0.81, + "grad_norm": 1.3113705283940031, + "learning_rate": 9.62825713135856e-07, + "loss": 0.2764, + "step": 27760 + }, + { + "epoch": 0.81, + "grad_norm": 1.514747658957813, + "learning_rate": 9.625486237573046e-07, + "loss": 0.2651, + "step": 27761 + }, + { + "epoch": 0.81, + "grad_norm": 1.4065406959403153, + "learning_rate": 9.622715700093066e-07, + "loss": 0.2883, + "step": 27762 + }, + { + "epoch": 0.81, + "grad_norm": 0.9935159298502003, + "learning_rate": 9.619945518943068e-07, + "loss": 0.5447, + "step": 27763 + }, + { + "epoch": 0.81, + "grad_norm": 1.302384729808717, + "learning_rate": 9.617175694147502e-07, + "loss": 0.2687, + "step": 27764 + }, + { + "epoch": 0.81, + "grad_norm": 1.266252975883316, + "learning_rate": 9.61440622573081e-07, + "loss": 0.2747, + "step": 27765 + }, + { + "epoch": 0.81, + "grad_norm": 1.3351898602019556, + "learning_rate": 9.611637113717437e-07, + "loss": 0.2785, + "step": 27766 + }, + { + "epoch": 0.81, + "grad_norm": 1.4206017682583256, + "learning_rate": 9.608868358131806e-07, + "loss": 0.2628, + "step": 27767 + }, + { + "epoch": 0.81, + "grad_norm": 1.3429632845861643, + "learning_rate": 9.60609995899836e-07, + "loss": 0.2653, + "step": 27768 + }, + { + "epoch": 0.81, + "grad_norm": 1.733179002004067, + "learning_rate": 9.60333191634153e-07, + "loss": 0.2836, + "step": 27769 + }, + { + "epoch": 0.81, + "grad_norm": 1.2820211581728154, + "learning_rate": 9.600564230185743e-07, + "loss": 0.2826, + "step": 27770 + }, + { + "epoch": 0.81, + "grad_norm": 1.2572765983816723, + "learning_rate": 9.597796900555422e-07, + "loss": 0.2635, + "step": 27771 + }, + { + "epoch": 0.81, + "grad_norm": 2.0356606801036468, + "learning_rate": 9.595029927474997e-07, + "loss": 0.2655, + "step": 27772 + }, + { + "epoch": 0.81, + "grad_norm": 3.6749750970523243, + "learning_rate": 9.59226331096888e-07, + "loss": 0.2625, + "step": 27773 + }, + { + "epoch": 0.81, + "grad_norm": 1.5278458330979614, + "learning_rate": 9.589497051061498e-07, + "loss": 0.269, + "step": 27774 + }, + { + "epoch": 0.81, + "grad_norm": 1.37421739013687, + "learning_rate": 9.58673114777724e-07, + "loss": 0.2774, + "step": 27775 + }, + { + "epoch": 0.81, + "grad_norm": 1.2858698934787705, + "learning_rate": 9.583965601140543e-07, + "loss": 0.2691, + "step": 27776 + }, + { + "epoch": 0.81, + "grad_norm": 1.2106764189468462, + "learning_rate": 9.581200411175785e-07, + "loss": 0.2703, + "step": 27777 + }, + { + "epoch": 0.81, + "grad_norm": 1.418694711118902, + "learning_rate": 9.57843557790738e-07, + "loss": 0.2801, + "step": 27778 + }, + { + "epoch": 0.81, + "grad_norm": 1.4212317794725542, + "learning_rate": 9.57567110135973e-07, + "loss": 0.2857, + "step": 27779 + }, + { + "epoch": 0.81, + "grad_norm": 1.2648413157471188, + "learning_rate": 9.572906981557228e-07, + "loss": 0.2617, + "step": 27780 + }, + { + "epoch": 0.81, + "grad_norm": 1.394241038339526, + "learning_rate": 9.570143218524276e-07, + "loss": 0.2783, + "step": 27781 + }, + { + "epoch": 0.81, + "grad_norm": 1.2991539091780873, + "learning_rate": 9.567379812285254e-07, + "loss": 0.2623, + "step": 27782 + }, + { + "epoch": 0.81, + "grad_norm": 1.2655469592124762, + "learning_rate": 9.56461676286456e-07, + "loss": 0.2721, + "step": 27783 + }, + { + "epoch": 0.81, + "grad_norm": 1.318229517967411, + "learning_rate": 9.561854070286563e-07, + "loss": 0.2599, + "step": 27784 + }, + { + "epoch": 0.81, + "grad_norm": 1.4674172665605592, + "learning_rate": 9.559091734575672e-07, + "loss": 0.2841, + "step": 27785 + }, + { + "epoch": 0.81, + "grad_norm": 0.9305484134314298, + "learning_rate": 9.556329755756227e-07, + "loss": 0.5423, + "step": 27786 + }, + { + "epoch": 0.81, + "grad_norm": 0.9469003137258448, + "learning_rate": 9.553568133852625e-07, + "loss": 0.5837, + "step": 27787 + }, + { + "epoch": 0.81, + "grad_norm": 1.3693313800485967, + "learning_rate": 9.55080686888923e-07, + "loss": 0.2967, + "step": 27788 + }, + { + "epoch": 0.81, + "grad_norm": 1.3447236843026509, + "learning_rate": 9.548045960890417e-07, + "loss": 0.2875, + "step": 27789 + }, + { + "epoch": 0.81, + "grad_norm": 1.3003748987808585, + "learning_rate": 9.545285409880545e-07, + "loss": 0.2808, + "step": 27790 + }, + { + "epoch": 0.81, + "grad_norm": 1.4065949765963923, + "learning_rate": 9.542525215883975e-07, + "loss": 0.2646, + "step": 27791 + }, + { + "epoch": 0.81, + "grad_norm": 1.2419170797972625, + "learning_rate": 9.53976537892507e-07, + "loss": 0.2674, + "step": 27792 + }, + { + "epoch": 0.81, + "grad_norm": 1.5442292621860267, + "learning_rate": 9.5370058990282e-07, + "loss": 0.2814, + "step": 27793 + }, + { + "epoch": 0.81, + "grad_norm": 1.4001467809276156, + "learning_rate": 9.534246776217688e-07, + "loss": 0.2899, + "step": 27794 + }, + { + "epoch": 0.81, + "grad_norm": 1.2883656043772187, + "learning_rate": 9.531488010517897e-07, + "loss": 0.2741, + "step": 27795 + }, + { + "epoch": 0.81, + "grad_norm": 1.255428831540275, + "learning_rate": 9.528729601953174e-07, + "loss": 0.2576, + "step": 27796 + }, + { + "epoch": 0.81, + "grad_norm": 1.2789156516197242, + "learning_rate": 9.525971550547858e-07, + "loss": 0.2495, + "step": 27797 + }, + { + "epoch": 0.81, + "grad_norm": 1.341920789167829, + "learning_rate": 9.523213856326297e-07, + "loss": 0.2934, + "step": 27798 + }, + { + "epoch": 0.81, + "grad_norm": 1.5079390631709662, + "learning_rate": 9.520456519312815e-07, + "loss": 0.2885, + "step": 27799 + }, + { + "epoch": 0.81, + "grad_norm": 1.460890946515153, + "learning_rate": 9.51769953953176e-07, + "loss": 0.2994, + "step": 27800 + }, + { + "epoch": 0.81, + "grad_norm": 1.437960125385938, + "learning_rate": 9.514942917007452e-07, + "loss": 0.2838, + "step": 27801 + }, + { + "epoch": 0.81, + "grad_norm": 1.3530627683520935, + "learning_rate": 9.512186651764238e-07, + "loss": 0.2796, + "step": 27802 + }, + { + "epoch": 0.81, + "grad_norm": 1.5004253253470965, + "learning_rate": 9.509430743826415e-07, + "loss": 0.2536, + "step": 27803 + }, + { + "epoch": 0.81, + "grad_norm": 1.2238802422010207, + "learning_rate": 9.506675193218324e-07, + "loss": 0.251, + "step": 27804 + }, + { + "epoch": 0.81, + "grad_norm": 1.2193201404994771, + "learning_rate": 9.503919999964257e-07, + "loss": 0.2658, + "step": 27805 + }, + { + "epoch": 0.81, + "grad_norm": 1.4372174214974838, + "learning_rate": 9.50116516408855e-07, + "loss": 0.272, + "step": 27806 + }, + { + "epoch": 0.81, + "grad_norm": 1.2954590231993657, + "learning_rate": 9.498410685615511e-07, + "loss": 0.2645, + "step": 27807 + }, + { + "epoch": 0.81, + "grad_norm": 1.3589308059739267, + "learning_rate": 9.495656564569444e-07, + "loss": 0.2749, + "step": 27808 + }, + { + "epoch": 0.81, + "grad_norm": 1.2440407425237896, + "learning_rate": 9.492902800974657e-07, + "loss": 0.2876, + "step": 27809 + }, + { + "epoch": 0.81, + "grad_norm": 1.4482999419603815, + "learning_rate": 9.490149394855452e-07, + "loss": 0.2764, + "step": 27810 + }, + { + "epoch": 0.81, + "grad_norm": 1.5852812262235239, + "learning_rate": 9.487396346236133e-07, + "loss": 0.2808, + "step": 27811 + }, + { + "epoch": 0.81, + "grad_norm": 1.2603532781219375, + "learning_rate": 9.484643655140996e-07, + "loss": 0.2585, + "step": 27812 + }, + { + "epoch": 0.81, + "grad_norm": 1.6298415974184572, + "learning_rate": 9.481891321594322e-07, + "loss": 0.2794, + "step": 27813 + }, + { + "epoch": 0.81, + "grad_norm": 1.7613625620130844, + "learning_rate": 9.479139345620402e-07, + "loss": 0.2663, + "step": 27814 + }, + { + "epoch": 0.81, + "grad_norm": 1.6039562661377986, + "learning_rate": 9.47638772724353e-07, + "loss": 0.262, + "step": 27815 + }, + { + "epoch": 0.81, + "grad_norm": 1.2597319455411586, + "learning_rate": 9.473636466487984e-07, + "loss": 0.2636, + "step": 27816 + }, + { + "epoch": 0.81, + "grad_norm": 1.3531035851838455, + "learning_rate": 9.470885563378046e-07, + "loss": 0.2867, + "step": 27817 + }, + { + "epoch": 0.81, + "grad_norm": 1.261346673520718, + "learning_rate": 9.468135017937991e-07, + "loss": 0.2949, + "step": 27818 + }, + { + "epoch": 0.81, + "grad_norm": 1.3283031042058575, + "learning_rate": 9.465384830192098e-07, + "loss": 0.3296, + "step": 27819 + }, + { + "epoch": 0.81, + "grad_norm": 1.3084152597373073, + "learning_rate": 9.462635000164632e-07, + "loss": 0.2941, + "step": 27820 + }, + { + "epoch": 0.81, + "grad_norm": 1.4440466073753426, + "learning_rate": 9.459885527879876e-07, + "loss": 0.3352, + "step": 27821 + }, + { + "epoch": 0.81, + "grad_norm": 1.3101499506139687, + "learning_rate": 9.457136413362062e-07, + "loss": 0.254, + "step": 27822 + }, + { + "epoch": 0.81, + "grad_norm": 3.2169671249307985, + "learning_rate": 9.454387656635472e-07, + "loss": 0.2838, + "step": 27823 + }, + { + "epoch": 0.81, + "grad_norm": 1.5149542990095135, + "learning_rate": 9.45163925772436e-07, + "loss": 0.281, + "step": 27824 + }, + { + "epoch": 0.81, + "grad_norm": 1.3107654256249246, + "learning_rate": 9.448891216652983e-07, + "loss": 0.277, + "step": 27825 + }, + { + "epoch": 0.81, + "grad_norm": 1.2451182773022076, + "learning_rate": 9.44614353344559e-07, + "loss": 0.271, + "step": 27826 + }, + { + "epoch": 0.81, + "grad_norm": 2.1557053313001426, + "learning_rate": 9.443396208126431e-07, + "loss": 0.2979, + "step": 27827 + }, + { + "epoch": 0.81, + "grad_norm": 1.3723538665413453, + "learning_rate": 9.440649240719752e-07, + "loss": 0.2743, + "step": 27828 + }, + { + "epoch": 0.81, + "grad_norm": 1.3380539851337474, + "learning_rate": 9.437902631249801e-07, + "loss": 0.2931, + "step": 27829 + }, + { + "epoch": 0.81, + "grad_norm": 1.3074600554214129, + "learning_rate": 9.435156379740801e-07, + "loss": 0.2697, + "step": 27830 + }, + { + "epoch": 0.81, + "grad_norm": 1.7247578227395781, + "learning_rate": 9.432410486217003e-07, + "loss": 0.2765, + "step": 27831 + }, + { + "epoch": 0.81, + "grad_norm": 1.3576427408507863, + "learning_rate": 9.429664950702627e-07, + "loss": 0.3046, + "step": 27832 + }, + { + "epoch": 0.81, + "grad_norm": 1.4022748397984723, + "learning_rate": 9.426919773221904e-07, + "loss": 0.2999, + "step": 27833 + }, + { + "epoch": 0.81, + "grad_norm": 1.2954980212660703, + "learning_rate": 9.424174953799065e-07, + "loss": 0.2934, + "step": 27834 + }, + { + "epoch": 0.81, + "grad_norm": 1.2423052685719136, + "learning_rate": 9.421430492458328e-07, + "loss": 0.2685, + "step": 27835 + }, + { + "epoch": 0.81, + "grad_norm": 1.4440257349731145, + "learning_rate": 9.418686389223924e-07, + "loss": 0.2856, + "step": 27836 + }, + { + "epoch": 0.81, + "grad_norm": 1.2836148287650513, + "learning_rate": 9.415942644120058e-07, + "loss": 0.2885, + "step": 27837 + }, + { + "epoch": 0.81, + "grad_norm": 1.2388840010000357, + "learning_rate": 9.413199257170952e-07, + "loss": 0.2457, + "step": 27838 + }, + { + "epoch": 0.81, + "grad_norm": 1.4894143036677474, + "learning_rate": 9.41045622840081e-07, + "loss": 0.3145, + "step": 27839 + }, + { + "epoch": 0.81, + "grad_norm": 1.25978212759669, + "learning_rate": 9.407713557833853e-07, + "loss": 0.2658, + "step": 27840 + }, + { + "epoch": 0.81, + "grad_norm": 1.472814670394299, + "learning_rate": 9.404971245494265e-07, + "loss": 0.2717, + "step": 27841 + }, + { + "epoch": 0.81, + "grad_norm": 1.6065975634086487, + "learning_rate": 9.402229291406256e-07, + "loss": 0.2675, + "step": 27842 + }, + { + "epoch": 0.81, + "grad_norm": 1.2812727923646754, + "learning_rate": 9.399487695594022e-07, + "loss": 0.2515, + "step": 27843 + }, + { + "epoch": 0.81, + "grad_norm": 1.5550513747928385, + "learning_rate": 9.396746458081763e-07, + "loss": 0.2714, + "step": 27844 + }, + { + "epoch": 0.81, + "grad_norm": 1.2646497630410496, + "learning_rate": 9.394005578893661e-07, + "loss": 0.2585, + "step": 27845 + }, + { + "epoch": 0.81, + "grad_norm": 1.6331124243269919, + "learning_rate": 9.391265058053917e-07, + "loss": 0.2792, + "step": 27846 + }, + { + "epoch": 0.81, + "grad_norm": 1.3238716366260768, + "learning_rate": 9.388524895586704e-07, + "loss": 0.2716, + "step": 27847 + }, + { + "epoch": 0.81, + "grad_norm": 1.4986812922927406, + "learning_rate": 9.385785091516225e-07, + "loss": 0.2402, + "step": 27848 + }, + { + "epoch": 0.81, + "grad_norm": 1.3281817070899478, + "learning_rate": 9.383045645866629e-07, + "loss": 0.2836, + "step": 27849 + }, + { + "epoch": 0.81, + "grad_norm": 1.5405454244457124, + "learning_rate": 9.380306558662106e-07, + "loss": 0.3012, + "step": 27850 + }, + { + "epoch": 0.81, + "grad_norm": 1.5089048414358375, + "learning_rate": 9.377567829926831e-07, + "loss": 0.2907, + "step": 27851 + }, + { + "epoch": 0.81, + "grad_norm": 1.3605404341788534, + "learning_rate": 9.374829459684964e-07, + "loss": 0.274, + "step": 27852 + }, + { + "epoch": 0.81, + "grad_norm": 1.3634729132292516, + "learning_rate": 9.372091447960685e-07, + "loss": 0.3194, + "step": 27853 + }, + { + "epoch": 0.81, + "grad_norm": 1.5232605039905618, + "learning_rate": 9.369353794778147e-07, + "loss": 0.2695, + "step": 27854 + }, + { + "epoch": 0.81, + "grad_norm": 1.5137641678085265, + "learning_rate": 9.366616500161508e-07, + "loss": 0.2997, + "step": 27855 + }, + { + "epoch": 0.81, + "grad_norm": 1.5235169014046404, + "learning_rate": 9.363879564134936e-07, + "loss": 0.2753, + "step": 27856 + }, + { + "epoch": 0.81, + "grad_norm": 1.3564435660919187, + "learning_rate": 9.361142986722588e-07, + "loss": 0.2685, + "step": 27857 + }, + { + "epoch": 0.81, + "grad_norm": 1.2889250481077563, + "learning_rate": 9.358406767948592e-07, + "loss": 0.2844, + "step": 27858 + }, + { + "epoch": 0.81, + "grad_norm": 1.3568429246312745, + "learning_rate": 9.355670907837117e-07, + "loss": 0.2866, + "step": 27859 + }, + { + "epoch": 0.81, + "grad_norm": 1.4547485564207756, + "learning_rate": 9.352935406412283e-07, + "loss": 0.2822, + "step": 27860 + }, + { + "epoch": 0.81, + "grad_norm": 1.5388240148619114, + "learning_rate": 9.35020026369825e-07, + "loss": 0.2706, + "step": 27861 + }, + { + "epoch": 0.81, + "grad_norm": 1.4199352970235068, + "learning_rate": 9.347465479719148e-07, + "loss": 0.2728, + "step": 27862 + }, + { + "epoch": 0.81, + "grad_norm": 1.4915884025689066, + "learning_rate": 9.344731054499113e-07, + "loss": 0.259, + "step": 27863 + }, + { + "epoch": 0.81, + "grad_norm": 1.4726216896227888, + "learning_rate": 9.341996988062279e-07, + "loss": 0.2828, + "step": 27864 + }, + { + "epoch": 0.81, + "grad_norm": 1.8028432280983195, + "learning_rate": 9.339263280432775e-07, + "loss": 0.2771, + "step": 27865 + }, + { + "epoch": 0.81, + "grad_norm": 1.3856849964488116, + "learning_rate": 9.33652993163472e-07, + "loss": 0.2757, + "step": 27866 + }, + { + "epoch": 0.81, + "grad_norm": 1.440103689990092, + "learning_rate": 9.333796941692253e-07, + "loss": 0.2777, + "step": 27867 + }, + { + "epoch": 0.81, + "grad_norm": 0.9416140183808661, + "learning_rate": 9.331064310629468e-07, + "loss": 0.5502, + "step": 27868 + }, + { + "epoch": 0.81, + "grad_norm": 1.3323851234740063, + "learning_rate": 9.328332038470489e-07, + "loss": 0.2599, + "step": 27869 + }, + { + "epoch": 0.81, + "grad_norm": 1.3544794153177846, + "learning_rate": 9.325600125239431e-07, + "loss": 0.279, + "step": 27870 + }, + { + "epoch": 0.81, + "grad_norm": 1.3174907223650725, + "learning_rate": 9.322868570960409e-07, + "loss": 0.2669, + "step": 27871 + }, + { + "epoch": 0.81, + "grad_norm": 1.600167110908949, + "learning_rate": 9.320137375657517e-07, + "loss": 0.2932, + "step": 27872 + }, + { + "epoch": 0.81, + "grad_norm": 1.2447750432403535, + "learning_rate": 9.317406539354862e-07, + "loss": 0.2623, + "step": 27873 + }, + { + "epoch": 0.81, + "grad_norm": 1.3218247364192084, + "learning_rate": 9.31467606207655e-07, + "loss": 0.2738, + "step": 27874 + }, + { + "epoch": 0.81, + "grad_norm": 1.4072546302671136, + "learning_rate": 9.311945943846673e-07, + "loss": 0.2724, + "step": 27875 + }, + { + "epoch": 0.81, + "grad_norm": 1.310081802313382, + "learning_rate": 9.309216184689334e-07, + "loss": 0.2769, + "step": 27876 + }, + { + "epoch": 0.81, + "grad_norm": 1.469546603974731, + "learning_rate": 9.306486784628604e-07, + "loss": 0.2633, + "step": 27877 + }, + { + "epoch": 0.81, + "grad_norm": 1.3452152123987446, + "learning_rate": 9.30375774368858e-07, + "loss": 0.2933, + "step": 27878 + }, + { + "epoch": 0.81, + "grad_norm": 0.9727815135586751, + "learning_rate": 9.301029061893346e-07, + "loss": 0.6189, + "step": 27879 + }, + { + "epoch": 0.81, + "grad_norm": 1.3179772106471261, + "learning_rate": 9.29830073926698e-07, + "loss": 0.2639, + "step": 27880 + }, + { + "epoch": 0.81, + "grad_norm": 1.4071068072738122, + "learning_rate": 9.295572775833561e-07, + "loss": 0.2693, + "step": 27881 + }, + { + "epoch": 0.81, + "grad_norm": 1.3668785868606277, + "learning_rate": 9.292845171617165e-07, + "loss": 0.2619, + "step": 27882 + }, + { + "epoch": 0.81, + "grad_norm": 1.4208554333772496, + "learning_rate": 9.290117926641862e-07, + "loss": 0.2732, + "step": 27883 + }, + { + "epoch": 0.81, + "grad_norm": 1.3522275667528465, + "learning_rate": 9.287391040931732e-07, + "loss": 0.258, + "step": 27884 + }, + { + "epoch": 0.81, + "grad_norm": 1.2807448338995642, + "learning_rate": 9.284664514510811e-07, + "loss": 0.257, + "step": 27885 + }, + { + "epoch": 0.81, + "grad_norm": 1.2622639983947364, + "learning_rate": 9.281938347403186e-07, + "loss": 0.2593, + "step": 27886 + }, + { + "epoch": 0.81, + "grad_norm": 1.2829787706551157, + "learning_rate": 9.279212539632915e-07, + "loss": 0.2872, + "step": 27887 + }, + { + "epoch": 0.81, + "grad_norm": 2.4390077931187744, + "learning_rate": 9.276487091224029e-07, + "loss": 0.2883, + "step": 27888 + }, + { + "epoch": 0.81, + "grad_norm": 1.5775172949297795, + "learning_rate": 9.273762002200598e-07, + "loss": 0.2735, + "step": 27889 + }, + { + "epoch": 0.81, + "grad_norm": 1.645770185508151, + "learning_rate": 9.271037272586675e-07, + "loss": 0.2728, + "step": 27890 + }, + { + "epoch": 0.81, + "grad_norm": 0.9574980749764407, + "learning_rate": 9.26831290240629e-07, + "loss": 0.6242, + "step": 27891 + }, + { + "epoch": 0.81, + "grad_norm": 1.5155139459527998, + "learning_rate": 9.265588891683502e-07, + "loss": 0.2674, + "step": 27892 + }, + { + "epoch": 0.81, + "grad_norm": 1.3483882540865306, + "learning_rate": 9.262865240442342e-07, + "loss": 0.2775, + "step": 27893 + }, + { + "epoch": 0.81, + "grad_norm": 1.3539907588391504, + "learning_rate": 9.260141948706852e-07, + "loss": 0.2798, + "step": 27894 + }, + { + "epoch": 0.81, + "grad_norm": 1.28490520409383, + "learning_rate": 9.257419016501068e-07, + "loss": 0.2556, + "step": 27895 + }, + { + "epoch": 0.81, + "grad_norm": 1.7912675199037058, + "learning_rate": 9.254696443849004e-07, + "loss": 0.2809, + "step": 27896 + }, + { + "epoch": 0.81, + "grad_norm": 1.3910836628929404, + "learning_rate": 9.251974230774697e-07, + "loss": 0.2918, + "step": 27897 + }, + { + "epoch": 0.81, + "grad_norm": 2.6378543682967126, + "learning_rate": 9.24925237730217e-07, + "loss": 0.3055, + "step": 27898 + }, + { + "epoch": 0.81, + "grad_norm": 2.0638681190230006, + "learning_rate": 9.24653088345544e-07, + "loss": 0.2703, + "step": 27899 + }, + { + "epoch": 0.81, + "grad_norm": 1.5300228654709758, + "learning_rate": 9.243809749258525e-07, + "loss": 0.2682, + "step": 27900 + }, + { + "epoch": 0.81, + "grad_norm": 1.268222229146306, + "learning_rate": 9.241088974735446e-07, + "loss": 0.272, + "step": 27901 + }, + { + "epoch": 0.81, + "grad_norm": 1.297511492925267, + "learning_rate": 9.238368559910204e-07, + "loss": 0.2684, + "step": 27902 + }, + { + "epoch": 0.81, + "grad_norm": 1.2200959950684582, + "learning_rate": 9.235648504806826e-07, + "loss": 0.2498, + "step": 27903 + }, + { + "epoch": 0.81, + "grad_norm": 1.5281544147340844, + "learning_rate": 9.232928809449293e-07, + "loss": 0.2667, + "step": 27904 + }, + { + "epoch": 0.81, + "grad_norm": 1.8795714598536803, + "learning_rate": 9.230209473861612e-07, + "loss": 0.2595, + "step": 27905 + }, + { + "epoch": 0.81, + "grad_norm": 1.51450913907012, + "learning_rate": 9.227490498067787e-07, + "loss": 0.2675, + "step": 27906 + }, + { + "epoch": 0.81, + "grad_norm": 1.5414669442378999, + "learning_rate": 9.224771882091809e-07, + "loss": 0.2748, + "step": 27907 + }, + { + "epoch": 0.81, + "grad_norm": 1.3588353943272775, + "learning_rate": 9.222053625957672e-07, + "loss": 0.2709, + "step": 27908 + }, + { + "epoch": 0.81, + "grad_norm": 1.2464984170369375, + "learning_rate": 9.219335729689361e-07, + "loss": 0.2833, + "step": 27909 + }, + { + "epoch": 0.81, + "grad_norm": 1.3803995703027367, + "learning_rate": 9.216618193310866e-07, + "loss": 0.2842, + "step": 27910 + }, + { + "epoch": 0.81, + "grad_norm": 1.2411024181946873, + "learning_rate": 9.213901016846172e-07, + "loss": 0.2933, + "step": 27911 + }, + { + "epoch": 0.81, + "grad_norm": 1.48517714353454, + "learning_rate": 9.211184200319262e-07, + "loss": 0.2581, + "step": 27912 + }, + { + "epoch": 0.81, + "grad_norm": 1.2766008513382623, + "learning_rate": 9.208467743754095e-07, + "loss": 0.2741, + "step": 27913 + }, + { + "epoch": 0.81, + "grad_norm": 2.5924765004254775, + "learning_rate": 9.205751647174654e-07, + "loss": 0.2671, + "step": 27914 + }, + { + "epoch": 0.81, + "grad_norm": 1.4770421992843807, + "learning_rate": 9.203035910604901e-07, + "loss": 0.3115, + "step": 27915 + }, + { + "epoch": 0.81, + "grad_norm": 1.3291754698176845, + "learning_rate": 9.200320534068824e-07, + "loss": 0.2571, + "step": 27916 + }, + { + "epoch": 0.81, + "grad_norm": 1.4464504592376022, + "learning_rate": 9.197605517590358e-07, + "loss": 0.2864, + "step": 27917 + }, + { + "epoch": 0.81, + "grad_norm": 1.4529634041476687, + "learning_rate": 9.194890861193478e-07, + "loss": 0.3003, + "step": 27918 + }, + { + "epoch": 0.81, + "grad_norm": 1.4465957425037643, + "learning_rate": 9.192176564902139e-07, + "loss": 0.2739, + "step": 27919 + }, + { + "epoch": 0.81, + "grad_norm": 1.4863121510020392, + "learning_rate": 9.189462628740293e-07, + "loss": 0.2713, + "step": 27920 + }, + { + "epoch": 0.81, + "grad_norm": 0.9621298501312585, + "learning_rate": 9.186749052731891e-07, + "loss": 0.5886, + "step": 27921 + }, + { + "epoch": 0.81, + "grad_norm": 1.275560483142527, + "learning_rate": 9.184035836900895e-07, + "loss": 0.2821, + "step": 27922 + }, + { + "epoch": 0.81, + "grad_norm": 1.3841497604168895, + "learning_rate": 9.181322981271224e-07, + "loss": 0.2751, + "step": 27923 + }, + { + "epoch": 0.81, + "grad_norm": 2.2877748231493134, + "learning_rate": 9.17861048586683e-07, + "loss": 0.294, + "step": 27924 + }, + { + "epoch": 0.81, + "grad_norm": 1.37174595697392, + "learning_rate": 9.175898350711648e-07, + "loss": 0.2596, + "step": 27925 + }, + { + "epoch": 0.81, + "grad_norm": 1.3878928106264194, + "learning_rate": 9.173186575829618e-07, + "loss": 0.265, + "step": 27926 + }, + { + "epoch": 0.81, + "grad_norm": 1.3215065882279198, + "learning_rate": 9.170475161244674e-07, + "loss": 0.275, + "step": 27927 + }, + { + "epoch": 0.81, + "grad_norm": 1.5094998842476526, + "learning_rate": 9.167764106980737e-07, + "loss": 0.2713, + "step": 27928 + }, + { + "epoch": 0.81, + "grad_norm": 1.274063699537564, + "learning_rate": 9.165053413061731e-07, + "loss": 0.2866, + "step": 27929 + }, + { + "epoch": 0.81, + "grad_norm": 1.3033124370485074, + "learning_rate": 9.162343079511582e-07, + "loss": 0.2691, + "step": 27930 + }, + { + "epoch": 0.81, + "grad_norm": 1.6364231673416083, + "learning_rate": 9.159633106354226e-07, + "loss": 0.2731, + "step": 27931 + }, + { + "epoch": 0.81, + "grad_norm": 1.2629426845018559, + "learning_rate": 9.156923493613545e-07, + "loss": 0.2552, + "step": 27932 + }, + { + "epoch": 0.81, + "grad_norm": 1.2710849384790535, + "learning_rate": 9.154214241313469e-07, + "loss": 0.2636, + "step": 27933 + }, + { + "epoch": 0.81, + "grad_norm": 1.2821552710869004, + "learning_rate": 9.151505349477901e-07, + "loss": 0.2573, + "step": 27934 + }, + { + "epoch": 0.81, + "grad_norm": 1.2235537692001082, + "learning_rate": 9.148796818130756e-07, + "loss": 0.2575, + "step": 27935 + }, + { + "epoch": 0.81, + "grad_norm": 1.4094295904551102, + "learning_rate": 9.146088647295931e-07, + "loss": 0.2666, + "step": 27936 + }, + { + "epoch": 0.81, + "grad_norm": 2.5456974390171925, + "learning_rate": 9.143380836997323e-07, + "loss": 0.3104, + "step": 27937 + }, + { + "epoch": 0.81, + "grad_norm": 1.52241120869999, + "learning_rate": 9.140673387258836e-07, + "loss": 0.2802, + "step": 27938 + }, + { + "epoch": 0.81, + "grad_norm": 1.333495745760937, + "learning_rate": 9.137966298104373e-07, + "loss": 0.2728, + "step": 27939 + }, + { + "epoch": 0.81, + "grad_norm": 1.3270519787327288, + "learning_rate": 9.135259569557797e-07, + "loss": 0.2997, + "step": 27940 + }, + { + "epoch": 0.81, + "grad_norm": 1.3179669684747735, + "learning_rate": 9.132553201643013e-07, + "loss": 0.2908, + "step": 27941 + }, + { + "epoch": 0.81, + "grad_norm": 2.1616772156888118, + "learning_rate": 9.129847194383896e-07, + "loss": 0.2664, + "step": 27942 + }, + { + "epoch": 0.81, + "grad_norm": 1.3372018631511107, + "learning_rate": 9.12714154780433e-07, + "loss": 0.282, + "step": 27943 + }, + { + "epoch": 0.81, + "grad_norm": 1.3619898186203971, + "learning_rate": 9.124436261928205e-07, + "loss": 0.2749, + "step": 27944 + }, + { + "epoch": 0.81, + "grad_norm": 1.264842176225198, + "learning_rate": 9.121731336779377e-07, + "loss": 0.2621, + "step": 27945 + }, + { + "epoch": 0.81, + "grad_norm": 1.3191952355626184, + "learning_rate": 9.119026772381718e-07, + "loss": 0.2862, + "step": 27946 + }, + { + "epoch": 0.81, + "grad_norm": 1.231405959349637, + "learning_rate": 9.116322568759101e-07, + "loss": 0.2445, + "step": 27947 + }, + { + "epoch": 0.81, + "grad_norm": 1.2888720451323639, + "learning_rate": 9.113618725935391e-07, + "loss": 0.2449, + "step": 27948 + }, + { + "epoch": 0.81, + "grad_norm": 1.4035063140925608, + "learning_rate": 9.110915243934454e-07, + "loss": 0.2852, + "step": 27949 + }, + { + "epoch": 0.81, + "grad_norm": 1.5164116717750187, + "learning_rate": 9.108212122780153e-07, + "loss": 0.2826, + "step": 27950 + }, + { + "epoch": 0.81, + "grad_norm": 1.3290854302861166, + "learning_rate": 9.105509362496317e-07, + "loss": 0.276, + "step": 27951 + }, + { + "epoch": 0.81, + "grad_norm": 0.9531926822472887, + "learning_rate": 9.102806963106819e-07, + "loss": 0.566, + "step": 27952 + }, + { + "epoch": 0.81, + "grad_norm": 1.317240403632963, + "learning_rate": 9.100104924635506e-07, + "loss": 0.267, + "step": 27953 + }, + { + "epoch": 0.81, + "grad_norm": 1.4117835832623624, + "learning_rate": 9.097403247106218e-07, + "loss": 0.2873, + "step": 27954 + }, + { + "epoch": 0.81, + "grad_norm": 1.4065765017585772, + "learning_rate": 9.094701930542799e-07, + "loss": 0.2659, + "step": 27955 + }, + { + "epoch": 0.81, + "grad_norm": 1.5194643913237114, + "learning_rate": 9.09200097496909e-07, + "loss": 0.3149, + "step": 27956 + }, + { + "epoch": 0.81, + "grad_norm": 1.7059057558520627, + "learning_rate": 9.089300380408927e-07, + "loss": 0.2508, + "step": 27957 + }, + { + "epoch": 0.81, + "grad_norm": 1.2501882696769515, + "learning_rate": 9.086600146886154e-07, + "loss": 0.2758, + "step": 27958 + }, + { + "epoch": 0.81, + "grad_norm": 1.3957726956420098, + "learning_rate": 9.083900274424573e-07, + "loss": 0.2691, + "step": 27959 + }, + { + "epoch": 0.81, + "grad_norm": 1.4057949548497413, + "learning_rate": 9.081200763048031e-07, + "loss": 0.2654, + "step": 27960 + }, + { + "epoch": 0.81, + "grad_norm": 1.29502759526254, + "learning_rate": 9.078501612780344e-07, + "loss": 0.2557, + "step": 27961 + }, + { + "epoch": 0.81, + "grad_norm": 1.463949830732324, + "learning_rate": 9.07580282364533e-07, + "loss": 0.2519, + "step": 27962 + }, + { + "epoch": 0.81, + "grad_norm": 1.3466758759023605, + "learning_rate": 9.073104395666815e-07, + "loss": 0.2682, + "step": 27963 + }, + { + "epoch": 0.81, + "grad_norm": 1.272491322764782, + "learning_rate": 9.070406328868608e-07, + "loss": 0.2738, + "step": 27964 + }, + { + "epoch": 0.81, + "grad_norm": 1.3689206396739524, + "learning_rate": 9.067708623274518e-07, + "loss": 0.2701, + "step": 27965 + }, + { + "epoch": 0.81, + "grad_norm": 1.2587210996175902, + "learning_rate": 9.065011278908353e-07, + "loss": 0.2497, + "step": 27966 + }, + { + "epoch": 0.81, + "grad_norm": 1.23962685682296, + "learning_rate": 9.062314295793929e-07, + "loss": 0.2764, + "step": 27967 + }, + { + "epoch": 0.81, + "grad_norm": 1.4344064997523958, + "learning_rate": 9.059617673955029e-07, + "loss": 0.278, + "step": 27968 + }, + { + "epoch": 0.81, + "grad_norm": 1.379108689640533, + "learning_rate": 9.056921413415448e-07, + "loss": 0.2912, + "step": 27969 + }, + { + "epoch": 0.81, + "grad_norm": 1.4555554253541698, + "learning_rate": 9.054225514198994e-07, + "loss": 0.2512, + "step": 27970 + }, + { + "epoch": 0.81, + "grad_norm": 1.756974255031976, + "learning_rate": 9.05152997632946e-07, + "loss": 0.2729, + "step": 27971 + }, + { + "epoch": 0.81, + "grad_norm": 1.4627838138998464, + "learning_rate": 9.048834799830631e-07, + "loss": 0.2657, + "step": 27972 + }, + { + "epoch": 0.81, + "grad_norm": 1.5962485998576696, + "learning_rate": 9.046139984726282e-07, + "loss": 0.273, + "step": 27973 + }, + { + "epoch": 0.81, + "grad_norm": 1.3701440900069022, + "learning_rate": 9.043445531040201e-07, + "loss": 0.3163, + "step": 27974 + }, + { + "epoch": 0.81, + "grad_norm": 1.3886045854895912, + "learning_rate": 9.040751438796169e-07, + "loss": 0.2818, + "step": 27975 + }, + { + "epoch": 0.81, + "grad_norm": 1.4035867439464238, + "learning_rate": 9.038057708017956e-07, + "loss": 0.3043, + "step": 27976 + }, + { + "epoch": 0.81, + "grad_norm": 1.6148953228896852, + "learning_rate": 9.035364338729352e-07, + "loss": 0.3045, + "step": 27977 + }, + { + "epoch": 0.81, + "grad_norm": 1.538735744832331, + "learning_rate": 9.032671330954101e-07, + "loss": 0.2609, + "step": 27978 + }, + { + "epoch": 0.81, + "grad_norm": 1.2875641357113616, + "learning_rate": 9.029978684715985e-07, + "loss": 0.2586, + "step": 27979 + }, + { + "epoch": 0.81, + "grad_norm": 1.3277369528647003, + "learning_rate": 9.027286400038754e-07, + "loss": 0.2807, + "step": 27980 + }, + { + "epoch": 0.81, + "grad_norm": 1.314556518226977, + "learning_rate": 9.024594476946174e-07, + "loss": 0.2807, + "step": 27981 + }, + { + "epoch": 0.81, + "grad_norm": 1.297544707342809, + "learning_rate": 9.021902915462006e-07, + "loss": 0.267, + "step": 27982 + }, + { + "epoch": 0.81, + "grad_norm": 1.2398143115660671, + "learning_rate": 9.019211715609999e-07, + "loss": 0.2751, + "step": 27983 + }, + { + "epoch": 0.81, + "grad_norm": 1.3722559194345416, + "learning_rate": 9.016520877413904e-07, + "loss": 0.2799, + "step": 27984 + }, + { + "epoch": 0.81, + "grad_norm": 1.473907011786562, + "learning_rate": 9.013830400897467e-07, + "loss": 0.2646, + "step": 27985 + }, + { + "epoch": 0.81, + "grad_norm": 1.3782110427452856, + "learning_rate": 9.011140286084441e-07, + "loss": 0.2692, + "step": 27986 + }, + { + "epoch": 0.81, + "grad_norm": 1.176985686848924, + "learning_rate": 9.008450532998547e-07, + "loss": 0.2577, + "step": 27987 + }, + { + "epoch": 0.81, + "grad_norm": 1.4115430127884212, + "learning_rate": 9.005761141663532e-07, + "loss": 0.2891, + "step": 27988 + }, + { + "epoch": 0.81, + "grad_norm": 1.232547139996705, + "learning_rate": 9.003072112103129e-07, + "loss": 0.2582, + "step": 27989 + }, + { + "epoch": 0.81, + "grad_norm": 1.4070518449072429, + "learning_rate": 9.000383444341071e-07, + "loss": 0.2729, + "step": 27990 + }, + { + "epoch": 0.81, + "grad_norm": 1.3181406255927635, + "learning_rate": 8.99769513840108e-07, + "loss": 0.2736, + "step": 27991 + }, + { + "epoch": 0.81, + "grad_norm": 1.4139634644440302, + "learning_rate": 8.995007194306887e-07, + "loss": 0.2616, + "step": 27992 + }, + { + "epoch": 0.81, + "grad_norm": 1.3743179622965231, + "learning_rate": 8.992319612082206e-07, + "loss": 0.2907, + "step": 27993 + }, + { + "epoch": 0.81, + "grad_norm": 1.5797908606794204, + "learning_rate": 8.989632391750779e-07, + "loss": 0.2746, + "step": 27994 + }, + { + "epoch": 0.81, + "grad_norm": 1.5107238635447429, + "learning_rate": 8.986945533336283e-07, + "loss": 0.2786, + "step": 27995 + }, + { + "epoch": 0.81, + "grad_norm": 1.5137537385360527, + "learning_rate": 8.984259036862452e-07, + "loss": 0.2805, + "step": 27996 + }, + { + "epoch": 0.81, + "grad_norm": 1.2198110296011322, + "learning_rate": 8.981572902352987e-07, + "loss": 0.2711, + "step": 27997 + }, + { + "epoch": 0.81, + "grad_norm": 2.557125255919989, + "learning_rate": 8.978887129831599e-07, + "loss": 0.2932, + "step": 27998 + }, + { + "epoch": 0.81, + "grad_norm": 1.3315082122955166, + "learning_rate": 8.976201719321981e-07, + "loss": 0.2658, + "step": 27999 + }, + { + "epoch": 0.81, + "grad_norm": 1.3307669511850695, + "learning_rate": 8.973516670847853e-07, + "loss": 0.274, + "step": 28000 + }, + { + "epoch": 0.81, + "grad_norm": 1.415831779497598, + "learning_rate": 8.970831984432888e-07, + "loss": 0.2781, + "step": 28001 + }, + { + "epoch": 0.81, + "grad_norm": 1.2925160724214448, + "learning_rate": 8.968147660100779e-07, + "loss": 0.2673, + "step": 28002 + }, + { + "epoch": 0.81, + "grad_norm": 1.4779563016753168, + "learning_rate": 8.965463697875226e-07, + "loss": 0.2845, + "step": 28003 + }, + { + "epoch": 0.81, + "grad_norm": 1.3065440802819954, + "learning_rate": 8.962780097779927e-07, + "loss": 0.2885, + "step": 28004 + }, + { + "epoch": 0.81, + "grad_norm": 1.4173807304068684, + "learning_rate": 8.96009685983853e-07, + "loss": 0.2658, + "step": 28005 + }, + { + "epoch": 0.81, + "grad_norm": 1.377582185985319, + "learning_rate": 8.957413984074737e-07, + "loss": 0.2752, + "step": 28006 + }, + { + "epoch": 0.81, + "grad_norm": 1.2927404726334064, + "learning_rate": 8.954731470512218e-07, + "loss": 0.2984, + "step": 28007 + }, + { + "epoch": 0.81, + "grad_norm": 1.479213590260282, + "learning_rate": 8.952049319174649e-07, + "loss": 0.266, + "step": 28008 + }, + { + "epoch": 0.81, + "grad_norm": 1.1942414678093611, + "learning_rate": 8.949367530085706e-07, + "loss": 0.2573, + "step": 28009 + }, + { + "epoch": 0.81, + "grad_norm": 0.9140953695285098, + "learning_rate": 8.946686103269043e-07, + "loss": 0.5951, + "step": 28010 + }, + { + "epoch": 0.81, + "grad_norm": 1.3754986834940361, + "learning_rate": 8.944005038748332e-07, + "loss": 0.2757, + "step": 28011 + }, + { + "epoch": 0.81, + "grad_norm": 3.9291371921952303, + "learning_rate": 8.941324336547236e-07, + "loss": 0.2577, + "step": 28012 + }, + { + "epoch": 0.81, + "grad_norm": 1.253699047787903, + "learning_rate": 8.938643996689417e-07, + "loss": 0.2623, + "step": 28013 + }, + { + "epoch": 0.81, + "grad_norm": 1.4461682009613641, + "learning_rate": 8.935964019198512e-07, + "loss": 0.2439, + "step": 28014 + }, + { + "epoch": 0.81, + "grad_norm": 2.520574962296655, + "learning_rate": 8.933284404098174e-07, + "loss": 0.2589, + "step": 28015 + }, + { + "epoch": 0.81, + "grad_norm": 1.2772800252589716, + "learning_rate": 8.930605151412064e-07, + "loss": 0.2596, + "step": 28016 + }, + { + "epoch": 0.81, + "grad_norm": 1.3842766732615446, + "learning_rate": 8.92792626116381e-07, + "loss": 0.2611, + "step": 28017 + }, + { + "epoch": 0.81, + "grad_norm": 1.323214696392494, + "learning_rate": 8.925247733377069e-07, + "loss": 0.2798, + "step": 28018 + }, + { + "epoch": 0.81, + "grad_norm": 1.4608771449984543, + "learning_rate": 8.922569568075473e-07, + "loss": 0.2781, + "step": 28019 + }, + { + "epoch": 0.81, + "grad_norm": 2.6714160229866413, + "learning_rate": 8.919891765282651e-07, + "loss": 0.2896, + "step": 28020 + }, + { + "epoch": 0.81, + "grad_norm": 1.6292704154997966, + "learning_rate": 8.917214325022245e-07, + "loss": 0.2744, + "step": 28021 + }, + { + "epoch": 0.81, + "grad_norm": 1.4562546018899596, + "learning_rate": 8.914537247317889e-07, + "loss": 0.2855, + "step": 28022 + }, + { + "epoch": 0.81, + "grad_norm": 1.2480218458945769, + "learning_rate": 8.911860532193184e-07, + "loss": 0.2538, + "step": 28023 + }, + { + "epoch": 0.81, + "grad_norm": 1.3154016287298131, + "learning_rate": 8.909184179671764e-07, + "loss": 0.2636, + "step": 28024 + }, + { + "epoch": 0.81, + "grad_norm": 1.9223686339875992, + "learning_rate": 8.906508189777252e-07, + "loss": 0.2856, + "step": 28025 + }, + { + "epoch": 0.81, + "grad_norm": 1.270269724733556, + "learning_rate": 8.90383256253326e-07, + "loss": 0.2574, + "step": 28026 + }, + { + "epoch": 0.81, + "grad_norm": 0.9382807697322773, + "learning_rate": 8.901157297963403e-07, + "loss": 0.5702, + "step": 28027 + }, + { + "epoch": 0.81, + "grad_norm": 1.2479823614908832, + "learning_rate": 8.898482396091301e-07, + "loss": 0.2634, + "step": 28028 + }, + { + "epoch": 0.81, + "grad_norm": 1.4326497387009327, + "learning_rate": 8.895807856940536e-07, + "loss": 0.2708, + "step": 28029 + }, + { + "epoch": 0.81, + "grad_norm": 1.4516246358960474, + "learning_rate": 8.893133680534721e-07, + "loss": 0.2652, + "step": 28030 + }, + { + "epoch": 0.81, + "grad_norm": 1.2375771795200121, + "learning_rate": 8.890459866897461e-07, + "loss": 0.2829, + "step": 28031 + }, + { + "epoch": 0.81, + "grad_norm": 1.4796728749117016, + "learning_rate": 8.887786416052357e-07, + "loss": 0.2837, + "step": 28032 + }, + { + "epoch": 0.81, + "grad_norm": 1.3971822840778165, + "learning_rate": 8.885113328022981e-07, + "loss": 0.2719, + "step": 28033 + }, + { + "epoch": 0.81, + "grad_norm": 1.3110128272351813, + "learning_rate": 8.882440602832936e-07, + "loss": 0.2624, + "step": 28034 + }, + { + "epoch": 0.81, + "grad_norm": 1.2635220104999774, + "learning_rate": 8.879768240505809e-07, + "loss": 0.2601, + "step": 28035 + }, + { + "epoch": 0.81, + "grad_norm": 1.8889286434909867, + "learning_rate": 8.877096241065186e-07, + "loss": 0.2692, + "step": 28036 + }, + { + "epoch": 0.81, + "grad_norm": 1.610698537292357, + "learning_rate": 8.874424604534643e-07, + "loss": 0.2961, + "step": 28037 + }, + { + "epoch": 0.81, + "grad_norm": 1.4588529881912138, + "learning_rate": 8.87175333093776e-07, + "loss": 0.3178, + "step": 28038 + }, + { + "epoch": 0.81, + "grad_norm": 1.5208077205850905, + "learning_rate": 8.869082420298114e-07, + "loss": 0.2636, + "step": 28039 + }, + { + "epoch": 0.81, + "grad_norm": 1.252328680319081, + "learning_rate": 8.86641187263928e-07, + "loss": 0.2702, + "step": 28040 + }, + { + "epoch": 0.81, + "grad_norm": 0.8792126011315966, + "learning_rate": 8.863741687984806e-07, + "loss": 0.5183, + "step": 28041 + }, + { + "epoch": 0.81, + "grad_norm": 1.4658489671299215, + "learning_rate": 8.861071866358262e-07, + "loss": 0.2901, + "step": 28042 + }, + { + "epoch": 0.81, + "grad_norm": 1.5763376050211966, + "learning_rate": 8.858402407783223e-07, + "loss": 0.2692, + "step": 28043 + }, + { + "epoch": 0.81, + "grad_norm": 1.3490208781382083, + "learning_rate": 8.855733312283237e-07, + "loss": 0.2662, + "step": 28044 + }, + { + "epoch": 0.81, + "grad_norm": 1.203109590115411, + "learning_rate": 8.85306457988186e-07, + "loss": 0.2814, + "step": 28045 + }, + { + "epoch": 0.81, + "grad_norm": 1.3139133819003568, + "learning_rate": 8.850396210602646e-07, + "loss": 0.2675, + "step": 28046 + }, + { + "epoch": 0.81, + "grad_norm": 1.2986193063984237, + "learning_rate": 8.847728204469141e-07, + "loss": 0.2721, + "step": 28047 + }, + { + "epoch": 0.81, + "grad_norm": 1.2477643744019902, + "learning_rate": 8.845060561504893e-07, + "loss": 0.2645, + "step": 28048 + }, + { + "epoch": 0.81, + "grad_norm": 1.3069758603952846, + "learning_rate": 8.842393281733453e-07, + "loss": 0.2731, + "step": 28049 + }, + { + "epoch": 0.81, + "grad_norm": 2.1240549029611624, + "learning_rate": 8.83972636517833e-07, + "loss": 0.285, + "step": 28050 + }, + { + "epoch": 0.81, + "grad_norm": 1.6417520924669387, + "learning_rate": 8.837059811863086e-07, + "loss": 0.3034, + "step": 28051 + }, + { + "epoch": 0.81, + "grad_norm": 1.6752342307878576, + "learning_rate": 8.834393621811244e-07, + "loss": 0.2625, + "step": 28052 + }, + { + "epoch": 0.81, + "grad_norm": 1.4187849759874158, + "learning_rate": 8.831727795046335e-07, + "loss": 0.3116, + "step": 28053 + }, + { + "epoch": 0.81, + "grad_norm": 1.405522762189423, + "learning_rate": 8.829062331591886e-07, + "loss": 0.2696, + "step": 28054 + }, + { + "epoch": 0.81, + "grad_norm": 1.5418627212037193, + "learning_rate": 8.826397231471417e-07, + "loss": 0.2768, + "step": 28055 + }, + { + "epoch": 0.81, + "grad_norm": 1.3314190747916748, + "learning_rate": 8.823732494708464e-07, + "loss": 0.2815, + "step": 28056 + }, + { + "epoch": 0.81, + "grad_norm": 1.357488916798583, + "learning_rate": 8.821068121326515e-07, + "loss": 0.2746, + "step": 28057 + }, + { + "epoch": 0.81, + "grad_norm": 1.312581948152312, + "learning_rate": 8.818404111349094e-07, + "loss": 0.2879, + "step": 28058 + }, + { + "epoch": 0.81, + "grad_norm": 1.4165501163141514, + "learning_rate": 8.815740464799733e-07, + "loss": 0.2797, + "step": 28059 + }, + { + "epoch": 0.81, + "grad_norm": 1.0538021483696056, + "learning_rate": 8.813077181701901e-07, + "loss": 0.5599, + "step": 28060 + }, + { + "epoch": 0.81, + "grad_norm": 1.3374160388756897, + "learning_rate": 8.81041426207912e-07, + "loss": 0.2707, + "step": 28061 + }, + { + "epoch": 0.81, + "grad_norm": 1.3031581094438462, + "learning_rate": 8.807751705954892e-07, + "loss": 0.3042, + "step": 28062 + }, + { + "epoch": 0.81, + "grad_norm": 1.3355058015082604, + "learning_rate": 8.80508951335271e-07, + "loss": 0.2921, + "step": 28063 + }, + { + "epoch": 0.81, + "grad_norm": 1.4269886381082886, + "learning_rate": 8.802427684296073e-07, + "loss": 0.2819, + "step": 28064 + }, + { + "epoch": 0.81, + "grad_norm": 1.2803672926637806, + "learning_rate": 8.799766218808464e-07, + "loss": 0.2834, + "step": 28065 + }, + { + "epoch": 0.81, + "grad_norm": 1.2452943362654578, + "learning_rate": 8.79710511691338e-07, + "loss": 0.3164, + "step": 28066 + }, + { + "epoch": 0.81, + "grad_norm": 1.2919699910099012, + "learning_rate": 8.794444378634299e-07, + "loss": 0.2691, + "step": 28067 + }, + { + "epoch": 0.81, + "grad_norm": 1.287851869678556, + "learning_rate": 8.791784003994713e-07, + "loss": 0.2606, + "step": 28068 + }, + { + "epoch": 0.81, + "grad_norm": 1.3570219906904941, + "learning_rate": 8.789123993018078e-07, + "loss": 0.2685, + "step": 28069 + }, + { + "epoch": 0.81, + "grad_norm": 1.3276471394153233, + "learning_rate": 8.786464345727885e-07, + "loss": 0.2663, + "step": 28070 + }, + { + "epoch": 0.81, + "grad_norm": 1.3594473303743126, + "learning_rate": 8.783805062147598e-07, + "loss": 0.2639, + "step": 28071 + }, + { + "epoch": 0.81, + "grad_norm": 1.3617927140301151, + "learning_rate": 8.781146142300689e-07, + "loss": 0.2657, + "step": 28072 + }, + { + "epoch": 0.81, + "grad_norm": 1.43954873293274, + "learning_rate": 8.778487586210627e-07, + "loss": 0.2921, + "step": 28073 + }, + { + "epoch": 0.81, + "grad_norm": 1.3207184750493244, + "learning_rate": 8.775829393900864e-07, + "loss": 0.2732, + "step": 28074 + }, + { + "epoch": 0.81, + "grad_norm": 1.9068909709723445, + "learning_rate": 8.773171565394866e-07, + "loss": 0.289, + "step": 28075 + }, + { + "epoch": 0.81, + "grad_norm": 1.413596776456305, + "learning_rate": 8.770514100716099e-07, + "loss": 0.2691, + "step": 28076 + }, + { + "epoch": 0.81, + "grad_norm": 1.3848922858617128, + "learning_rate": 8.767856999887992e-07, + "loss": 0.286, + "step": 28077 + }, + { + "epoch": 0.81, + "grad_norm": 1.420134203272354, + "learning_rate": 8.765200262933998e-07, + "loss": 0.2529, + "step": 28078 + }, + { + "epoch": 0.81, + "grad_norm": 1.4237098954009184, + "learning_rate": 8.762543889877573e-07, + "loss": 0.3081, + "step": 28079 + }, + { + "epoch": 0.81, + "grad_norm": 1.5358108578006349, + "learning_rate": 8.759887880742157e-07, + "loss": 0.2822, + "step": 28080 + }, + { + "epoch": 0.81, + "grad_norm": 1.8473189241920867, + "learning_rate": 8.757232235551188e-07, + "loss": 0.2819, + "step": 28081 + }, + { + "epoch": 0.81, + "grad_norm": 1.3646744577970602, + "learning_rate": 8.754576954328098e-07, + "loss": 0.2923, + "step": 28082 + }, + { + "epoch": 0.81, + "grad_norm": 1.2433562917181467, + "learning_rate": 8.751922037096328e-07, + "loss": 0.2662, + "step": 28083 + }, + { + "epoch": 0.81, + "grad_norm": 1.6238431341312598, + "learning_rate": 8.749267483879315e-07, + "loss": 0.2711, + "step": 28084 + }, + { + "epoch": 0.81, + "grad_norm": 0.9202005790540398, + "learning_rate": 8.746613294700462e-07, + "loss": 0.5968, + "step": 28085 + }, + { + "epoch": 0.81, + "grad_norm": 1.3928864575989186, + "learning_rate": 8.743959469583202e-07, + "loss": 0.2591, + "step": 28086 + }, + { + "epoch": 0.81, + "grad_norm": 1.5385502259661186, + "learning_rate": 8.741306008550971e-07, + "loss": 0.2903, + "step": 28087 + }, + { + "epoch": 0.81, + "grad_norm": 1.2432105257722186, + "learning_rate": 8.73865291162716e-07, + "loss": 0.2703, + "step": 28088 + }, + { + "epoch": 0.81, + "grad_norm": 1.4602219695342997, + "learning_rate": 8.736000178835197e-07, + "loss": 0.3147, + "step": 28089 + }, + { + "epoch": 0.81, + "grad_norm": 1.225116185247583, + "learning_rate": 8.733347810198483e-07, + "loss": 0.2796, + "step": 28090 + }, + { + "epoch": 0.81, + "grad_norm": 1.2821634398714903, + "learning_rate": 8.730695805740441e-07, + "loss": 0.2602, + "step": 28091 + }, + { + "epoch": 0.81, + "grad_norm": 1.3162102771836015, + "learning_rate": 8.728044165484462e-07, + "loss": 0.2731, + "step": 28092 + }, + { + "epoch": 0.81, + "grad_norm": 1.3305181885514357, + "learning_rate": 8.725392889453954e-07, + "loss": 0.2639, + "step": 28093 + }, + { + "epoch": 0.81, + "grad_norm": 1.2364239672069972, + "learning_rate": 8.722741977672306e-07, + "loss": 0.2726, + "step": 28094 + }, + { + "epoch": 0.81, + "grad_norm": 1.6221875009049842, + "learning_rate": 8.720091430162936e-07, + "loss": 0.2869, + "step": 28095 + }, + { + "epoch": 0.81, + "grad_norm": 1.4221350420904355, + "learning_rate": 8.717441246949205e-07, + "loss": 0.2719, + "step": 28096 + }, + { + "epoch": 0.81, + "grad_norm": 1.4689435930489512, + "learning_rate": 8.714791428054509e-07, + "loss": 0.2628, + "step": 28097 + }, + { + "epoch": 0.81, + "grad_norm": 1.2563445343532735, + "learning_rate": 8.712141973502236e-07, + "loss": 0.2924, + "step": 28098 + }, + { + "epoch": 0.82, + "grad_norm": 1.6292453867015266, + "learning_rate": 8.709492883315767e-07, + "loss": 0.2801, + "step": 28099 + }, + { + "epoch": 0.82, + "grad_norm": 1.3964528051328853, + "learning_rate": 8.70684415751849e-07, + "loss": 0.2984, + "step": 28100 + }, + { + "epoch": 0.82, + "grad_norm": 1.4292337523474827, + "learning_rate": 8.704195796133763e-07, + "loss": 0.2616, + "step": 28101 + }, + { + "epoch": 0.82, + "grad_norm": 1.2629292117053041, + "learning_rate": 8.701547799184967e-07, + "loss": 0.2647, + "step": 28102 + }, + { + "epoch": 0.82, + "grad_norm": 1.4409062393099228, + "learning_rate": 8.698900166695473e-07, + "loss": 0.2916, + "step": 28103 + }, + { + "epoch": 0.82, + "grad_norm": 1.2363988017913894, + "learning_rate": 8.696252898688656e-07, + "loss": 0.2573, + "step": 28104 + }, + { + "epoch": 0.82, + "grad_norm": 3.5778394947017556, + "learning_rate": 8.693605995187853e-07, + "loss": 0.2677, + "step": 28105 + }, + { + "epoch": 0.82, + "grad_norm": 1.6701662383867086, + "learning_rate": 8.690959456216436e-07, + "loss": 0.2793, + "step": 28106 + }, + { + "epoch": 0.82, + "grad_norm": 1.243268706186064, + "learning_rate": 8.688313281797755e-07, + "loss": 0.2563, + "step": 28107 + }, + { + "epoch": 0.82, + "grad_norm": 1.4022897691823148, + "learning_rate": 8.685667471955172e-07, + "loss": 0.2707, + "step": 28108 + }, + { + "epoch": 0.82, + "grad_norm": 1.4237400756535152, + "learning_rate": 8.683022026712029e-07, + "loss": 0.2627, + "step": 28109 + }, + { + "epoch": 0.82, + "grad_norm": 1.5538170564384535, + "learning_rate": 8.680376946091673e-07, + "loss": 0.2719, + "step": 28110 + }, + { + "epoch": 0.82, + "grad_norm": 1.2767969935262033, + "learning_rate": 8.677732230117453e-07, + "loss": 0.2766, + "step": 28111 + }, + { + "epoch": 0.82, + "grad_norm": 1.4465388951983822, + "learning_rate": 8.675087878812716e-07, + "loss": 0.3169, + "step": 28112 + }, + { + "epoch": 0.82, + "grad_norm": 1.4430183858105587, + "learning_rate": 8.672443892200777e-07, + "loss": 0.2719, + "step": 28113 + }, + { + "epoch": 0.82, + "grad_norm": 1.3712453981226844, + "learning_rate": 8.66980027030499e-07, + "loss": 0.2931, + "step": 28114 + }, + { + "epoch": 0.82, + "grad_norm": 1.2416557754098854, + "learning_rate": 8.667157013148659e-07, + "loss": 0.2596, + "step": 28115 + }, + { + "epoch": 0.82, + "grad_norm": 1.570787427452258, + "learning_rate": 8.66451412075513e-07, + "loss": 0.2613, + "step": 28116 + }, + { + "epoch": 0.82, + "grad_norm": 1.3745150586112769, + "learning_rate": 8.661871593147719e-07, + "loss": 0.2649, + "step": 28117 + }, + { + "epoch": 0.82, + "grad_norm": 8.223864521645226, + "learning_rate": 8.659229430349753e-07, + "loss": 0.2664, + "step": 28118 + }, + { + "epoch": 0.82, + "grad_norm": 1.5099659090560942, + "learning_rate": 8.656587632384539e-07, + "loss": 0.2851, + "step": 28119 + }, + { + "epoch": 0.82, + "grad_norm": 1.2625177278117743, + "learning_rate": 8.653946199275403e-07, + "loss": 0.2567, + "step": 28120 + }, + { + "epoch": 0.82, + "grad_norm": 1.3284320440880029, + "learning_rate": 8.651305131045651e-07, + "loss": 0.2473, + "step": 28121 + }, + { + "epoch": 0.82, + "grad_norm": 1.417692595243066, + "learning_rate": 8.64866442771859e-07, + "loss": 0.2705, + "step": 28122 + }, + { + "epoch": 0.82, + "grad_norm": 1.2460861278889475, + "learning_rate": 8.646024089317534e-07, + "loss": 0.2636, + "step": 28123 + }, + { + "epoch": 0.82, + "grad_norm": 1.2625080773654056, + "learning_rate": 8.643384115865766e-07, + "loss": 0.27, + "step": 28124 + }, + { + "epoch": 0.82, + "grad_norm": 1.5287688755153563, + "learning_rate": 8.640744507386589e-07, + "loss": 0.2593, + "step": 28125 + }, + { + "epoch": 0.82, + "grad_norm": 1.2788425538654022, + "learning_rate": 8.638105263903301e-07, + "loss": 0.28, + "step": 28126 + }, + { + "epoch": 0.82, + "grad_norm": 1.2934116583469744, + "learning_rate": 8.63546638543919e-07, + "loss": 0.2557, + "step": 28127 + }, + { + "epoch": 0.82, + "grad_norm": 1.3375112052115363, + "learning_rate": 8.632827872017552e-07, + "loss": 0.2641, + "step": 28128 + }, + { + "epoch": 0.82, + "grad_norm": 1.2993483575057307, + "learning_rate": 8.630189723661663e-07, + "loss": 0.2708, + "step": 28129 + }, + { + "epoch": 0.82, + "grad_norm": 1.7199650582604749, + "learning_rate": 8.627551940394807e-07, + "loss": 0.2716, + "step": 28130 + }, + { + "epoch": 0.82, + "grad_norm": 1.3128837250760135, + "learning_rate": 8.62491452224028e-07, + "loss": 0.2716, + "step": 28131 + }, + { + "epoch": 0.82, + "grad_norm": 1.7618355695310326, + "learning_rate": 8.622277469221329e-07, + "loss": 0.269, + "step": 28132 + }, + { + "epoch": 0.82, + "grad_norm": 1.3893464298593474, + "learning_rate": 8.619640781361238e-07, + "loss": 0.2933, + "step": 28133 + }, + { + "epoch": 0.82, + "grad_norm": 1.346232622741342, + "learning_rate": 8.617004458683275e-07, + "loss": 0.2857, + "step": 28134 + }, + { + "epoch": 0.82, + "grad_norm": 1.278615613155664, + "learning_rate": 8.614368501210713e-07, + "loss": 0.2749, + "step": 28135 + }, + { + "epoch": 0.82, + "grad_norm": 2.198472887240453, + "learning_rate": 8.611732908966802e-07, + "loss": 0.2546, + "step": 28136 + }, + { + "epoch": 0.82, + "grad_norm": 1.941957295727503, + "learning_rate": 8.609097681974809e-07, + "loss": 0.2939, + "step": 28137 + }, + { + "epoch": 0.82, + "grad_norm": 1.5822025306469798, + "learning_rate": 8.606462820257982e-07, + "loss": 0.2519, + "step": 28138 + }, + { + "epoch": 0.82, + "grad_norm": 1.2805810370201458, + "learning_rate": 8.603828323839586e-07, + "loss": 0.2581, + "step": 28139 + }, + { + "epoch": 0.82, + "grad_norm": 1.8719666746215575, + "learning_rate": 8.601194192742879e-07, + "loss": 0.2774, + "step": 28140 + }, + { + "epoch": 0.82, + "grad_norm": 1.4656679791904481, + "learning_rate": 8.598560426991077e-07, + "loss": 0.2725, + "step": 28141 + }, + { + "epoch": 0.82, + "grad_norm": 1.3450396040051977, + "learning_rate": 8.595927026607448e-07, + "loss": 0.2686, + "step": 28142 + }, + { + "epoch": 0.82, + "grad_norm": 1.3292065029289022, + "learning_rate": 8.593293991615215e-07, + "loss": 0.2864, + "step": 28143 + }, + { + "epoch": 0.82, + "grad_norm": 1.0594741416964901, + "learning_rate": 8.590661322037618e-07, + "loss": 0.6624, + "step": 28144 + }, + { + "epoch": 0.82, + "grad_norm": 1.2351179658710734, + "learning_rate": 8.588029017897898e-07, + "loss": 0.2729, + "step": 28145 + }, + { + "epoch": 0.82, + "grad_norm": 1.4039733626389197, + "learning_rate": 8.585397079219277e-07, + "loss": 0.2649, + "step": 28146 + }, + { + "epoch": 0.82, + "grad_norm": 1.3102894549251884, + "learning_rate": 8.582765506024987e-07, + "loss": 0.2486, + "step": 28147 + }, + { + "epoch": 0.82, + "grad_norm": 1.2585912979819371, + "learning_rate": 8.580134298338255e-07, + "loss": 0.2699, + "step": 28148 + }, + { + "epoch": 0.82, + "grad_norm": 1.4141664788587731, + "learning_rate": 8.577503456182296e-07, + "loss": 0.2746, + "step": 28149 + }, + { + "epoch": 0.82, + "grad_norm": 1.4351802448190882, + "learning_rate": 8.57487297958034e-07, + "loss": 0.2752, + "step": 28150 + }, + { + "epoch": 0.82, + "grad_norm": 1.3179733194468604, + "learning_rate": 8.572242868555575e-07, + "loss": 0.2739, + "step": 28151 + }, + { + "epoch": 0.82, + "grad_norm": 1.3178414484209855, + "learning_rate": 8.569613123131226e-07, + "loss": 0.29, + "step": 28152 + }, + { + "epoch": 0.82, + "grad_norm": 1.3072923984850473, + "learning_rate": 8.566983743330504e-07, + "loss": 0.2942, + "step": 28153 + }, + { + "epoch": 0.82, + "grad_norm": 1.9652974653683286, + "learning_rate": 8.564354729176605e-07, + "loss": 0.2802, + "step": 28154 + }, + { + "epoch": 0.82, + "grad_norm": 1.3459937753689142, + "learning_rate": 8.56172608069274e-07, + "loss": 0.2692, + "step": 28155 + }, + { + "epoch": 0.82, + "grad_norm": 1.3210597327206632, + "learning_rate": 8.5590977979021e-07, + "loss": 0.266, + "step": 28156 + }, + { + "epoch": 0.82, + "grad_norm": 1.4897157813784234, + "learning_rate": 8.556469880827884e-07, + "loss": 0.2641, + "step": 28157 + }, + { + "epoch": 0.82, + "grad_norm": 1.3886528228033175, + "learning_rate": 8.553842329493278e-07, + "loss": 0.2475, + "step": 28158 + }, + { + "epoch": 0.82, + "grad_norm": 1.4549266277252224, + "learning_rate": 8.551215143921487e-07, + "loss": 0.2818, + "step": 28159 + }, + { + "epoch": 0.82, + "grad_norm": 1.6731340023684191, + "learning_rate": 8.54858832413567e-07, + "loss": 0.2771, + "step": 28160 + }, + { + "epoch": 0.82, + "grad_norm": 1.3188967060559371, + "learning_rate": 8.545961870159025e-07, + "loss": 0.2677, + "step": 28161 + }, + { + "epoch": 0.82, + "grad_norm": 1.5279841586783578, + "learning_rate": 8.543335782014722e-07, + "loss": 0.2674, + "step": 28162 + }, + { + "epoch": 0.82, + "grad_norm": 1.555740520200503, + "learning_rate": 8.540710059725937e-07, + "loss": 0.2852, + "step": 28163 + }, + { + "epoch": 0.82, + "grad_norm": 1.7011728165288955, + "learning_rate": 8.538084703315852e-07, + "loss": 0.2839, + "step": 28164 + }, + { + "epoch": 0.82, + "grad_norm": 1.255483748384777, + "learning_rate": 8.535459712807626e-07, + "loss": 0.2546, + "step": 28165 + }, + { + "epoch": 0.82, + "grad_norm": 1.4219514547466252, + "learning_rate": 8.532835088224428e-07, + "loss": 0.2916, + "step": 28166 + }, + { + "epoch": 0.82, + "grad_norm": 5.738081232275383, + "learning_rate": 8.530210829589436e-07, + "loss": 0.2729, + "step": 28167 + }, + { + "epoch": 0.82, + "grad_norm": 1.4201319899520708, + "learning_rate": 8.527586936925774e-07, + "loss": 0.2808, + "step": 28168 + }, + { + "epoch": 0.82, + "grad_norm": 1.275399146804787, + "learning_rate": 8.524963410256637e-07, + "loss": 0.2614, + "step": 28169 + }, + { + "epoch": 0.82, + "grad_norm": 1.2708731700843547, + "learning_rate": 8.522340249605143e-07, + "loss": 0.2713, + "step": 28170 + }, + { + "epoch": 0.82, + "grad_norm": 1.3012121803785148, + "learning_rate": 8.519717454994458e-07, + "loss": 0.2793, + "step": 28171 + }, + { + "epoch": 0.82, + "grad_norm": 1.426727399109768, + "learning_rate": 8.517095026447725e-07, + "loss": 0.256, + "step": 28172 + }, + { + "epoch": 0.82, + "grad_norm": 1.8303818212429532, + "learning_rate": 8.514472963988091e-07, + "loss": 0.2848, + "step": 28173 + }, + { + "epoch": 0.82, + "grad_norm": 1.5989136490025075, + "learning_rate": 8.511851267638688e-07, + "loss": 0.2783, + "step": 28174 + }, + { + "epoch": 0.82, + "grad_norm": 1.3650598965709395, + "learning_rate": 8.509229937422664e-07, + "loss": 0.2769, + "step": 28175 + }, + { + "epoch": 0.82, + "grad_norm": 1.6523720521593728, + "learning_rate": 8.50660897336314e-07, + "loss": 0.278, + "step": 28176 + }, + { + "epoch": 0.82, + "grad_norm": 1.2440830728930365, + "learning_rate": 8.503988375483258e-07, + "loss": 0.2756, + "step": 28177 + }, + { + "epoch": 0.82, + "grad_norm": 1.3073784931213661, + "learning_rate": 8.501368143806143e-07, + "loss": 0.2568, + "step": 28178 + }, + { + "epoch": 0.82, + "grad_norm": 1.3986318954113088, + "learning_rate": 8.498748278354912e-07, + "loss": 0.2839, + "step": 28179 + }, + { + "epoch": 0.82, + "grad_norm": 3.1123348385845486, + "learning_rate": 8.496128779152679e-07, + "loss": 0.275, + "step": 28180 + }, + { + "epoch": 0.82, + "grad_norm": 1.213576515712331, + "learning_rate": 8.493509646222575e-07, + "loss": 0.2693, + "step": 28181 + }, + { + "epoch": 0.82, + "grad_norm": 1.3950192340383056, + "learning_rate": 8.49089087958771e-07, + "loss": 0.3044, + "step": 28182 + }, + { + "epoch": 0.82, + "grad_norm": 1.255236642704636, + "learning_rate": 8.488272479271193e-07, + "loss": 0.2724, + "step": 28183 + }, + { + "epoch": 0.82, + "grad_norm": 0.997279925767156, + "learning_rate": 8.485654445296127e-07, + "loss": 0.5844, + "step": 28184 + }, + { + "epoch": 0.82, + "grad_norm": 0.9429555634018805, + "learning_rate": 8.483036777685627e-07, + "loss": 0.5475, + "step": 28185 + }, + { + "epoch": 0.82, + "grad_norm": 1.3917591992686258, + "learning_rate": 8.480419476462798e-07, + "loss": 0.2723, + "step": 28186 + }, + { + "epoch": 0.82, + "grad_norm": 1.3376368761501938, + "learning_rate": 8.477802541650715e-07, + "loss": 0.2622, + "step": 28187 + }, + { + "epoch": 0.82, + "grad_norm": 1.3269806209985395, + "learning_rate": 8.475185973272482e-07, + "loss": 0.286, + "step": 28188 + }, + { + "epoch": 0.82, + "grad_norm": 1.568225046317055, + "learning_rate": 8.472569771351197e-07, + "loss": 0.2687, + "step": 28189 + }, + { + "epoch": 0.82, + "grad_norm": 1.4466024504830428, + "learning_rate": 8.469953935909947e-07, + "loss": 0.2726, + "step": 28190 + }, + { + "epoch": 0.82, + "grad_norm": 1.4370140371833509, + "learning_rate": 8.467338466971808e-07, + "loss": 0.2732, + "step": 28191 + }, + { + "epoch": 0.82, + "grad_norm": 1.412169777413772, + "learning_rate": 8.464723364559874e-07, + "loss": 0.2904, + "step": 28192 + }, + { + "epoch": 0.82, + "grad_norm": 1.6603934478918096, + "learning_rate": 8.462108628697213e-07, + "loss": 0.2804, + "step": 28193 + }, + { + "epoch": 0.82, + "grad_norm": 1.2684082836946946, + "learning_rate": 8.459494259406908e-07, + "loss": 0.2688, + "step": 28194 + }, + { + "epoch": 0.82, + "grad_norm": 1.4524273338348292, + "learning_rate": 8.456880256712036e-07, + "loss": 0.2877, + "step": 28195 + }, + { + "epoch": 0.82, + "grad_norm": 1.5941199765848006, + "learning_rate": 8.454266620635642e-07, + "loss": 0.2972, + "step": 28196 + }, + { + "epoch": 0.82, + "grad_norm": 1.3423554106288254, + "learning_rate": 8.451653351200823e-07, + "loss": 0.2783, + "step": 28197 + }, + { + "epoch": 0.82, + "grad_norm": 1.4778371367466143, + "learning_rate": 8.449040448430612e-07, + "loss": 0.2725, + "step": 28198 + }, + { + "epoch": 0.82, + "grad_norm": 1.5137696763446777, + "learning_rate": 8.446427912348076e-07, + "loss": 0.2709, + "step": 28199 + }, + { + "epoch": 0.82, + "grad_norm": 1.4260468373125088, + "learning_rate": 8.44381574297628e-07, + "loss": 0.294, + "step": 28200 + }, + { + "epoch": 0.82, + "grad_norm": 1.860737492246697, + "learning_rate": 8.441203940338266e-07, + "loss": 0.2874, + "step": 28201 + }, + { + "epoch": 0.82, + "grad_norm": 2.1082780044875657, + "learning_rate": 8.438592504457088e-07, + "loss": 0.2731, + "step": 28202 + }, + { + "epoch": 0.82, + "grad_norm": 1.391653427084373, + "learning_rate": 8.435981435355795e-07, + "loss": 0.2692, + "step": 28203 + }, + { + "epoch": 0.82, + "grad_norm": 1.4777121629623384, + "learning_rate": 8.433370733057427e-07, + "loss": 0.2608, + "step": 28204 + }, + { + "epoch": 0.82, + "grad_norm": 2.013660464196453, + "learning_rate": 8.430760397585036e-07, + "loss": 0.2558, + "step": 28205 + }, + { + "epoch": 0.82, + "grad_norm": 1.4028791062129116, + "learning_rate": 8.428150428961629e-07, + "loss": 0.2869, + "step": 28206 + }, + { + "epoch": 0.82, + "grad_norm": 1.2813793979735542, + "learning_rate": 8.42554082721026e-07, + "loss": 0.2823, + "step": 28207 + }, + { + "epoch": 0.82, + "grad_norm": 1.3260132365808615, + "learning_rate": 8.422931592353956e-07, + "loss": 0.2943, + "step": 28208 + }, + { + "epoch": 0.82, + "grad_norm": 1.8861282778911517, + "learning_rate": 8.420322724415736e-07, + "loss": 0.2772, + "step": 28209 + }, + { + "epoch": 0.82, + "grad_norm": 6.620051944188786, + "learning_rate": 8.417714223418633e-07, + "loss": 0.2725, + "step": 28210 + }, + { + "epoch": 0.82, + "grad_norm": 1.7050852711729727, + "learning_rate": 8.41510608938566e-07, + "loss": 0.2792, + "step": 28211 + }, + { + "epoch": 0.82, + "grad_norm": 1.5504006555955678, + "learning_rate": 8.412498322339841e-07, + "loss": 0.2617, + "step": 28212 + }, + { + "epoch": 0.82, + "grad_norm": 1.3513001352414726, + "learning_rate": 8.409890922304187e-07, + "loss": 0.2756, + "step": 28213 + }, + { + "epoch": 0.82, + "grad_norm": 1.2961781756566813, + "learning_rate": 8.407283889301715e-07, + "loss": 0.274, + "step": 28214 + }, + { + "epoch": 0.82, + "grad_norm": 1.6347732314923746, + "learning_rate": 8.404677223355411e-07, + "loss": 0.2754, + "step": 28215 + }, + { + "epoch": 0.82, + "grad_norm": 1.307654224689464, + "learning_rate": 8.402070924488299e-07, + "loss": 0.2711, + "step": 28216 + }, + { + "epoch": 0.82, + "grad_norm": 1.5001887359597943, + "learning_rate": 8.399464992723367e-07, + "loss": 0.273, + "step": 28217 + }, + { + "epoch": 0.82, + "grad_norm": 1.5815851309474267, + "learning_rate": 8.39685942808362e-07, + "loss": 0.2883, + "step": 28218 + }, + { + "epoch": 0.82, + "grad_norm": 1.2639015675482712, + "learning_rate": 8.394254230592047e-07, + "loss": 0.2706, + "step": 28219 + }, + { + "epoch": 0.82, + "grad_norm": 2.7124709072516096, + "learning_rate": 8.391649400271645e-07, + "loss": 0.3132, + "step": 28220 + }, + { + "epoch": 0.82, + "grad_norm": 0.9905172402628154, + "learning_rate": 8.389044937145397e-07, + "loss": 0.5908, + "step": 28221 + }, + { + "epoch": 0.82, + "grad_norm": 1.3473365121858183, + "learning_rate": 8.386440841236304e-07, + "loss": 0.2861, + "step": 28222 + }, + { + "epoch": 0.82, + "grad_norm": 1.3114903878944524, + "learning_rate": 8.383837112567316e-07, + "loss": 0.2849, + "step": 28223 + }, + { + "epoch": 0.82, + "grad_norm": 1.4742099716698198, + "learning_rate": 8.381233751161427e-07, + "loss": 0.2633, + "step": 28224 + }, + { + "epoch": 0.82, + "grad_norm": 1.2208705015760084, + "learning_rate": 8.378630757041628e-07, + "loss": 0.2476, + "step": 28225 + }, + { + "epoch": 0.82, + "grad_norm": 1.467790104818003, + "learning_rate": 8.376028130230862e-07, + "loss": 0.2828, + "step": 28226 + }, + { + "epoch": 0.82, + "grad_norm": 1.2794276364020425, + "learning_rate": 8.373425870752105e-07, + "loss": 0.254, + "step": 28227 + }, + { + "epoch": 0.82, + "grad_norm": 1.6239346128007155, + "learning_rate": 8.370823978628329e-07, + "loss": 0.2939, + "step": 28228 + }, + { + "epoch": 0.82, + "grad_norm": 1.5071574451118686, + "learning_rate": 8.368222453882496e-07, + "loss": 0.2549, + "step": 28229 + }, + { + "epoch": 0.82, + "grad_norm": 1.4232026455671687, + "learning_rate": 8.365621296537557e-07, + "loss": 0.2706, + "step": 28230 + }, + { + "epoch": 0.82, + "grad_norm": 1.3386495642956064, + "learning_rate": 8.363020506616476e-07, + "loss": 0.2812, + "step": 28231 + }, + { + "epoch": 0.82, + "grad_norm": 1.2815191981166099, + "learning_rate": 8.360420084142196e-07, + "loss": 0.3149, + "step": 28232 + }, + { + "epoch": 0.82, + "grad_norm": 2.8124632335493507, + "learning_rate": 8.357820029137686e-07, + "loss": 0.2642, + "step": 28233 + }, + { + "epoch": 0.82, + "grad_norm": 1.4280643298156583, + "learning_rate": 8.355220341625864e-07, + "loss": 0.274, + "step": 28234 + }, + { + "epoch": 0.82, + "grad_norm": 1.8591057695307673, + "learning_rate": 8.352621021629686e-07, + "loss": 0.2672, + "step": 28235 + }, + { + "epoch": 0.82, + "grad_norm": 1.8960282096215688, + "learning_rate": 8.350022069172087e-07, + "loss": 0.2598, + "step": 28236 + }, + { + "epoch": 0.82, + "grad_norm": 1.298788923776946, + "learning_rate": 8.347423484276007e-07, + "loss": 0.2916, + "step": 28237 + }, + { + "epoch": 0.82, + "grad_norm": 1.3887700321115959, + "learning_rate": 8.344825266964373e-07, + "loss": 0.2642, + "step": 28238 + }, + { + "epoch": 0.82, + "grad_norm": 1.546682755593708, + "learning_rate": 8.342227417260124e-07, + "loss": 0.2711, + "step": 28239 + }, + { + "epoch": 0.82, + "grad_norm": 1.4824964728626675, + "learning_rate": 8.339629935186172e-07, + "loss": 0.2661, + "step": 28240 + }, + { + "epoch": 0.82, + "grad_norm": 1.2942673599735444, + "learning_rate": 8.337032820765467e-07, + "loss": 0.2934, + "step": 28241 + }, + { + "epoch": 0.82, + "grad_norm": 2.1978502777665767, + "learning_rate": 8.334436074020896e-07, + "loss": 0.2617, + "step": 28242 + }, + { + "epoch": 0.82, + "grad_norm": 1.2994805561092888, + "learning_rate": 8.331839694975391e-07, + "loss": 0.2528, + "step": 28243 + }, + { + "epoch": 0.82, + "grad_norm": 1.406629583910961, + "learning_rate": 8.329243683651861e-07, + "loss": 0.2559, + "step": 28244 + }, + { + "epoch": 0.82, + "grad_norm": 1.2628503896239505, + "learning_rate": 8.326648040073215e-07, + "loss": 0.3113, + "step": 28245 + }, + { + "epoch": 0.82, + "grad_norm": 1.3243275444851097, + "learning_rate": 8.324052764262363e-07, + "loss": 0.274, + "step": 28246 + }, + { + "epoch": 0.82, + "grad_norm": 1.42710501009347, + "learning_rate": 8.32145785624221e-07, + "loss": 0.2986, + "step": 28247 + }, + { + "epoch": 0.82, + "grad_norm": 1.7416603273509728, + "learning_rate": 8.318863316035653e-07, + "loss": 0.2836, + "step": 28248 + }, + { + "epoch": 0.82, + "grad_norm": 1.470368927618063, + "learning_rate": 8.316269143665589e-07, + "loss": 0.2938, + "step": 28249 + }, + { + "epoch": 0.82, + "grad_norm": 1.4287141719257377, + "learning_rate": 8.313675339154925e-07, + "loss": 0.2871, + "step": 28250 + }, + { + "epoch": 0.82, + "grad_norm": 1.3041121849031405, + "learning_rate": 8.311081902526524e-07, + "loss": 0.2811, + "step": 28251 + }, + { + "epoch": 0.82, + "grad_norm": 1.365959423093743, + "learning_rate": 8.30848883380329e-07, + "loss": 0.3009, + "step": 28252 + }, + { + "epoch": 0.82, + "grad_norm": 1.5784217343114069, + "learning_rate": 8.305896133008117e-07, + "loss": 0.2657, + "step": 28253 + }, + { + "epoch": 0.82, + "grad_norm": 1.4110171116402788, + "learning_rate": 8.303303800163865e-07, + "loss": 0.2893, + "step": 28254 + }, + { + "epoch": 0.82, + "grad_norm": 1.3784959904218061, + "learning_rate": 8.300711835293412e-07, + "loss": 0.282, + "step": 28255 + }, + { + "epoch": 0.82, + "grad_norm": 1.281951189705349, + "learning_rate": 8.298120238419644e-07, + "loss": 0.2819, + "step": 28256 + }, + { + "epoch": 0.82, + "grad_norm": 1.6679618624333477, + "learning_rate": 8.295529009565429e-07, + "loss": 0.2814, + "step": 28257 + }, + { + "epoch": 0.82, + "grad_norm": 1.4017271002476446, + "learning_rate": 8.292938148753627e-07, + "loss": 0.28, + "step": 28258 + }, + { + "epoch": 0.82, + "grad_norm": 1.3534741293440113, + "learning_rate": 8.290347656007114e-07, + "loss": 0.2725, + "step": 28259 + }, + { + "epoch": 0.82, + "grad_norm": 1.772267680359492, + "learning_rate": 8.287757531348755e-07, + "loss": 0.2787, + "step": 28260 + }, + { + "epoch": 0.82, + "grad_norm": 1.4334397822580789, + "learning_rate": 8.285167774801389e-07, + "loss": 0.2478, + "step": 28261 + }, + { + "epoch": 0.82, + "grad_norm": 1.5976179372461843, + "learning_rate": 8.282578386387879e-07, + "loss": 0.2529, + "step": 28262 + }, + { + "epoch": 0.82, + "grad_norm": 1.7570434145828893, + "learning_rate": 8.279989366131074e-07, + "loss": 0.256, + "step": 28263 + }, + { + "epoch": 0.82, + "grad_norm": 1.4281994166441787, + "learning_rate": 8.277400714053829e-07, + "loss": 0.2625, + "step": 28264 + }, + { + "epoch": 0.82, + "grad_norm": 1.3913402362004368, + "learning_rate": 8.274812430178986e-07, + "loss": 0.2997, + "step": 28265 + }, + { + "epoch": 0.82, + "grad_norm": 1.4962710406394524, + "learning_rate": 8.272224514529387e-07, + "loss": 0.3247, + "step": 28266 + }, + { + "epoch": 0.82, + "grad_norm": 1.497946065698883, + "learning_rate": 8.269636967127864e-07, + "loss": 0.2612, + "step": 28267 + }, + { + "epoch": 0.82, + "grad_norm": 1.5519266892504948, + "learning_rate": 8.267049787997261e-07, + "loss": 0.2502, + "step": 28268 + }, + { + "epoch": 0.82, + "grad_norm": 1.4437389461097652, + "learning_rate": 8.26446297716042e-07, + "loss": 0.2697, + "step": 28269 + }, + { + "epoch": 0.82, + "grad_norm": 1.2905447940635841, + "learning_rate": 8.261876534640145e-07, + "loss": 0.2795, + "step": 28270 + }, + { + "epoch": 0.82, + "grad_norm": 1.5470450458923788, + "learning_rate": 8.259290460459268e-07, + "loss": 0.278, + "step": 28271 + }, + { + "epoch": 0.82, + "grad_norm": 1.3286041551110668, + "learning_rate": 8.256704754640621e-07, + "loss": 0.2583, + "step": 28272 + }, + { + "epoch": 0.82, + "grad_norm": 1.3720146989796578, + "learning_rate": 8.254119417207019e-07, + "loss": 0.2805, + "step": 28273 + }, + { + "epoch": 0.82, + "grad_norm": 1.3684351381603315, + "learning_rate": 8.251534448181269e-07, + "loss": 0.318, + "step": 28274 + }, + { + "epoch": 0.82, + "grad_norm": 1.4824338630799645, + "learning_rate": 8.248949847586196e-07, + "loss": 0.2683, + "step": 28275 + }, + { + "epoch": 0.82, + "grad_norm": 1.0148331265818904, + "learning_rate": 8.246365615444601e-07, + "loss": 0.5616, + "step": 28276 + }, + { + "epoch": 0.82, + "grad_norm": 1.5366380738127747, + "learning_rate": 8.243781751779306e-07, + "loss": 0.2759, + "step": 28277 + }, + { + "epoch": 0.82, + "grad_norm": 1.4090468807422178, + "learning_rate": 8.241198256613087e-07, + "loss": 0.2759, + "step": 28278 + }, + { + "epoch": 0.82, + "grad_norm": 1.3053868300110456, + "learning_rate": 8.238615129968758e-07, + "loss": 0.2678, + "step": 28279 + }, + { + "epoch": 0.82, + "grad_norm": 1.2655415936507088, + "learning_rate": 8.236032371869113e-07, + "loss": 0.2627, + "step": 28280 + }, + { + "epoch": 0.82, + "grad_norm": 1.4036596339297767, + "learning_rate": 8.233449982336955e-07, + "loss": 0.2783, + "step": 28281 + }, + { + "epoch": 0.82, + "grad_norm": 2.244775931458665, + "learning_rate": 8.230867961395056e-07, + "loss": 0.2697, + "step": 28282 + }, + { + "epoch": 0.82, + "grad_norm": 1.3828555711751347, + "learning_rate": 8.228286309066214e-07, + "loss": 0.2815, + "step": 28283 + }, + { + "epoch": 0.82, + "grad_norm": 1.2139252765055601, + "learning_rate": 8.225705025373204e-07, + "loss": 0.246, + "step": 28284 + }, + { + "epoch": 0.82, + "grad_norm": 1.3677969487799666, + "learning_rate": 8.22312411033881e-07, + "loss": 0.2709, + "step": 28285 + }, + { + "epoch": 0.82, + "grad_norm": 1.4255001179566282, + "learning_rate": 8.220543563985812e-07, + "loss": 0.2763, + "step": 28286 + }, + { + "epoch": 0.82, + "grad_norm": 1.0283734088063423, + "learning_rate": 8.217963386336975e-07, + "loss": 0.5564, + "step": 28287 + }, + { + "epoch": 0.82, + "grad_norm": 2.264957162446634, + "learning_rate": 8.215383577415093e-07, + "loss": 0.2949, + "step": 28288 + }, + { + "epoch": 0.82, + "grad_norm": 1.4452634878304598, + "learning_rate": 8.212804137242897e-07, + "loss": 0.2927, + "step": 28289 + }, + { + "epoch": 0.82, + "grad_norm": 1.0105642678405922, + "learning_rate": 8.21022506584317e-07, + "loss": 0.5618, + "step": 28290 + }, + { + "epoch": 0.82, + "grad_norm": 1.4668856564257258, + "learning_rate": 8.207646363238675e-07, + "loss": 0.2718, + "step": 28291 + }, + { + "epoch": 0.82, + "grad_norm": 1.4046267289656187, + "learning_rate": 8.205068029452156e-07, + "loss": 0.2788, + "step": 28292 + }, + { + "epoch": 0.82, + "grad_norm": 1.3282641530083847, + "learning_rate": 8.202490064506386e-07, + "loss": 0.2614, + "step": 28293 + }, + { + "epoch": 0.82, + "grad_norm": 1.3775778474615414, + "learning_rate": 8.199912468424098e-07, + "loss": 0.286, + "step": 28294 + }, + { + "epoch": 0.82, + "grad_norm": 1.347308638180694, + "learning_rate": 8.197335241228043e-07, + "loss": 0.2686, + "step": 28295 + }, + { + "epoch": 0.82, + "grad_norm": 1.2637543010564438, + "learning_rate": 8.194758382940987e-07, + "loss": 0.2803, + "step": 28296 + }, + { + "epoch": 0.82, + "grad_norm": 1.4108723023829697, + "learning_rate": 8.192181893585638e-07, + "loss": 0.2772, + "step": 28297 + }, + { + "epoch": 0.82, + "grad_norm": 1.3093254627458464, + "learning_rate": 8.189605773184745e-07, + "loss": 0.2611, + "step": 28298 + }, + { + "epoch": 0.82, + "grad_norm": 1.6965216881591434, + "learning_rate": 8.187030021761045e-07, + "loss": 0.2639, + "step": 28299 + }, + { + "epoch": 0.82, + "grad_norm": 1.6546420107413748, + "learning_rate": 8.184454639337274e-07, + "loss": 0.2577, + "step": 28300 + }, + { + "epoch": 0.82, + "grad_norm": 1.5381855208062898, + "learning_rate": 8.181879625936151e-07, + "loss": 0.2818, + "step": 28301 + }, + { + "epoch": 0.82, + "grad_norm": 1.405376570662909, + "learning_rate": 8.179304981580405e-07, + "loss": 0.2895, + "step": 28302 + }, + { + "epoch": 0.82, + "grad_norm": 1.274615317500965, + "learning_rate": 8.176730706292751e-07, + "loss": 0.2575, + "step": 28303 + }, + { + "epoch": 0.82, + "grad_norm": 1.2714394988358921, + "learning_rate": 8.174156800095922e-07, + "loss": 0.2694, + "step": 28304 + }, + { + "epoch": 0.82, + "grad_norm": 1.422238738403518, + "learning_rate": 8.171583263012628e-07, + "loss": 0.3261, + "step": 28305 + }, + { + "epoch": 0.82, + "grad_norm": 1.7873360824835305, + "learning_rate": 8.169010095065566e-07, + "loss": 0.2871, + "step": 28306 + }, + { + "epoch": 0.82, + "grad_norm": 3.39779230697742, + "learning_rate": 8.166437296277457e-07, + "loss": 0.2696, + "step": 28307 + }, + { + "epoch": 0.82, + "grad_norm": 1.7243115868561372, + "learning_rate": 8.163864866671001e-07, + "loss": 0.2709, + "step": 28308 + }, + { + "epoch": 0.82, + "grad_norm": 1.3982379216176595, + "learning_rate": 8.161292806268911e-07, + "loss": 0.2838, + "step": 28309 + }, + { + "epoch": 0.82, + "grad_norm": 1.3177367980494696, + "learning_rate": 8.158721115093865e-07, + "loss": 0.2629, + "step": 28310 + }, + { + "epoch": 0.82, + "grad_norm": 2.105147040262093, + "learning_rate": 8.156149793168572e-07, + "loss": 0.2702, + "step": 28311 + }, + { + "epoch": 0.82, + "grad_norm": 1.3985074815442569, + "learning_rate": 8.153578840515719e-07, + "loss": 0.2757, + "step": 28312 + }, + { + "epoch": 0.82, + "grad_norm": 1.4919514027145204, + "learning_rate": 8.151008257158e-07, + "loss": 0.2866, + "step": 28313 + }, + { + "epoch": 0.82, + "grad_norm": 1.544572493770915, + "learning_rate": 8.148438043118095e-07, + "loss": 0.2792, + "step": 28314 + }, + { + "epoch": 0.82, + "grad_norm": 1.5234338805393473, + "learning_rate": 8.145868198418699e-07, + "loss": 0.275, + "step": 28315 + }, + { + "epoch": 0.82, + "grad_norm": 2.5782633891205786, + "learning_rate": 8.143298723082471e-07, + "loss": 0.2693, + "step": 28316 + }, + { + "epoch": 0.82, + "grad_norm": 1.388498332946967, + "learning_rate": 8.140729617132093e-07, + "loss": 0.2896, + "step": 28317 + }, + { + "epoch": 0.82, + "grad_norm": 1.419572738193601, + "learning_rate": 8.138160880590246e-07, + "loss": 0.2774, + "step": 28318 + }, + { + "epoch": 0.82, + "grad_norm": 1.221164515680625, + "learning_rate": 8.13559251347959e-07, + "loss": 0.2667, + "step": 28319 + }, + { + "epoch": 0.82, + "grad_norm": 1.3211031493359895, + "learning_rate": 8.13302451582279e-07, + "loss": 0.2682, + "step": 28320 + }, + { + "epoch": 0.82, + "grad_norm": 1.7907186014053307, + "learning_rate": 8.130456887642518e-07, + "loss": 0.2455, + "step": 28321 + }, + { + "epoch": 0.82, + "grad_norm": 1.3350777015799586, + "learning_rate": 8.127889628961432e-07, + "loss": 0.26, + "step": 28322 + }, + { + "epoch": 0.82, + "grad_norm": 1.254517137570431, + "learning_rate": 8.125322739802177e-07, + "loss": 0.26, + "step": 28323 + }, + { + "epoch": 0.82, + "grad_norm": 1.4389255405339931, + "learning_rate": 8.122756220187434e-07, + "loss": 0.2722, + "step": 28324 + }, + { + "epoch": 0.82, + "grad_norm": 1.3618682437891811, + "learning_rate": 8.120190070139811e-07, + "loss": 0.2937, + "step": 28325 + }, + { + "epoch": 0.82, + "grad_norm": 1.3605764291885933, + "learning_rate": 8.11762428968198e-07, + "loss": 0.2811, + "step": 28326 + }, + { + "epoch": 0.82, + "grad_norm": 1.3925171063043786, + "learning_rate": 8.11505887883658e-07, + "loss": 0.2618, + "step": 28327 + }, + { + "epoch": 0.82, + "grad_norm": 1.7148524753179155, + "learning_rate": 8.112493837626251e-07, + "loss": 0.2703, + "step": 28328 + }, + { + "epoch": 0.82, + "grad_norm": 1.402443456508065, + "learning_rate": 8.109929166073621e-07, + "loss": 0.2734, + "step": 28329 + }, + { + "epoch": 0.82, + "grad_norm": 1.5151873785656966, + "learning_rate": 8.107364864201339e-07, + "loss": 0.3026, + "step": 28330 + }, + { + "epoch": 0.82, + "grad_norm": 1.315100998749257, + "learning_rate": 8.104800932032026e-07, + "loss": 0.2441, + "step": 28331 + }, + { + "epoch": 0.82, + "grad_norm": 1.4218629121211026, + "learning_rate": 8.102237369588317e-07, + "loss": 0.2454, + "step": 28332 + }, + { + "epoch": 0.82, + "grad_norm": 1.174072421965538, + "learning_rate": 8.099674176892819e-07, + "loss": 0.2621, + "step": 28333 + }, + { + "epoch": 0.82, + "grad_norm": 1.3741194580406881, + "learning_rate": 8.097111353968162e-07, + "loss": 0.2864, + "step": 28334 + }, + { + "epoch": 0.82, + "grad_norm": 1.6967443116631826, + "learning_rate": 8.094548900836957e-07, + "loss": 0.2827, + "step": 28335 + }, + { + "epoch": 0.82, + "grad_norm": 1.3702206276163067, + "learning_rate": 8.091986817521829e-07, + "loss": 0.2747, + "step": 28336 + }, + { + "epoch": 0.82, + "grad_norm": 1.2604250316669958, + "learning_rate": 8.08942510404539e-07, + "loss": 0.2635, + "step": 28337 + }, + { + "epoch": 0.82, + "grad_norm": 1.1963514942142983, + "learning_rate": 8.08686376043023e-07, + "loss": 0.2549, + "step": 28338 + }, + { + "epoch": 0.82, + "grad_norm": 1.8609407339181698, + "learning_rate": 8.084302786698961e-07, + "loss": 0.2434, + "step": 28339 + }, + { + "epoch": 0.82, + "grad_norm": 1.5051863266091408, + "learning_rate": 8.081742182874186e-07, + "loss": 0.304, + "step": 28340 + }, + { + "epoch": 0.82, + "grad_norm": 1.2942668594686062, + "learning_rate": 8.079181948978498e-07, + "loss": 0.2731, + "step": 28341 + }, + { + "epoch": 0.82, + "grad_norm": 1.188574871207862, + "learning_rate": 8.076622085034496e-07, + "loss": 0.27, + "step": 28342 + }, + { + "epoch": 0.82, + "grad_norm": 1.3176253071349542, + "learning_rate": 8.074062591064785e-07, + "loss": 0.2784, + "step": 28343 + }, + { + "epoch": 0.82, + "grad_norm": 1.457622038774201, + "learning_rate": 8.071503467091924e-07, + "loss": 0.2667, + "step": 28344 + }, + { + "epoch": 0.82, + "grad_norm": 1.3990947722410123, + "learning_rate": 8.068944713138505e-07, + "loss": 0.3031, + "step": 28345 + }, + { + "epoch": 0.82, + "grad_norm": 1.3600042435633446, + "learning_rate": 8.066386329227116e-07, + "loss": 0.2859, + "step": 28346 + }, + { + "epoch": 0.82, + "grad_norm": 1.3671858609269358, + "learning_rate": 8.063828315380334e-07, + "loss": 0.2963, + "step": 28347 + }, + { + "epoch": 0.82, + "grad_norm": 1.3572918378800518, + "learning_rate": 8.061270671620735e-07, + "loss": 0.271, + "step": 28348 + }, + { + "epoch": 0.82, + "grad_norm": 1.4952249830809183, + "learning_rate": 8.058713397970885e-07, + "loss": 0.287, + "step": 28349 + }, + { + "epoch": 0.82, + "grad_norm": 1.8558107278539535, + "learning_rate": 8.056156494453354e-07, + "loss": 0.2591, + "step": 28350 + }, + { + "epoch": 0.82, + "grad_norm": 1.399935245686407, + "learning_rate": 8.053599961090719e-07, + "loss": 0.288, + "step": 28351 + }, + { + "epoch": 0.82, + "grad_norm": 1.5293061639392014, + "learning_rate": 8.051043797905517e-07, + "loss": 0.2818, + "step": 28352 + }, + { + "epoch": 0.82, + "grad_norm": 1.4233894219682046, + "learning_rate": 8.048488004920318e-07, + "loss": 0.2943, + "step": 28353 + }, + { + "epoch": 0.82, + "grad_norm": 1.376544156642729, + "learning_rate": 8.045932582157684e-07, + "loss": 0.2744, + "step": 28354 + }, + { + "epoch": 0.82, + "grad_norm": 1.3759437960208178, + "learning_rate": 8.043377529640156e-07, + "loss": 0.2866, + "step": 28355 + }, + { + "epoch": 0.82, + "grad_norm": 1.3053476981564611, + "learning_rate": 8.040822847390284e-07, + "loss": 0.2819, + "step": 28356 + }, + { + "epoch": 0.82, + "grad_norm": 1.3563638309711303, + "learning_rate": 8.038268535430621e-07, + "loss": 0.2723, + "step": 28357 + }, + { + "epoch": 0.82, + "grad_norm": 1.4402398953437454, + "learning_rate": 8.0357145937837e-07, + "loss": 0.2705, + "step": 28358 + }, + { + "epoch": 0.82, + "grad_norm": 1.3553001925235035, + "learning_rate": 8.033161022472063e-07, + "loss": 0.2701, + "step": 28359 + }, + { + "epoch": 0.82, + "grad_norm": 1.451040308265323, + "learning_rate": 8.03060782151826e-07, + "loss": 0.2866, + "step": 28360 + }, + { + "epoch": 0.82, + "grad_norm": 1.43130821105329, + "learning_rate": 8.028054990944794e-07, + "loss": 0.2737, + "step": 28361 + }, + { + "epoch": 0.82, + "grad_norm": 1.3613994426874898, + "learning_rate": 8.025502530774209e-07, + "loss": 0.2479, + "step": 28362 + }, + { + "epoch": 0.82, + "grad_norm": 1.316772577783553, + "learning_rate": 8.022950441029031e-07, + "loss": 0.2623, + "step": 28363 + }, + { + "epoch": 0.82, + "grad_norm": 1.2577306533139765, + "learning_rate": 8.020398721731781e-07, + "loss": 0.266, + "step": 28364 + }, + { + "epoch": 0.82, + "grad_norm": 1.2500995858785708, + "learning_rate": 8.017847372904991e-07, + "loss": 0.288, + "step": 28365 + }, + { + "epoch": 0.82, + "grad_norm": 1.4093198645457836, + "learning_rate": 8.015296394571153e-07, + "loss": 0.2725, + "step": 28366 + }, + { + "epoch": 0.82, + "grad_norm": 1.9807259144446168, + "learning_rate": 8.012745786752784e-07, + "loss": 0.2583, + "step": 28367 + }, + { + "epoch": 0.82, + "grad_norm": 1.265643583897082, + "learning_rate": 8.010195549472405e-07, + "loss": 0.2784, + "step": 28368 + }, + { + "epoch": 0.82, + "grad_norm": 1.3293443618942953, + "learning_rate": 8.007645682752513e-07, + "loss": 0.2679, + "step": 28369 + }, + { + "epoch": 0.82, + "grad_norm": 1.490496200729344, + "learning_rate": 8.00509618661563e-07, + "loss": 0.283, + "step": 28370 + }, + { + "epoch": 0.82, + "grad_norm": 1.452644078991839, + "learning_rate": 8.00254706108422e-07, + "loss": 0.2963, + "step": 28371 + }, + { + "epoch": 0.82, + "grad_norm": 1.6213581646621065, + "learning_rate": 7.9999983061808e-07, + "loss": 0.2618, + "step": 28372 + }, + { + "epoch": 0.82, + "grad_norm": 1.231973559970065, + "learning_rate": 7.997449921927864e-07, + "loss": 0.2443, + "step": 28373 + }, + { + "epoch": 0.82, + "grad_norm": 1.24988432835364, + "learning_rate": 7.994901908347896e-07, + "loss": 0.2486, + "step": 28374 + }, + { + "epoch": 0.82, + "grad_norm": 1.4900273173558594, + "learning_rate": 7.992354265463387e-07, + "loss": 0.2738, + "step": 28375 + }, + { + "epoch": 0.82, + "grad_norm": 1.2724739322777259, + "learning_rate": 7.989806993296817e-07, + "loss": 0.2701, + "step": 28376 + }, + { + "epoch": 0.82, + "grad_norm": 1.35115943404155, + "learning_rate": 7.987260091870664e-07, + "loss": 0.2881, + "step": 28377 + }, + { + "epoch": 0.82, + "grad_norm": 1.5540464689233575, + "learning_rate": 7.984713561207419e-07, + "loss": 0.2887, + "step": 28378 + }, + { + "epoch": 0.82, + "grad_norm": 1.2867600765582405, + "learning_rate": 7.982167401329532e-07, + "loss": 0.2685, + "step": 28379 + }, + { + "epoch": 0.82, + "grad_norm": 1.3769894567084329, + "learning_rate": 7.979621612259486e-07, + "loss": 0.2775, + "step": 28380 + }, + { + "epoch": 0.82, + "grad_norm": 1.290408569537294, + "learning_rate": 7.977076194019745e-07, + "loss": 0.2477, + "step": 28381 + }, + { + "epoch": 0.82, + "grad_norm": 1.2547985431824937, + "learning_rate": 7.974531146632769e-07, + "loss": 0.268, + "step": 28382 + }, + { + "epoch": 0.82, + "grad_norm": 1.4982835690362395, + "learning_rate": 7.971986470121023e-07, + "loss": 0.2578, + "step": 28383 + }, + { + "epoch": 0.82, + "grad_norm": 1.3805868696379617, + "learning_rate": 7.969442164506963e-07, + "loss": 0.2632, + "step": 28384 + }, + { + "epoch": 0.82, + "grad_norm": 1.3322284352425104, + "learning_rate": 7.966898229813047e-07, + "loss": 0.2556, + "step": 28385 + }, + { + "epoch": 0.82, + "grad_norm": 1.2756943558065823, + "learning_rate": 7.964354666061713e-07, + "loss": 0.2871, + "step": 28386 + }, + { + "epoch": 0.82, + "grad_norm": 2.2171187397390124, + "learning_rate": 7.961811473275433e-07, + "loss": 0.3009, + "step": 28387 + }, + { + "epoch": 0.82, + "grad_norm": 1.3486687530884494, + "learning_rate": 7.959268651476615e-07, + "loss": 0.2689, + "step": 28388 + }, + { + "epoch": 0.82, + "grad_norm": 1.6076330346888252, + "learning_rate": 7.956726200687726e-07, + "loss": 0.2751, + "step": 28389 + }, + { + "epoch": 0.82, + "grad_norm": 1.257973381012157, + "learning_rate": 7.954184120931186e-07, + "loss": 0.2617, + "step": 28390 + }, + { + "epoch": 0.82, + "grad_norm": 1.3066443272776829, + "learning_rate": 7.951642412229444e-07, + "loss": 0.2833, + "step": 28391 + }, + { + "epoch": 0.82, + "grad_norm": 1.307334036603418, + "learning_rate": 7.94910107460492e-07, + "loss": 0.2678, + "step": 28392 + }, + { + "epoch": 0.82, + "grad_norm": 1.2478040959867593, + "learning_rate": 7.946560108080059e-07, + "loss": 0.2708, + "step": 28393 + }, + { + "epoch": 0.82, + "grad_norm": 0.8745285505998488, + "learning_rate": 7.944019512677259e-07, + "loss": 0.5726, + "step": 28394 + }, + { + "epoch": 0.82, + "grad_norm": 1.3368671014539102, + "learning_rate": 7.941479288418957e-07, + "loss": 0.2645, + "step": 28395 + }, + { + "epoch": 0.82, + "grad_norm": 1.3595875646516655, + "learning_rate": 7.938939435327564e-07, + "loss": 0.2621, + "step": 28396 + }, + { + "epoch": 0.82, + "grad_norm": 1.4096291299091281, + "learning_rate": 7.936399953425505e-07, + "loss": 0.2786, + "step": 28397 + }, + { + "epoch": 0.82, + "grad_norm": 1.3517743516118754, + "learning_rate": 7.933860842735174e-07, + "loss": 0.2938, + "step": 28398 + }, + { + "epoch": 0.82, + "grad_norm": 1.4273973622271285, + "learning_rate": 7.931322103278993e-07, + "loss": 0.2795, + "step": 28399 + }, + { + "epoch": 0.82, + "grad_norm": 1.495803844594475, + "learning_rate": 7.928783735079354e-07, + "loss": 0.2794, + "step": 28400 + }, + { + "epoch": 0.82, + "grad_norm": 1.3763617289545573, + "learning_rate": 7.926245738158666e-07, + "loss": 0.2689, + "step": 28401 + }, + { + "epoch": 0.82, + "grad_norm": 1.236119715331576, + "learning_rate": 7.92370811253933e-07, + "loss": 0.264, + "step": 28402 + }, + { + "epoch": 0.82, + "grad_norm": 1.2986395964747104, + "learning_rate": 7.921170858243737e-07, + "loss": 0.2786, + "step": 28403 + }, + { + "epoch": 0.82, + "grad_norm": 1.4662830231637207, + "learning_rate": 7.918633975294277e-07, + "loss": 0.2667, + "step": 28404 + }, + { + "epoch": 0.82, + "grad_norm": 1.2875935094979103, + "learning_rate": 7.916097463713335e-07, + "loss": 0.263, + "step": 28405 + }, + { + "epoch": 0.82, + "grad_norm": 1.3794864729360585, + "learning_rate": 7.913561323523317e-07, + "loss": 0.2697, + "step": 28406 + }, + { + "epoch": 0.82, + "grad_norm": 1.006299367870581, + "learning_rate": 7.911025554746571e-07, + "loss": 0.5993, + "step": 28407 + }, + { + "epoch": 0.82, + "grad_norm": 1.3542640614629406, + "learning_rate": 7.908490157405491e-07, + "loss": 0.266, + "step": 28408 + }, + { + "epoch": 0.82, + "grad_norm": 1.4706747372650846, + "learning_rate": 7.905955131522458e-07, + "loss": 0.2603, + "step": 28409 + }, + { + "epoch": 0.82, + "grad_norm": 1.2474511662011634, + "learning_rate": 7.903420477119833e-07, + "loss": 0.2779, + "step": 28410 + }, + { + "epoch": 0.82, + "grad_norm": 1.299512080646041, + "learning_rate": 7.900886194219992e-07, + "loss": 0.2625, + "step": 28411 + }, + { + "epoch": 0.82, + "grad_norm": 1.348679302309432, + "learning_rate": 7.898352282845295e-07, + "loss": 0.2891, + "step": 28412 + }, + { + "epoch": 0.82, + "grad_norm": 1.3463310903444263, + "learning_rate": 7.895818743018108e-07, + "loss": 0.2643, + "step": 28413 + }, + { + "epoch": 0.82, + "grad_norm": 1.342764688647784, + "learning_rate": 7.893285574760801e-07, + "loss": 0.2869, + "step": 28414 + }, + { + "epoch": 0.82, + "grad_norm": 1.3122295395229688, + "learning_rate": 7.8907527780957e-07, + "loss": 0.2651, + "step": 28415 + }, + { + "epoch": 0.82, + "grad_norm": 1.3629662603803436, + "learning_rate": 7.888220353045173e-07, + "loss": 0.2679, + "step": 28416 + }, + { + "epoch": 0.82, + "grad_norm": 1.293952158022725, + "learning_rate": 7.885688299631572e-07, + "loss": 0.3103, + "step": 28417 + }, + { + "epoch": 0.82, + "grad_norm": 1.1728584372027153, + "learning_rate": 7.883156617877236e-07, + "loss": 0.2426, + "step": 28418 + }, + { + "epoch": 0.82, + "grad_norm": 1.5751780970718523, + "learning_rate": 7.880625307804512e-07, + "loss": 0.2686, + "step": 28419 + }, + { + "epoch": 0.82, + "grad_norm": 1.4461672570733846, + "learning_rate": 7.878094369435735e-07, + "loss": 0.3024, + "step": 28420 + }, + { + "epoch": 0.82, + "grad_norm": 1.5785441546520975, + "learning_rate": 7.875563802793257e-07, + "loss": 0.2775, + "step": 28421 + }, + { + "epoch": 0.82, + "grad_norm": 2.0217191230356164, + "learning_rate": 7.873033607899383e-07, + "loss": 0.3212, + "step": 28422 + }, + { + "epoch": 0.82, + "grad_norm": 1.3648513615890039, + "learning_rate": 7.870503784776451e-07, + "loss": 0.2659, + "step": 28423 + }, + { + "epoch": 0.82, + "grad_norm": 1.5600315357190397, + "learning_rate": 7.867974333446792e-07, + "loss": 0.274, + "step": 28424 + }, + { + "epoch": 0.82, + "grad_norm": 1.34854276841391, + "learning_rate": 7.865445253932741e-07, + "loss": 0.2814, + "step": 28425 + }, + { + "epoch": 0.82, + "grad_norm": 1.2651045800277252, + "learning_rate": 7.862916546256594e-07, + "loss": 0.2444, + "step": 28426 + }, + { + "epoch": 0.82, + "grad_norm": 1.2098856279178973, + "learning_rate": 7.860388210440673e-07, + "loss": 0.2491, + "step": 28427 + }, + { + "epoch": 0.82, + "grad_norm": 1.8496539515080215, + "learning_rate": 7.857860246507293e-07, + "loss": 0.2684, + "step": 28428 + }, + { + "epoch": 0.82, + "grad_norm": 1.8978167099984666, + "learning_rate": 7.855332654478759e-07, + "loss": 0.2693, + "step": 28429 + }, + { + "epoch": 0.82, + "grad_norm": 1.5592116966333445, + "learning_rate": 7.852805434377392e-07, + "loss": 0.2604, + "step": 28430 + }, + { + "epoch": 0.82, + "grad_norm": 1.3825008955965103, + "learning_rate": 7.850278586225473e-07, + "loss": 0.2913, + "step": 28431 + }, + { + "epoch": 0.82, + "grad_norm": 1.4806860277889942, + "learning_rate": 7.84775211004532e-07, + "loss": 0.2725, + "step": 28432 + }, + { + "epoch": 0.82, + "grad_norm": 1.3000460708606905, + "learning_rate": 7.845226005859236e-07, + "loss": 0.2658, + "step": 28433 + }, + { + "epoch": 0.82, + "grad_norm": 1.3417105094770068, + "learning_rate": 7.842700273689486e-07, + "loss": 0.2666, + "step": 28434 + }, + { + "epoch": 0.82, + "grad_norm": 1.3466504302067628, + "learning_rate": 7.840174913558373e-07, + "loss": 0.283, + "step": 28435 + }, + { + "epoch": 0.82, + "grad_norm": 1.260113921256751, + "learning_rate": 7.837649925488184e-07, + "loss": 0.2643, + "step": 28436 + }, + { + "epoch": 0.82, + "grad_norm": 1.2418003919265488, + "learning_rate": 7.835125309501202e-07, + "loss": 0.293, + "step": 28437 + }, + { + "epoch": 0.82, + "grad_norm": 1.61857191018163, + "learning_rate": 7.83260106561971e-07, + "loss": 0.2625, + "step": 28438 + }, + { + "epoch": 0.82, + "grad_norm": 1.4778262594834943, + "learning_rate": 7.830077193865976e-07, + "loss": 0.2687, + "step": 28439 + }, + { + "epoch": 0.82, + "grad_norm": 1.3290817198768965, + "learning_rate": 7.827553694262285e-07, + "loss": 0.2686, + "step": 28440 + }, + { + "epoch": 0.82, + "grad_norm": 1.5217507790458873, + "learning_rate": 7.825030566830894e-07, + "loss": 0.2797, + "step": 28441 + }, + { + "epoch": 0.82, + "grad_norm": 1.3599303538146128, + "learning_rate": 7.822507811594094e-07, + "loss": 0.276, + "step": 28442 + }, + { + "epoch": 0.82, + "grad_norm": 1.385934819074085, + "learning_rate": 7.819985428574112e-07, + "loss": 0.2766, + "step": 28443 + }, + { + "epoch": 0.83, + "grad_norm": 1.5794237521223768, + "learning_rate": 7.817463417793231e-07, + "loss": 0.2675, + "step": 28444 + }, + { + "epoch": 0.83, + "grad_norm": 1.414621436919924, + "learning_rate": 7.814941779273705e-07, + "loss": 0.2764, + "step": 28445 + }, + { + "epoch": 0.83, + "grad_norm": 1.3031820190341183, + "learning_rate": 7.812420513037783e-07, + "loss": 0.2694, + "step": 28446 + }, + { + "epoch": 0.83, + "grad_norm": 1.9548347416266079, + "learning_rate": 7.809899619107714e-07, + "loss": 0.2563, + "step": 28447 + }, + { + "epoch": 0.83, + "grad_norm": 1.3289399737720753, + "learning_rate": 7.807379097505758e-07, + "loss": 0.2652, + "step": 28448 + }, + { + "epoch": 0.83, + "grad_norm": 1.7277922239469032, + "learning_rate": 7.804858948254141e-07, + "loss": 0.2798, + "step": 28449 + }, + { + "epoch": 0.83, + "grad_norm": 1.5243102630473835, + "learning_rate": 7.802339171375128e-07, + "loss": 0.2574, + "step": 28450 + }, + { + "epoch": 0.83, + "grad_norm": 1.3023139599348807, + "learning_rate": 7.799819766890926e-07, + "loss": 0.2631, + "step": 28451 + }, + { + "epoch": 0.83, + "grad_norm": 1.5210334085731754, + "learning_rate": 7.797300734823798e-07, + "loss": 0.2666, + "step": 28452 + }, + { + "epoch": 0.83, + "grad_norm": 1.309806435115666, + "learning_rate": 7.794782075195945e-07, + "loss": 0.2605, + "step": 28453 + }, + { + "epoch": 0.83, + "grad_norm": 1.533794772787044, + "learning_rate": 7.792263788029608e-07, + "loss": 0.2857, + "step": 28454 + }, + { + "epoch": 0.83, + "grad_norm": 1.0153748133706755, + "learning_rate": 7.78974587334701e-07, + "loss": 0.6348, + "step": 28455 + }, + { + "epoch": 0.83, + "grad_norm": 1.256084788017171, + "learning_rate": 7.787228331170377e-07, + "loss": 0.2655, + "step": 28456 + }, + { + "epoch": 0.83, + "grad_norm": 1.2483683829433192, + "learning_rate": 7.784711161521918e-07, + "loss": 0.2688, + "step": 28457 + }, + { + "epoch": 0.83, + "grad_norm": 1.35252468354881, + "learning_rate": 7.782194364423856e-07, + "loss": 0.287, + "step": 28458 + }, + { + "epoch": 0.83, + "grad_norm": 1.3365188864506095, + "learning_rate": 7.779677939898395e-07, + "loss": 0.2712, + "step": 28459 + }, + { + "epoch": 0.83, + "grad_norm": 1.337775028580387, + "learning_rate": 7.777161887967744e-07, + "loss": 0.2772, + "step": 28460 + }, + { + "epoch": 0.83, + "grad_norm": 1.6780530606791089, + "learning_rate": 7.774646208654118e-07, + "loss": 0.263, + "step": 28461 + }, + { + "epoch": 0.83, + "grad_norm": 1.4132269713737475, + "learning_rate": 7.772130901979696e-07, + "loss": 0.2825, + "step": 28462 + }, + { + "epoch": 0.83, + "grad_norm": 1.0014790062833399, + "learning_rate": 7.769615967966693e-07, + "loss": 0.5778, + "step": 28463 + }, + { + "epoch": 0.83, + "grad_norm": 1.28257288960868, + "learning_rate": 7.767101406637289e-07, + "loss": 0.2748, + "step": 28464 + }, + { + "epoch": 0.83, + "grad_norm": 1.4480973848804335, + "learning_rate": 7.764587218013692e-07, + "loss": 0.3051, + "step": 28465 + }, + { + "epoch": 0.83, + "grad_norm": 1.3588053719919977, + "learning_rate": 7.762073402118075e-07, + "loss": 0.2873, + "step": 28466 + }, + { + "epoch": 0.83, + "grad_norm": 1.3992928968516718, + "learning_rate": 7.759559958972628e-07, + "loss": 0.2596, + "step": 28467 + }, + { + "epoch": 0.83, + "grad_norm": 1.4061520431739911, + "learning_rate": 7.75704688859954e-07, + "loss": 0.2755, + "step": 28468 + }, + { + "epoch": 0.83, + "grad_norm": 1.292115653160273, + "learning_rate": 7.754534191020985e-07, + "loss": 0.2846, + "step": 28469 + }, + { + "epoch": 0.83, + "grad_norm": 1.2867538833866985, + "learning_rate": 7.75202186625913e-07, + "loss": 0.2787, + "step": 28470 + }, + { + "epoch": 0.83, + "grad_norm": 1.2633549385586396, + "learning_rate": 7.749509914336146e-07, + "loss": 0.2615, + "step": 28471 + }, + { + "epoch": 0.83, + "grad_norm": 1.4446320244898774, + "learning_rate": 7.746998335274208e-07, + "loss": 0.282, + "step": 28472 + }, + { + "epoch": 0.83, + "grad_norm": 1.3109336298560523, + "learning_rate": 7.744487129095479e-07, + "loss": 0.2653, + "step": 28473 + }, + { + "epoch": 0.83, + "grad_norm": 1.3063190160289944, + "learning_rate": 7.741976295822118e-07, + "loss": 0.2721, + "step": 28474 + }, + { + "epoch": 0.83, + "grad_norm": 1.3696081516092482, + "learning_rate": 7.739465835476284e-07, + "loss": 0.281, + "step": 28475 + }, + { + "epoch": 0.83, + "grad_norm": 1.3937754324636005, + "learning_rate": 7.736955748080138e-07, + "loss": 0.2964, + "step": 28476 + }, + { + "epoch": 0.83, + "grad_norm": 1.3620600050818765, + "learning_rate": 7.734446033655824e-07, + "loss": 0.2476, + "step": 28477 + }, + { + "epoch": 0.83, + "grad_norm": 1.3676903934858868, + "learning_rate": 7.7319366922255e-07, + "loss": 0.2871, + "step": 28478 + }, + { + "epoch": 0.83, + "grad_norm": 1.2904854297878838, + "learning_rate": 7.729427723811294e-07, + "loss": 0.2773, + "step": 28479 + }, + { + "epoch": 0.83, + "grad_norm": 1.3620418172389621, + "learning_rate": 7.726919128435373e-07, + "loss": 0.2559, + "step": 28480 + }, + { + "epoch": 0.83, + "grad_norm": 1.279135108810726, + "learning_rate": 7.724410906119845e-07, + "loss": 0.2709, + "step": 28481 + }, + { + "epoch": 0.83, + "grad_norm": 1.3566402148256462, + "learning_rate": 7.721903056886859e-07, + "loss": 0.2817, + "step": 28482 + }, + { + "epoch": 0.83, + "grad_norm": 1.3053490512304164, + "learning_rate": 7.719395580758549e-07, + "loss": 0.2699, + "step": 28483 + }, + { + "epoch": 0.83, + "grad_norm": 1.3310481946843966, + "learning_rate": 7.716888477757039e-07, + "loss": 0.2661, + "step": 28484 + }, + { + "epoch": 0.83, + "grad_norm": 1.323875928880469, + "learning_rate": 7.714381747904459e-07, + "loss": 0.2925, + "step": 28485 + }, + { + "epoch": 0.83, + "grad_norm": 1.3185252881147367, + "learning_rate": 7.711875391222934e-07, + "loss": 0.2696, + "step": 28486 + }, + { + "epoch": 0.83, + "grad_norm": 1.540215216180229, + "learning_rate": 7.709369407734574e-07, + "loss": 0.3082, + "step": 28487 + }, + { + "epoch": 0.83, + "grad_norm": 1.4689199109540974, + "learning_rate": 7.706863797461506e-07, + "loss": 0.2916, + "step": 28488 + }, + { + "epoch": 0.83, + "grad_norm": 1.3861339031007647, + "learning_rate": 7.704358560425829e-07, + "loss": 0.2717, + "step": 28489 + }, + { + "epoch": 0.83, + "grad_norm": 1.5976500421804505, + "learning_rate": 7.701853696649653e-07, + "loss": 0.276, + "step": 28490 + }, + { + "epoch": 0.83, + "grad_norm": 2.1564833190193426, + "learning_rate": 7.699349206155087e-07, + "loss": 0.2764, + "step": 28491 + }, + { + "epoch": 0.83, + "grad_norm": 1.3443641477516874, + "learning_rate": 7.696845088964234e-07, + "loss": 0.2585, + "step": 28492 + }, + { + "epoch": 0.83, + "grad_norm": 1.6333087588053146, + "learning_rate": 7.694341345099193e-07, + "loss": 0.2767, + "step": 28493 + }, + { + "epoch": 0.83, + "grad_norm": 1.3874160195390106, + "learning_rate": 7.691837974582061e-07, + "loss": 0.2634, + "step": 28494 + }, + { + "epoch": 0.83, + "grad_norm": 1.435134024312784, + "learning_rate": 7.689334977434926e-07, + "loss": 0.276, + "step": 28495 + }, + { + "epoch": 0.83, + "grad_norm": 1.3109454683043709, + "learning_rate": 7.68683235367988e-07, + "loss": 0.2768, + "step": 28496 + }, + { + "epoch": 0.83, + "grad_norm": 1.433824353688451, + "learning_rate": 7.684330103339016e-07, + "loss": 0.2547, + "step": 28497 + }, + { + "epoch": 0.83, + "grad_norm": 1.376181761897402, + "learning_rate": 7.681828226434402e-07, + "loss": 0.2848, + "step": 28498 + }, + { + "epoch": 0.83, + "grad_norm": 1.3127107743033524, + "learning_rate": 7.679326722988118e-07, + "loss": 0.2732, + "step": 28499 + }, + { + "epoch": 0.83, + "grad_norm": 1.584673334011198, + "learning_rate": 7.676825593022252e-07, + "loss": 0.2709, + "step": 28500 + }, + { + "epoch": 0.83, + "grad_norm": 1.371232091547548, + "learning_rate": 7.674324836558867e-07, + "loss": 0.2802, + "step": 28501 + }, + { + "epoch": 0.83, + "grad_norm": 1.3343307752793725, + "learning_rate": 7.67182445362003e-07, + "loss": 0.2611, + "step": 28502 + }, + { + "epoch": 0.83, + "grad_norm": 1.8084518110717644, + "learning_rate": 7.669324444227815e-07, + "loss": 0.2852, + "step": 28503 + }, + { + "epoch": 0.83, + "grad_norm": 1.281320276199017, + "learning_rate": 7.666824808404283e-07, + "loss": 0.2614, + "step": 28504 + }, + { + "epoch": 0.83, + "grad_norm": 1.2569723898327512, + "learning_rate": 7.664325546171502e-07, + "loss": 0.2994, + "step": 28505 + }, + { + "epoch": 0.83, + "grad_norm": 0.957602716092266, + "learning_rate": 7.661826657551508e-07, + "loss": 0.5814, + "step": 28506 + }, + { + "epoch": 0.83, + "grad_norm": 1.2141873132933931, + "learning_rate": 7.659328142566369e-07, + "loss": 0.2496, + "step": 28507 + }, + { + "epoch": 0.83, + "grad_norm": 1.3523966580532985, + "learning_rate": 7.656830001238125e-07, + "loss": 0.2632, + "step": 28508 + }, + { + "epoch": 0.83, + "grad_norm": 1.7439163471400327, + "learning_rate": 7.654332233588819e-07, + "loss": 0.2573, + "step": 28509 + }, + { + "epoch": 0.83, + "grad_norm": 1.29222510371806, + "learning_rate": 7.651834839640505e-07, + "loss": 0.2676, + "step": 28510 + }, + { + "epoch": 0.83, + "grad_norm": 3.4501857111626877, + "learning_rate": 7.64933781941522e-07, + "loss": 0.2657, + "step": 28511 + }, + { + "epoch": 0.83, + "grad_norm": 2.1958962603958243, + "learning_rate": 7.646841172934993e-07, + "loss": 0.2639, + "step": 28512 + }, + { + "epoch": 0.83, + "grad_norm": 1.3201563749266374, + "learning_rate": 7.644344900221868e-07, + "loss": 0.2844, + "step": 28513 + }, + { + "epoch": 0.83, + "grad_norm": 1.3812962233687889, + "learning_rate": 7.641849001297863e-07, + "loss": 0.2717, + "step": 28514 + }, + { + "epoch": 0.83, + "grad_norm": 1.4050788836473815, + "learning_rate": 7.639353476185013e-07, + "loss": 0.2853, + "step": 28515 + }, + { + "epoch": 0.83, + "grad_norm": 1.3489545439001782, + "learning_rate": 7.636858324905355e-07, + "loss": 0.2714, + "step": 28516 + }, + { + "epoch": 0.83, + "grad_norm": 1.30605942924608, + "learning_rate": 7.634363547480878e-07, + "loss": 0.2578, + "step": 28517 + }, + { + "epoch": 0.83, + "grad_norm": 1.268671372501806, + "learning_rate": 7.631869143933606e-07, + "loss": 0.271, + "step": 28518 + }, + { + "epoch": 0.83, + "grad_norm": 1.4155132609744736, + "learning_rate": 7.62937511428557e-07, + "loss": 0.277, + "step": 28519 + }, + { + "epoch": 0.83, + "grad_norm": 1.456228298840606, + "learning_rate": 7.626881458558761e-07, + "loss": 0.2696, + "step": 28520 + }, + { + "epoch": 0.83, + "grad_norm": 1.2522605573838572, + "learning_rate": 7.624388176775194e-07, + "loss": 0.2468, + "step": 28521 + }, + { + "epoch": 0.83, + "grad_norm": 1.2817129127308784, + "learning_rate": 7.621895268956875e-07, + "loss": 0.2737, + "step": 28522 + }, + { + "epoch": 0.83, + "grad_norm": 1.3174169933524498, + "learning_rate": 7.619402735125797e-07, + "loss": 0.2638, + "step": 28523 + }, + { + "epoch": 0.83, + "grad_norm": 1.347620340007624, + "learning_rate": 7.616910575303971e-07, + "loss": 0.2778, + "step": 28524 + }, + { + "epoch": 0.83, + "grad_norm": 2.204811827919329, + "learning_rate": 7.614418789513372e-07, + "loss": 0.2852, + "step": 28525 + }, + { + "epoch": 0.83, + "grad_norm": 1.4667054561691488, + "learning_rate": 7.61192737777599e-07, + "loss": 0.2706, + "step": 28526 + }, + { + "epoch": 0.83, + "grad_norm": 1.6899314847985276, + "learning_rate": 7.609436340113824e-07, + "loss": 0.2726, + "step": 28527 + }, + { + "epoch": 0.83, + "grad_norm": 1.214459481962901, + "learning_rate": 7.606945676548855e-07, + "loss": 0.2411, + "step": 28528 + }, + { + "epoch": 0.83, + "grad_norm": 1.288700310646266, + "learning_rate": 7.60445538710306e-07, + "loss": 0.3009, + "step": 28529 + }, + { + "epoch": 0.83, + "grad_norm": 1.3407412087513442, + "learning_rate": 7.601965471798412e-07, + "loss": 0.2725, + "step": 28530 + }, + { + "epoch": 0.83, + "grad_norm": 1.2839259390277349, + "learning_rate": 7.599475930656891e-07, + "loss": 0.2669, + "step": 28531 + }, + { + "epoch": 0.83, + "grad_norm": 1.7465917896296783, + "learning_rate": 7.596986763700464e-07, + "loss": 0.2982, + "step": 28532 + }, + { + "epoch": 0.83, + "grad_norm": 1.3103627464200027, + "learning_rate": 7.594497970951114e-07, + "loss": 0.2839, + "step": 28533 + }, + { + "epoch": 0.83, + "grad_norm": 1.2032855003494773, + "learning_rate": 7.59200955243078e-07, + "loss": 0.2495, + "step": 28534 + }, + { + "epoch": 0.83, + "grad_norm": 1.3706780661540199, + "learning_rate": 7.58952150816144e-07, + "loss": 0.3161, + "step": 28535 + }, + { + "epoch": 0.83, + "grad_norm": 1.3859340596428027, + "learning_rate": 7.587033838165032e-07, + "loss": 0.2687, + "step": 28536 + }, + { + "epoch": 0.83, + "grad_norm": 1.3180995964874833, + "learning_rate": 7.584546542463522e-07, + "loss": 0.2754, + "step": 28537 + }, + { + "epoch": 0.83, + "grad_norm": 2.3554910133322955, + "learning_rate": 7.582059621078858e-07, + "loss": 0.2774, + "step": 28538 + }, + { + "epoch": 0.83, + "grad_norm": 1.4233507696978231, + "learning_rate": 7.579573074032987e-07, + "loss": 0.2565, + "step": 28539 + }, + { + "epoch": 0.83, + "grad_norm": 1.4856374150979905, + "learning_rate": 7.577086901347857e-07, + "loss": 0.2549, + "step": 28540 + }, + { + "epoch": 0.83, + "grad_norm": 1.3798627783547428, + "learning_rate": 7.574601103045404e-07, + "loss": 0.265, + "step": 28541 + }, + { + "epoch": 0.83, + "grad_norm": 1.706968175706253, + "learning_rate": 7.572115679147568e-07, + "loss": 0.2676, + "step": 28542 + }, + { + "epoch": 0.83, + "grad_norm": 1.293494376962524, + "learning_rate": 7.569630629676294e-07, + "loss": 0.2904, + "step": 28543 + }, + { + "epoch": 0.83, + "grad_norm": 1.2057254896126073, + "learning_rate": 7.567145954653488e-07, + "loss": 0.2632, + "step": 28544 + }, + { + "epoch": 0.83, + "grad_norm": 1.3702489000943507, + "learning_rate": 7.564661654101091e-07, + "loss": 0.2758, + "step": 28545 + }, + { + "epoch": 0.83, + "grad_norm": 1.442917843194295, + "learning_rate": 7.56217772804102e-07, + "loss": 0.2724, + "step": 28546 + }, + { + "epoch": 0.83, + "grad_norm": 4.5046501965631585, + "learning_rate": 7.559694176495208e-07, + "loss": 0.2742, + "step": 28547 + }, + { + "epoch": 0.83, + "grad_norm": 1.3394286571623328, + "learning_rate": 7.557210999485564e-07, + "loss": 0.2665, + "step": 28548 + }, + { + "epoch": 0.83, + "grad_norm": 1.663626727707017, + "learning_rate": 7.554728197034001e-07, + "loss": 0.2925, + "step": 28549 + }, + { + "epoch": 0.83, + "grad_norm": 2.4317198570861813, + "learning_rate": 7.552245769162436e-07, + "loss": 0.2693, + "step": 28550 + }, + { + "epoch": 0.83, + "grad_norm": 1.5367667157408695, + "learning_rate": 7.549763715892771e-07, + "loss": 0.3084, + "step": 28551 + }, + { + "epoch": 0.83, + "grad_norm": 1.7731628158722954, + "learning_rate": 7.547282037246922e-07, + "loss": 0.311, + "step": 28552 + }, + { + "epoch": 0.83, + "grad_norm": 1.6030341977678018, + "learning_rate": 7.544800733246771e-07, + "loss": 0.2936, + "step": 28553 + }, + { + "epoch": 0.83, + "grad_norm": 1.4069500139704667, + "learning_rate": 7.542319803914228e-07, + "loss": 0.2636, + "step": 28554 + }, + { + "epoch": 0.83, + "grad_norm": 1.3973883693088887, + "learning_rate": 7.539839249271175e-07, + "loss": 0.2682, + "step": 28555 + }, + { + "epoch": 0.83, + "grad_norm": 1.9334582010850523, + "learning_rate": 7.537359069339518e-07, + "loss": 0.2656, + "step": 28556 + }, + { + "epoch": 0.83, + "grad_norm": 1.2596665463733994, + "learning_rate": 7.534879264141138e-07, + "loss": 0.2702, + "step": 28557 + }, + { + "epoch": 0.83, + "grad_norm": 1.4277103491127663, + "learning_rate": 7.532399833697917e-07, + "loss": 0.2808, + "step": 28558 + }, + { + "epoch": 0.83, + "grad_norm": 2.271133613860735, + "learning_rate": 7.529920778031741e-07, + "loss": 0.278, + "step": 28559 + }, + { + "epoch": 0.83, + "grad_norm": 1.4999787337760293, + "learning_rate": 7.527442097164494e-07, + "loss": 0.2836, + "step": 28560 + }, + { + "epoch": 0.83, + "grad_norm": 1.544312485339819, + "learning_rate": 7.524963791118034e-07, + "loss": 0.2766, + "step": 28561 + }, + { + "epoch": 0.83, + "grad_norm": 1.2354665043365138, + "learning_rate": 7.522485859914247e-07, + "loss": 0.2569, + "step": 28562 + }, + { + "epoch": 0.83, + "grad_norm": 1.4464863360135995, + "learning_rate": 7.520008303574983e-07, + "loss": 0.2669, + "step": 28563 + }, + { + "epoch": 0.83, + "grad_norm": 1.6830746284652633, + "learning_rate": 7.517531122122118e-07, + "loss": 0.2854, + "step": 28564 + }, + { + "epoch": 0.83, + "grad_norm": 2.0558921597797153, + "learning_rate": 7.515054315577513e-07, + "loss": 0.2758, + "step": 28565 + }, + { + "epoch": 0.83, + "grad_norm": 1.2256626931857448, + "learning_rate": 7.512577883963024e-07, + "loss": 0.2768, + "step": 28566 + }, + { + "epoch": 0.83, + "grad_norm": 1.3970018573811567, + "learning_rate": 7.510101827300509e-07, + "loss": 0.2782, + "step": 28567 + }, + { + "epoch": 0.83, + "grad_norm": 1.2284564495816954, + "learning_rate": 7.507626145611812e-07, + "loss": 0.2759, + "step": 28568 + }, + { + "epoch": 0.83, + "grad_norm": 1.388627266065897, + "learning_rate": 7.50515083891879e-07, + "loss": 0.2549, + "step": 28569 + }, + { + "epoch": 0.83, + "grad_norm": 1.3313450652322316, + "learning_rate": 7.502675907243278e-07, + "loss": 0.2866, + "step": 28570 + }, + { + "epoch": 0.83, + "grad_norm": 0.943910993345612, + "learning_rate": 7.50020135060714e-07, + "loss": 0.5673, + "step": 28571 + }, + { + "epoch": 0.83, + "grad_norm": 1.5974682240708713, + "learning_rate": 7.497727169032182e-07, + "loss": 0.3042, + "step": 28572 + }, + { + "epoch": 0.83, + "grad_norm": 1.3541118382738435, + "learning_rate": 7.495253362540256e-07, + "loss": 0.2908, + "step": 28573 + }, + { + "epoch": 0.83, + "grad_norm": 1.2929720571583512, + "learning_rate": 7.492779931153187e-07, + "loss": 0.2843, + "step": 28574 + }, + { + "epoch": 0.83, + "grad_norm": 1.329174274366053, + "learning_rate": 7.490306874892805e-07, + "loss": 0.2753, + "step": 28575 + }, + { + "epoch": 0.83, + "grad_norm": 1.3220950998404555, + "learning_rate": 7.48783419378094e-07, + "loss": 0.2663, + "step": 28576 + }, + { + "epoch": 0.83, + "grad_norm": 1.2626499750912623, + "learning_rate": 7.48536188783941e-07, + "loss": 0.2786, + "step": 28577 + }, + { + "epoch": 0.83, + "grad_norm": 1.2841845087208628, + "learning_rate": 7.482889957090034e-07, + "loss": 0.2718, + "step": 28578 + }, + { + "epoch": 0.83, + "grad_norm": 1.4053819451978486, + "learning_rate": 7.480418401554634e-07, + "loss": 0.2802, + "step": 28579 + }, + { + "epoch": 0.83, + "grad_norm": 1.2379863130336624, + "learning_rate": 7.477947221255005e-07, + "loss": 0.2673, + "step": 28580 + }, + { + "epoch": 0.83, + "grad_norm": 1.3722313944810867, + "learning_rate": 7.475476416212962e-07, + "loss": 0.2646, + "step": 28581 + }, + { + "epoch": 0.83, + "grad_norm": 1.3239961068411255, + "learning_rate": 7.47300598645031e-07, + "loss": 0.2655, + "step": 28582 + }, + { + "epoch": 0.83, + "grad_norm": 1.438787454601782, + "learning_rate": 7.470535931988854e-07, + "loss": 0.2786, + "step": 28583 + }, + { + "epoch": 0.83, + "grad_norm": 0.9077468062859536, + "learning_rate": 7.468066252850386e-07, + "loss": 0.6055, + "step": 28584 + }, + { + "epoch": 0.83, + "grad_norm": 1.3520333941935045, + "learning_rate": 7.465596949056708e-07, + "loss": 0.293, + "step": 28585 + }, + { + "epoch": 0.83, + "grad_norm": 1.3384862034070928, + "learning_rate": 7.463128020629612e-07, + "loss": 0.2653, + "step": 28586 + }, + { + "epoch": 0.83, + "grad_norm": 1.2508882241186914, + "learning_rate": 7.46065946759088e-07, + "loss": 0.2551, + "step": 28587 + }, + { + "epoch": 0.83, + "grad_norm": 1.2703164893424816, + "learning_rate": 7.458191289962313e-07, + "loss": 0.2774, + "step": 28588 + }, + { + "epoch": 0.83, + "grad_norm": 1.195700505663121, + "learning_rate": 7.455723487765664e-07, + "loss": 0.256, + "step": 28589 + }, + { + "epoch": 0.83, + "grad_norm": 6.453187672643486, + "learning_rate": 7.453256061022746e-07, + "loss": 0.2805, + "step": 28590 + }, + { + "epoch": 0.83, + "grad_norm": 1.474822925739667, + "learning_rate": 7.450789009755299e-07, + "loss": 0.3077, + "step": 28591 + }, + { + "epoch": 0.83, + "grad_norm": 1.2906583052784306, + "learning_rate": 7.448322333985109e-07, + "loss": 0.2611, + "step": 28592 + }, + { + "epoch": 0.83, + "grad_norm": 1.2762927246031326, + "learning_rate": 7.44585603373395e-07, + "loss": 0.2467, + "step": 28593 + }, + { + "epoch": 0.83, + "grad_norm": 1.2964490168944598, + "learning_rate": 7.443390109023579e-07, + "loss": 0.2656, + "step": 28594 + }, + { + "epoch": 0.83, + "grad_norm": 1.4171331222115402, + "learning_rate": 7.440924559875767e-07, + "loss": 0.2774, + "step": 28595 + }, + { + "epoch": 0.83, + "grad_norm": 1.4658030566571882, + "learning_rate": 7.438459386312269e-07, + "loss": 0.3072, + "step": 28596 + }, + { + "epoch": 0.83, + "grad_norm": 1.486713170249819, + "learning_rate": 7.435994588354839e-07, + "loss": 0.2907, + "step": 28597 + }, + { + "epoch": 0.83, + "grad_norm": 1.3930316966889003, + "learning_rate": 7.433530166025238e-07, + "loss": 0.2666, + "step": 28598 + }, + { + "epoch": 0.83, + "grad_norm": 1.5491110923457359, + "learning_rate": 7.431066119345193e-07, + "loss": 0.2537, + "step": 28599 + }, + { + "epoch": 0.83, + "grad_norm": 1.362080412875254, + "learning_rate": 7.428602448336463e-07, + "loss": 0.2587, + "step": 28600 + }, + { + "epoch": 0.83, + "grad_norm": 1.452988644228963, + "learning_rate": 7.426139153020789e-07, + "loss": 0.2925, + "step": 28601 + }, + { + "epoch": 0.83, + "grad_norm": 0.929018682124376, + "learning_rate": 7.423676233419908e-07, + "loss": 0.5567, + "step": 28602 + }, + { + "epoch": 0.83, + "grad_norm": 1.35112411672909, + "learning_rate": 7.421213689555556e-07, + "loss": 0.2655, + "step": 28603 + }, + { + "epoch": 0.83, + "grad_norm": 1.2268990713112737, + "learning_rate": 7.418751521449468e-07, + "loss": 0.2608, + "step": 28604 + }, + { + "epoch": 0.83, + "grad_norm": 1.5654244945460996, + "learning_rate": 7.416289729123372e-07, + "loss": 0.2956, + "step": 28605 + }, + { + "epoch": 0.83, + "grad_norm": 1.5184854995337755, + "learning_rate": 7.413828312598986e-07, + "loss": 0.2711, + "step": 28606 + }, + { + "epoch": 0.83, + "grad_norm": 1.528848791949175, + "learning_rate": 7.411367271898051e-07, + "loss": 0.283, + "step": 28607 + }, + { + "epoch": 0.83, + "grad_norm": 1.5826971780544652, + "learning_rate": 7.408906607042266e-07, + "loss": 0.285, + "step": 28608 + }, + { + "epoch": 0.83, + "grad_norm": 1.5109272117957788, + "learning_rate": 7.406446318053345e-07, + "loss": 0.2617, + "step": 28609 + }, + { + "epoch": 0.83, + "grad_norm": 1.3233219819391384, + "learning_rate": 7.403986404953012e-07, + "loss": 0.2525, + "step": 28610 + }, + { + "epoch": 0.83, + "grad_norm": 1.4406491204279281, + "learning_rate": 7.40152686776297e-07, + "loss": 0.2985, + "step": 28611 + }, + { + "epoch": 0.83, + "grad_norm": 1.4269104121750942, + "learning_rate": 7.399067706504925e-07, + "loss": 0.2657, + "step": 28612 + }, + { + "epoch": 0.83, + "grad_norm": 1.4440524638570595, + "learning_rate": 7.396608921200582e-07, + "loss": 0.2669, + "step": 28613 + }, + { + "epoch": 0.83, + "grad_norm": 1.3365788209503262, + "learning_rate": 7.394150511871639e-07, + "loss": 0.2787, + "step": 28614 + }, + { + "epoch": 0.83, + "grad_norm": 1.3023570638742687, + "learning_rate": 7.391692478539803e-07, + "loss": 0.2652, + "step": 28615 + }, + { + "epoch": 0.83, + "grad_norm": 1.5440150806647155, + "learning_rate": 7.389234821226738e-07, + "loss": 0.285, + "step": 28616 + }, + { + "epoch": 0.83, + "grad_norm": 1.0518889165272403, + "learning_rate": 7.386777539954149e-07, + "loss": 0.5573, + "step": 28617 + }, + { + "epoch": 0.83, + "grad_norm": 1.448004650852135, + "learning_rate": 7.384320634743735e-07, + "loss": 0.2699, + "step": 28618 + }, + { + "epoch": 0.83, + "grad_norm": 1.2931369368158439, + "learning_rate": 7.381864105617148e-07, + "loss": 0.2702, + "step": 28619 + }, + { + "epoch": 0.83, + "grad_norm": 1.3262743452120171, + "learning_rate": 7.379407952596085e-07, + "loss": 0.2598, + "step": 28620 + }, + { + "epoch": 0.83, + "grad_norm": 1.4605463597361041, + "learning_rate": 7.376952175702223e-07, + "loss": 0.3054, + "step": 28621 + }, + { + "epoch": 0.83, + "grad_norm": 1.5568419353222533, + "learning_rate": 7.37449677495723e-07, + "loss": 0.2651, + "step": 28622 + }, + { + "epoch": 0.83, + "grad_norm": 1.7070191807130086, + "learning_rate": 7.372041750382774e-07, + "loss": 0.2711, + "step": 28623 + }, + { + "epoch": 0.83, + "grad_norm": 1.4676358314295903, + "learning_rate": 7.36958710200052e-07, + "loss": 0.2947, + "step": 28624 + }, + { + "epoch": 0.83, + "grad_norm": 1.2973511156829964, + "learning_rate": 7.367132829832136e-07, + "loss": 0.2594, + "step": 28625 + }, + { + "epoch": 0.83, + "grad_norm": 1.62909155503488, + "learning_rate": 7.364678933899288e-07, + "loss": 0.2947, + "step": 28626 + }, + { + "epoch": 0.83, + "grad_norm": 1.5432180825794093, + "learning_rate": 7.362225414223612e-07, + "loss": 0.2632, + "step": 28627 + }, + { + "epoch": 0.83, + "grad_norm": 1.54706465790481, + "learning_rate": 7.359772270826771e-07, + "loss": 0.302, + "step": 28628 + }, + { + "epoch": 0.83, + "grad_norm": 0.9711118441650979, + "learning_rate": 7.357319503730409e-07, + "loss": 0.5734, + "step": 28629 + }, + { + "epoch": 0.83, + "grad_norm": 1.4474042270331793, + "learning_rate": 7.354867112956176e-07, + "loss": 0.284, + "step": 28630 + }, + { + "epoch": 0.83, + "grad_norm": 1.5067818147130183, + "learning_rate": 7.352415098525711e-07, + "loss": 0.316, + "step": 28631 + }, + { + "epoch": 0.83, + "grad_norm": 1.3410975427959348, + "learning_rate": 7.349963460460662e-07, + "loss": 0.2796, + "step": 28632 + }, + { + "epoch": 0.83, + "grad_norm": 1.3469925086092853, + "learning_rate": 7.347512198782658e-07, + "loss": 0.2883, + "step": 28633 + }, + { + "epoch": 0.83, + "grad_norm": 1.350237331046539, + "learning_rate": 7.34506131351334e-07, + "loss": 0.2632, + "step": 28634 + }, + { + "epoch": 0.83, + "grad_norm": 1.4330701074431236, + "learning_rate": 7.342610804674316e-07, + "loss": 0.2615, + "step": 28635 + }, + { + "epoch": 0.83, + "grad_norm": 1.4872627476400875, + "learning_rate": 7.340160672287227e-07, + "loss": 0.2589, + "step": 28636 + }, + { + "epoch": 0.83, + "grad_norm": 2.0645112983635365, + "learning_rate": 7.337710916373697e-07, + "loss": 0.2901, + "step": 28637 + }, + { + "epoch": 0.83, + "grad_norm": 1.057513931133931, + "learning_rate": 7.335261536955341e-07, + "loss": 0.5701, + "step": 28638 + }, + { + "epoch": 0.83, + "grad_norm": 1.6888369042389997, + "learning_rate": 7.332812534053768e-07, + "loss": 0.3179, + "step": 28639 + }, + { + "epoch": 0.83, + "grad_norm": 1.47886445845045, + "learning_rate": 7.330363907690602e-07, + "loss": 0.2735, + "step": 28640 + }, + { + "epoch": 0.83, + "grad_norm": 1.31428052809226, + "learning_rate": 7.327915657887452e-07, + "loss": 0.292, + "step": 28641 + }, + { + "epoch": 0.83, + "grad_norm": 1.3703853844035876, + "learning_rate": 7.325467784665913e-07, + "loss": 0.2805, + "step": 28642 + }, + { + "epoch": 0.83, + "grad_norm": 1.484162807571811, + "learning_rate": 7.32302028804761e-07, + "loss": 0.2781, + "step": 28643 + }, + { + "epoch": 0.83, + "grad_norm": 1.4099858449882252, + "learning_rate": 7.320573168054112e-07, + "loss": 0.2851, + "step": 28644 + }, + { + "epoch": 0.83, + "grad_norm": 1.3656205408182294, + "learning_rate": 7.318126424707033e-07, + "loss": 0.2921, + "step": 28645 + }, + { + "epoch": 0.83, + "grad_norm": 1.1910671858766062, + "learning_rate": 7.315680058027974e-07, + "loss": 0.2608, + "step": 28646 + }, + { + "epoch": 0.83, + "grad_norm": 1.6211697995665162, + "learning_rate": 7.313234068038494e-07, + "loss": 0.2927, + "step": 28647 + }, + { + "epoch": 0.83, + "grad_norm": 1.5309818268033597, + "learning_rate": 7.310788454760203e-07, + "loss": 0.2939, + "step": 28648 + }, + { + "epoch": 0.83, + "grad_norm": 1.3283853912607906, + "learning_rate": 7.308343218214675e-07, + "loss": 0.2807, + "step": 28649 + }, + { + "epoch": 0.83, + "grad_norm": 2.2063003585404055, + "learning_rate": 7.305898358423491e-07, + "loss": 0.2775, + "step": 28650 + }, + { + "epoch": 0.83, + "grad_norm": 1.2534695231732027, + "learning_rate": 7.303453875408228e-07, + "loss": 0.2675, + "step": 28651 + }, + { + "epoch": 0.83, + "grad_norm": 1.2572824335209793, + "learning_rate": 7.301009769190459e-07, + "loss": 0.2764, + "step": 28652 + }, + { + "epoch": 0.83, + "grad_norm": 1.3320817939125449, + "learning_rate": 7.29856603979176e-07, + "loss": 0.2661, + "step": 28653 + }, + { + "epoch": 0.83, + "grad_norm": 1.6975590850552134, + "learning_rate": 7.296122687233687e-07, + "loss": 0.2657, + "step": 28654 + }, + { + "epoch": 0.83, + "grad_norm": 1.2946395440004526, + "learning_rate": 7.293679711537799e-07, + "loss": 0.2621, + "step": 28655 + }, + { + "epoch": 0.83, + "grad_norm": 1.3080384484310863, + "learning_rate": 7.291237112725657e-07, + "loss": 0.2574, + "step": 28656 + }, + { + "epoch": 0.83, + "grad_norm": 1.290189592372946, + "learning_rate": 7.288794890818828e-07, + "loss": 0.259, + "step": 28657 + }, + { + "epoch": 0.83, + "grad_norm": 1.2014112119326734, + "learning_rate": 7.286353045838857e-07, + "loss": 0.2583, + "step": 28658 + }, + { + "epoch": 0.83, + "grad_norm": 1.364879862354634, + "learning_rate": 7.28391157780729e-07, + "loss": 0.2838, + "step": 28659 + }, + { + "epoch": 0.83, + "grad_norm": 1.3532351497947677, + "learning_rate": 7.281470486745684e-07, + "loss": 0.2974, + "step": 28660 + }, + { + "epoch": 0.83, + "grad_norm": 2.199483591876402, + "learning_rate": 7.279029772675572e-07, + "loss": 0.2751, + "step": 28661 + }, + { + "epoch": 0.83, + "grad_norm": 2.6482356657396964, + "learning_rate": 7.276589435618503e-07, + "loss": 0.2843, + "step": 28662 + }, + { + "epoch": 0.83, + "grad_norm": 1.9607919569064287, + "learning_rate": 7.274149475595999e-07, + "loss": 0.2718, + "step": 28663 + }, + { + "epoch": 0.83, + "grad_norm": 1.4172882168978262, + "learning_rate": 7.271709892629602e-07, + "loss": 0.2735, + "step": 28664 + }, + { + "epoch": 0.83, + "grad_norm": 1.3492710880969245, + "learning_rate": 7.269270686740837e-07, + "loss": 0.2668, + "step": 28665 + }, + { + "epoch": 0.83, + "grad_norm": 1.3417578033165878, + "learning_rate": 7.266831857951229e-07, + "loss": 0.2641, + "step": 28666 + }, + { + "epoch": 0.83, + "grad_norm": 1.3813974414871222, + "learning_rate": 7.264393406282305e-07, + "loss": 0.2807, + "step": 28667 + }, + { + "epoch": 0.83, + "grad_norm": 1.3392016685102843, + "learning_rate": 7.261955331755588e-07, + "loss": 0.2902, + "step": 28668 + }, + { + "epoch": 0.83, + "grad_norm": 1.4230934391719072, + "learning_rate": 7.259517634392587e-07, + "loss": 0.2742, + "step": 28669 + }, + { + "epoch": 0.83, + "grad_norm": 1.3126194583670996, + "learning_rate": 7.257080314214827e-07, + "loss": 0.2566, + "step": 28670 + }, + { + "epoch": 0.83, + "grad_norm": 1.245293551650069, + "learning_rate": 7.254643371243797e-07, + "loss": 0.2606, + "step": 28671 + }, + { + "epoch": 0.83, + "grad_norm": 1.8344674858558105, + "learning_rate": 7.252206805501011e-07, + "loss": 0.2704, + "step": 28672 + }, + { + "epoch": 0.83, + "grad_norm": 1.3131814889247622, + "learning_rate": 7.249770617007978e-07, + "loss": 0.2709, + "step": 28673 + }, + { + "epoch": 0.83, + "grad_norm": 1.3263404672365653, + "learning_rate": 7.247334805786205e-07, + "loss": 0.2883, + "step": 28674 + }, + { + "epoch": 0.83, + "grad_norm": 1.5246722631180205, + "learning_rate": 7.244899371857161e-07, + "loss": 0.2672, + "step": 28675 + }, + { + "epoch": 0.83, + "grad_norm": 1.2924176819071411, + "learning_rate": 7.242464315242359e-07, + "loss": 0.276, + "step": 28676 + }, + { + "epoch": 0.83, + "grad_norm": 1.3605628873357631, + "learning_rate": 7.240029635963281e-07, + "loss": 0.2714, + "step": 28677 + }, + { + "epoch": 0.83, + "grad_norm": 1.2296169488805666, + "learning_rate": 7.237595334041414e-07, + "loss": 0.2725, + "step": 28678 + }, + { + "epoch": 0.83, + "grad_norm": 1.6490187310442448, + "learning_rate": 7.235161409498248e-07, + "loss": 0.2636, + "step": 28679 + }, + { + "epoch": 0.83, + "grad_norm": 1.281159307411321, + "learning_rate": 7.232727862355254e-07, + "loss": 0.2507, + "step": 28680 + }, + { + "epoch": 0.83, + "grad_norm": 1.3766264981745417, + "learning_rate": 7.230294692633922e-07, + "loss": 0.2574, + "step": 28681 + }, + { + "epoch": 0.83, + "grad_norm": 1.4064370605807546, + "learning_rate": 7.227861900355699e-07, + "loss": 0.2863, + "step": 28682 + }, + { + "epoch": 0.83, + "grad_norm": 1.5074696449320384, + "learning_rate": 7.225429485542074e-07, + "loss": 0.2853, + "step": 28683 + }, + { + "epoch": 0.83, + "grad_norm": 1.543759231271752, + "learning_rate": 7.222997448214508e-07, + "loss": 0.2615, + "step": 28684 + }, + { + "epoch": 0.83, + "grad_norm": 1.4727081051241748, + "learning_rate": 7.220565788394463e-07, + "loss": 0.2802, + "step": 28685 + }, + { + "epoch": 0.83, + "grad_norm": 1.3290338504735142, + "learning_rate": 7.218134506103403e-07, + "loss": 0.271, + "step": 28686 + }, + { + "epoch": 0.83, + "grad_norm": 1.2441815600952033, + "learning_rate": 7.215703601362773e-07, + "loss": 0.2576, + "step": 28687 + }, + { + "epoch": 0.83, + "grad_norm": 1.6282178653008237, + "learning_rate": 7.213273074194039e-07, + "loss": 0.2849, + "step": 28688 + }, + { + "epoch": 0.83, + "grad_norm": 5.261913800199595, + "learning_rate": 7.210842924618655e-07, + "loss": 0.2691, + "step": 28689 + }, + { + "epoch": 0.83, + "grad_norm": 2.2187700761810074, + "learning_rate": 7.208413152658044e-07, + "loss": 0.2649, + "step": 28690 + }, + { + "epoch": 0.83, + "grad_norm": 7.527630204079885, + "learning_rate": 7.205983758333662e-07, + "loss": 0.2913, + "step": 28691 + }, + { + "epoch": 0.83, + "grad_norm": 1.4411535687773358, + "learning_rate": 7.203554741666952e-07, + "loss": 0.286, + "step": 28692 + }, + { + "epoch": 0.83, + "grad_norm": 1.4768471189926082, + "learning_rate": 7.201126102679345e-07, + "loss": 0.2731, + "step": 28693 + }, + { + "epoch": 0.83, + "grad_norm": 1.2515637418712242, + "learning_rate": 7.198697841392272e-07, + "loss": 0.2663, + "step": 28694 + }, + { + "epoch": 0.83, + "grad_norm": 2.7459857191773143, + "learning_rate": 7.196269957827167e-07, + "loss": 0.2951, + "step": 28695 + }, + { + "epoch": 0.83, + "grad_norm": 1.306509529608488, + "learning_rate": 7.193842452005451e-07, + "loss": 0.2676, + "step": 28696 + }, + { + "epoch": 0.83, + "grad_norm": 1.3148416119372999, + "learning_rate": 7.191415323948553e-07, + "loss": 0.2555, + "step": 28697 + }, + { + "epoch": 0.83, + "grad_norm": 1.4852709449294512, + "learning_rate": 7.188988573677902e-07, + "loss": 0.2834, + "step": 28698 + }, + { + "epoch": 0.83, + "grad_norm": 1.415756496730068, + "learning_rate": 7.186562201214892e-07, + "loss": 0.3393, + "step": 28699 + }, + { + "epoch": 0.83, + "grad_norm": 1.3551690968312209, + "learning_rate": 7.184136206580943e-07, + "loss": 0.2818, + "step": 28700 + }, + { + "epoch": 0.83, + "grad_norm": 1.1860536803401676, + "learning_rate": 7.181710589797464e-07, + "loss": 0.2811, + "step": 28701 + }, + { + "epoch": 0.83, + "grad_norm": 1.4505809041550424, + "learning_rate": 7.179285350885878e-07, + "loss": 0.2776, + "step": 28702 + }, + { + "epoch": 0.83, + "grad_norm": 1.280655400830439, + "learning_rate": 7.176860489867566e-07, + "loss": 0.2964, + "step": 28703 + }, + { + "epoch": 0.83, + "grad_norm": 1.3973814334621621, + "learning_rate": 7.17443600676393e-07, + "loss": 0.2558, + "step": 28704 + }, + { + "epoch": 0.83, + "grad_norm": 1.489322500522723, + "learning_rate": 7.172011901596376e-07, + "loss": 0.2597, + "step": 28705 + }, + { + "epoch": 0.83, + "grad_norm": 2.167693786038895, + "learning_rate": 7.169588174386288e-07, + "loss": 0.2616, + "step": 28706 + }, + { + "epoch": 0.83, + "grad_norm": 1.427810380251516, + "learning_rate": 7.16716482515506e-07, + "loss": 0.2719, + "step": 28707 + }, + { + "epoch": 0.83, + "grad_norm": 1.312848159562188, + "learning_rate": 7.16474185392409e-07, + "loss": 0.2602, + "step": 28708 + }, + { + "epoch": 0.83, + "grad_norm": 1.3686004541160284, + "learning_rate": 7.162319260714739e-07, + "loss": 0.2415, + "step": 28709 + }, + { + "epoch": 0.83, + "grad_norm": 1.2719101003057927, + "learning_rate": 7.159897045548392e-07, + "loss": 0.2642, + "step": 28710 + }, + { + "epoch": 0.83, + "grad_norm": 1.402796095730677, + "learning_rate": 7.157475208446435e-07, + "loss": 0.2639, + "step": 28711 + }, + { + "epoch": 0.83, + "grad_norm": 1.5646712748760645, + "learning_rate": 7.155053749430229e-07, + "loss": 0.2574, + "step": 28712 + }, + { + "epoch": 0.83, + "grad_norm": 1.2517971067819913, + "learning_rate": 7.152632668521148e-07, + "loss": 0.2702, + "step": 28713 + }, + { + "epoch": 0.83, + "grad_norm": 1.6455748034381816, + "learning_rate": 7.150211965740561e-07, + "loss": 0.2673, + "step": 28714 + }, + { + "epoch": 0.83, + "grad_norm": 1.4782108123062832, + "learning_rate": 7.147791641109831e-07, + "loss": 0.2699, + "step": 28715 + }, + { + "epoch": 0.83, + "grad_norm": 1.3406034743931843, + "learning_rate": 7.145371694650311e-07, + "loss": 0.2657, + "step": 28716 + }, + { + "epoch": 0.83, + "grad_norm": 1.4876887211999965, + "learning_rate": 7.142952126383373e-07, + "loss": 0.2708, + "step": 28717 + }, + { + "epoch": 0.83, + "grad_norm": 1.3899005398891582, + "learning_rate": 7.140532936330352e-07, + "loss": 0.304, + "step": 28718 + }, + { + "epoch": 0.83, + "grad_norm": 1.5952857389631254, + "learning_rate": 7.138114124512596e-07, + "loss": 0.2576, + "step": 28719 + }, + { + "epoch": 0.83, + "grad_norm": 1.388761701557108, + "learning_rate": 7.135695690951467e-07, + "loss": 0.2614, + "step": 28720 + }, + { + "epoch": 0.83, + "grad_norm": 1.3527454476338248, + "learning_rate": 7.13327763566829e-07, + "loss": 0.2785, + "step": 28721 + }, + { + "epoch": 0.83, + "grad_norm": 1.3186567097030335, + "learning_rate": 7.130859958684416e-07, + "loss": 0.2526, + "step": 28722 + }, + { + "epoch": 0.83, + "grad_norm": 1.18483676904159, + "learning_rate": 7.128442660021178e-07, + "loss": 0.2548, + "step": 28723 + }, + { + "epoch": 0.83, + "grad_norm": 1.4409572954466228, + "learning_rate": 7.126025739699915e-07, + "loss": 0.288, + "step": 28724 + }, + { + "epoch": 0.83, + "grad_norm": 1.3132656338113522, + "learning_rate": 7.123609197741954e-07, + "loss": 0.2623, + "step": 28725 + }, + { + "epoch": 0.83, + "grad_norm": 1.32559356674306, + "learning_rate": 7.121193034168611e-07, + "loss": 0.2753, + "step": 28726 + }, + { + "epoch": 0.83, + "grad_norm": 1.613728958877025, + "learning_rate": 7.118777249001213e-07, + "loss": 0.2479, + "step": 28727 + }, + { + "epoch": 0.83, + "grad_norm": 1.8230085799713311, + "learning_rate": 7.116361842261082e-07, + "loss": 0.2646, + "step": 28728 + }, + { + "epoch": 0.83, + "grad_norm": 1.3151048020333738, + "learning_rate": 7.113946813969536e-07, + "loss": 0.2521, + "step": 28729 + }, + { + "epoch": 0.83, + "grad_norm": 1.4049143271147047, + "learning_rate": 7.111532164147883e-07, + "loss": 0.2689, + "step": 28730 + }, + { + "epoch": 0.83, + "grad_norm": 1.3631285075812511, + "learning_rate": 7.109117892817447e-07, + "loss": 0.2821, + "step": 28731 + }, + { + "epoch": 0.83, + "grad_norm": 1.4393193382943763, + "learning_rate": 7.106703999999509e-07, + "loss": 0.2858, + "step": 28732 + }, + { + "epoch": 0.83, + "grad_norm": 1.9269648240769937, + "learning_rate": 7.104290485715382e-07, + "loss": 0.2576, + "step": 28733 + }, + { + "epoch": 0.83, + "grad_norm": 1.3290613961616724, + "learning_rate": 7.101877349986369e-07, + "loss": 0.2788, + "step": 28734 + }, + { + "epoch": 0.83, + "grad_norm": 1.347694090802423, + "learning_rate": 7.099464592833777e-07, + "loss": 0.2566, + "step": 28735 + }, + { + "epoch": 0.83, + "grad_norm": 1.4987727512652946, + "learning_rate": 7.097052214278871e-07, + "loss": 0.2623, + "step": 28736 + }, + { + "epoch": 0.83, + "grad_norm": 1.274968029878619, + "learning_rate": 7.094640214342957e-07, + "loss": 0.2635, + "step": 28737 + }, + { + "epoch": 0.83, + "grad_norm": 1.449380709150748, + "learning_rate": 7.092228593047323e-07, + "loss": 0.3185, + "step": 28738 + }, + { + "epoch": 0.83, + "grad_norm": 1.3096057336534823, + "learning_rate": 7.089817350413241e-07, + "loss": 0.2766, + "step": 28739 + }, + { + "epoch": 0.83, + "grad_norm": 1.4193597152559563, + "learning_rate": 7.087406486462001e-07, + "loss": 0.2995, + "step": 28740 + }, + { + "epoch": 0.83, + "grad_norm": 1.236249999010233, + "learning_rate": 7.084996001214872e-07, + "loss": 0.2624, + "step": 28741 + }, + { + "epoch": 0.83, + "grad_norm": 1.3073767106458634, + "learning_rate": 7.082585894693134e-07, + "loss": 0.2692, + "step": 28742 + }, + { + "epoch": 0.83, + "grad_norm": 1.3944089310015217, + "learning_rate": 7.080176166918052e-07, + "loss": 0.2442, + "step": 28743 + }, + { + "epoch": 0.83, + "grad_norm": 1.3682842855080357, + "learning_rate": 7.077766817910897e-07, + "loss": 0.261, + "step": 28744 + }, + { + "epoch": 0.83, + "grad_norm": 2.1853438746761342, + "learning_rate": 7.075357847692915e-07, + "loss": 0.2741, + "step": 28745 + }, + { + "epoch": 0.83, + "grad_norm": 1.2829564047727073, + "learning_rate": 7.072949256285377e-07, + "loss": 0.2899, + "step": 28746 + }, + { + "epoch": 0.83, + "grad_norm": 1.318827925872789, + "learning_rate": 7.070541043709544e-07, + "loss": 0.2717, + "step": 28747 + }, + { + "epoch": 0.83, + "grad_norm": 1.3743488792979457, + "learning_rate": 7.068133209986655e-07, + "loss": 0.2801, + "step": 28748 + }, + { + "epoch": 0.83, + "grad_norm": 1.5074955932712084, + "learning_rate": 7.06572575513797e-07, + "loss": 0.2603, + "step": 28749 + }, + { + "epoch": 0.83, + "grad_norm": 1.320288573572169, + "learning_rate": 7.063318679184733e-07, + "loss": 0.2698, + "step": 28750 + }, + { + "epoch": 0.83, + "grad_norm": 1.457767095934226, + "learning_rate": 7.060911982148183e-07, + "loss": 0.2923, + "step": 28751 + }, + { + "epoch": 0.83, + "grad_norm": 1.312903018599987, + "learning_rate": 7.058505664049559e-07, + "loss": 0.271, + "step": 28752 + }, + { + "epoch": 0.83, + "grad_norm": 1.4185337379227427, + "learning_rate": 7.056099724910115e-07, + "loss": 0.2844, + "step": 28753 + }, + { + "epoch": 0.83, + "grad_norm": 1.6212126618610603, + "learning_rate": 7.053694164751052e-07, + "loss": 0.2887, + "step": 28754 + }, + { + "epoch": 0.83, + "grad_norm": 1.2425768639516381, + "learning_rate": 7.051288983593618e-07, + "loss": 0.2651, + "step": 28755 + }, + { + "epoch": 0.83, + "grad_norm": 1.3139831460321953, + "learning_rate": 7.048884181459032e-07, + "loss": 0.2601, + "step": 28756 + }, + { + "epoch": 0.83, + "grad_norm": 1.6457029016487648, + "learning_rate": 7.046479758368519e-07, + "loss": 0.2799, + "step": 28757 + }, + { + "epoch": 0.83, + "grad_norm": 1.6268463287113317, + "learning_rate": 7.044075714343301e-07, + "loss": 0.2871, + "step": 28758 + }, + { + "epoch": 0.83, + "grad_norm": 1.6590921373943623, + "learning_rate": 7.041672049404602e-07, + "loss": 0.2795, + "step": 28759 + }, + { + "epoch": 0.83, + "grad_norm": 1.2293278027963657, + "learning_rate": 7.039268763573614e-07, + "loss": 0.2744, + "step": 28760 + }, + { + "epoch": 0.83, + "grad_norm": 1.3637683453001757, + "learning_rate": 7.03686585687155e-07, + "loss": 0.2764, + "step": 28761 + }, + { + "epoch": 0.83, + "grad_norm": 1.3786702952838763, + "learning_rate": 7.03446332931963e-07, + "loss": 0.2662, + "step": 28762 + }, + { + "epoch": 0.83, + "grad_norm": 1.3102595897802598, + "learning_rate": 7.032061180939054e-07, + "loss": 0.2677, + "step": 28763 + }, + { + "epoch": 0.83, + "grad_norm": 1.4497536966476154, + "learning_rate": 7.029659411751e-07, + "loss": 0.2692, + "step": 28764 + }, + { + "epoch": 0.83, + "grad_norm": 1.5611835696443257, + "learning_rate": 7.027258021776684e-07, + "loss": 0.2791, + "step": 28765 + }, + { + "epoch": 0.83, + "grad_norm": 1.5013543297218537, + "learning_rate": 7.02485701103729e-07, + "loss": 0.2663, + "step": 28766 + }, + { + "epoch": 0.83, + "grad_norm": 1.300411783613039, + "learning_rate": 7.02245637955401e-07, + "loss": 0.2607, + "step": 28767 + }, + { + "epoch": 0.83, + "grad_norm": 1.4049312864274122, + "learning_rate": 7.020056127348035e-07, + "loss": 0.2771, + "step": 28768 + }, + { + "epoch": 0.83, + "grad_norm": 1.2828185521417228, + "learning_rate": 7.017656254440536e-07, + "loss": 0.2502, + "step": 28769 + }, + { + "epoch": 0.83, + "grad_norm": 1.3464642506500066, + "learning_rate": 7.015256760852696e-07, + "loss": 0.2801, + "step": 28770 + }, + { + "epoch": 0.83, + "grad_norm": 1.520575561247006, + "learning_rate": 7.012857646605703e-07, + "loss": 0.3071, + "step": 28771 + }, + { + "epoch": 0.83, + "grad_norm": 1.3961352417982187, + "learning_rate": 7.010458911720708e-07, + "loss": 0.2697, + "step": 28772 + }, + { + "epoch": 0.83, + "grad_norm": 1.2993664414224941, + "learning_rate": 7.008060556218893e-07, + "loss": 0.2721, + "step": 28773 + }, + { + "epoch": 0.83, + "grad_norm": 1.337370663319547, + "learning_rate": 7.005662580121419e-07, + "loss": 0.2563, + "step": 28774 + }, + { + "epoch": 0.83, + "grad_norm": 1.3018718691370146, + "learning_rate": 7.003264983449448e-07, + "loss": 0.265, + "step": 28775 + }, + { + "epoch": 0.83, + "grad_norm": 1.4286597366497773, + "learning_rate": 7.00086776622414e-07, + "loss": 0.2976, + "step": 28776 + }, + { + "epoch": 0.83, + "grad_norm": 1.2753258766956008, + "learning_rate": 6.998470928466655e-07, + "loss": 0.2586, + "step": 28777 + }, + { + "epoch": 0.83, + "grad_norm": 1.4994512832623754, + "learning_rate": 6.996074470198139e-07, + "loss": 0.2768, + "step": 28778 + }, + { + "epoch": 0.83, + "grad_norm": 1.4541401952183763, + "learning_rate": 6.993678391439739e-07, + "loss": 0.2785, + "step": 28779 + }, + { + "epoch": 0.83, + "grad_norm": 2.0538424686539893, + "learning_rate": 6.99128269221262e-07, + "loss": 0.2831, + "step": 28780 + }, + { + "epoch": 0.83, + "grad_norm": 1.3360190287122902, + "learning_rate": 6.988887372537895e-07, + "loss": 0.2761, + "step": 28781 + }, + { + "epoch": 0.83, + "grad_norm": 1.3922548899558473, + "learning_rate": 6.98649243243672e-07, + "loss": 0.2989, + "step": 28782 + }, + { + "epoch": 0.83, + "grad_norm": 1.5060360127255985, + "learning_rate": 6.984097871930223e-07, + "loss": 0.2801, + "step": 28783 + }, + { + "epoch": 0.83, + "grad_norm": 1.4117327618914834, + "learning_rate": 6.981703691039537e-07, + "loss": 0.2601, + "step": 28784 + }, + { + "epoch": 0.83, + "grad_norm": 1.4764085278234789, + "learning_rate": 6.979309889785801e-07, + "loss": 0.278, + "step": 28785 + }, + { + "epoch": 0.83, + "grad_norm": 1.5124397037157191, + "learning_rate": 6.976916468190126e-07, + "loss": 0.2476, + "step": 28786 + }, + { + "epoch": 0.83, + "grad_norm": 1.212150727284419, + "learning_rate": 6.974523426273654e-07, + "loss": 0.2552, + "step": 28787 + }, + { + "epoch": 0.83, + "grad_norm": 1.5058680498323078, + "learning_rate": 6.972130764057478e-07, + "loss": 0.2817, + "step": 28788 + }, + { + "epoch": 0.84, + "grad_norm": 1.504080892333326, + "learning_rate": 6.969738481562727e-07, + "loss": 0.3006, + "step": 28789 + }, + { + "epoch": 0.84, + "grad_norm": 1.3533978286099546, + "learning_rate": 6.967346578810519e-07, + "loss": 0.3028, + "step": 28790 + }, + { + "epoch": 0.84, + "grad_norm": 1.2445542607181, + "learning_rate": 6.964955055821948e-07, + "loss": 0.2611, + "step": 28791 + }, + { + "epoch": 0.84, + "grad_norm": 1.6416290130941387, + "learning_rate": 6.96256391261812e-07, + "loss": 0.2929, + "step": 28792 + }, + { + "epoch": 0.84, + "grad_norm": 1.2792973696653982, + "learning_rate": 6.960173149220145e-07, + "loss": 0.2862, + "step": 28793 + }, + { + "epoch": 0.84, + "grad_norm": 1.2277835333306275, + "learning_rate": 6.957782765649124e-07, + "loss": 0.2691, + "step": 28794 + }, + { + "epoch": 0.84, + "grad_norm": 1.2669174861686627, + "learning_rate": 6.955392761926145e-07, + "loss": 0.2962, + "step": 28795 + }, + { + "epoch": 0.84, + "grad_norm": 1.7073748971159044, + "learning_rate": 6.9530031380723e-07, + "loss": 0.2695, + "step": 28796 + }, + { + "epoch": 0.84, + "grad_norm": 1.7903643611725255, + "learning_rate": 6.950613894108682e-07, + "loss": 0.2652, + "step": 28797 + }, + { + "epoch": 0.84, + "grad_norm": 1.6288050079688376, + "learning_rate": 6.948225030056377e-07, + "loss": 0.2728, + "step": 28798 + }, + { + "epoch": 0.84, + "grad_norm": 1.2949398814586526, + "learning_rate": 6.945836545936468e-07, + "loss": 0.256, + "step": 28799 + }, + { + "epoch": 0.84, + "grad_norm": 7.453869006685182, + "learning_rate": 6.943448441770024e-07, + "loss": 0.2678, + "step": 28800 + }, + { + "epoch": 0.84, + "grad_norm": 1.4766754368679453, + "learning_rate": 6.941060717578118e-07, + "loss": 0.2851, + "step": 28801 + }, + { + "epoch": 0.84, + "grad_norm": 1.4292974788002943, + "learning_rate": 6.938673373381838e-07, + "loss": 0.2927, + "step": 28802 + }, + { + "epoch": 0.84, + "grad_norm": 1.4181709413000698, + "learning_rate": 6.936286409202236e-07, + "loss": 0.2956, + "step": 28803 + }, + { + "epoch": 0.84, + "grad_norm": 0.9377529903058051, + "learning_rate": 6.933899825060386e-07, + "loss": 0.5704, + "step": 28804 + }, + { + "epoch": 0.84, + "grad_norm": 1.3248913672887956, + "learning_rate": 6.931513620977354e-07, + "loss": 0.2623, + "step": 28805 + }, + { + "epoch": 0.84, + "grad_norm": 1.4929108207929636, + "learning_rate": 6.929127796974183e-07, + "loss": 0.2879, + "step": 28806 + }, + { + "epoch": 0.84, + "grad_norm": 1.2380646678978078, + "learning_rate": 6.926742353071952e-07, + "loss": 0.2629, + "step": 28807 + }, + { + "epoch": 0.84, + "grad_norm": 1.2147835639119948, + "learning_rate": 6.924357289291689e-07, + "loss": 0.2885, + "step": 28808 + }, + { + "epoch": 0.84, + "grad_norm": 1.593917917850811, + "learning_rate": 6.921972605654448e-07, + "loss": 0.2602, + "step": 28809 + }, + { + "epoch": 0.84, + "grad_norm": 1.358595029841865, + "learning_rate": 6.919588302181273e-07, + "loss": 0.2536, + "step": 28810 + }, + { + "epoch": 0.84, + "grad_norm": 1.3076056717498885, + "learning_rate": 6.917204378893216e-07, + "loss": 0.2722, + "step": 28811 + }, + { + "epoch": 0.84, + "grad_norm": 1.2876772445404245, + "learning_rate": 6.914820835811303e-07, + "loss": 0.2633, + "step": 28812 + }, + { + "epoch": 0.84, + "grad_norm": 1.4583592213438132, + "learning_rate": 6.912437672956579e-07, + "loss": 0.2799, + "step": 28813 + }, + { + "epoch": 0.84, + "grad_norm": 1.2345757585380879, + "learning_rate": 6.910054890350065e-07, + "loss": 0.2722, + "step": 28814 + }, + { + "epoch": 0.84, + "grad_norm": 1.3667566905245165, + "learning_rate": 6.907672488012807e-07, + "loss": 0.272, + "step": 28815 + }, + { + "epoch": 0.84, + "grad_norm": 1.3831033449425052, + "learning_rate": 6.905290465965808e-07, + "loss": 0.2808, + "step": 28816 + }, + { + "epoch": 0.84, + "grad_norm": 1.340519948141601, + "learning_rate": 6.902908824230098e-07, + "loss": 0.262, + "step": 28817 + }, + { + "epoch": 0.84, + "grad_norm": 1.4674863995321659, + "learning_rate": 6.900527562826709e-07, + "loss": 0.2956, + "step": 28818 + }, + { + "epoch": 0.84, + "grad_norm": 1.7505832187166102, + "learning_rate": 6.898146681776629e-07, + "loss": 0.2559, + "step": 28819 + }, + { + "epoch": 0.84, + "grad_norm": 1.4295982022347868, + "learning_rate": 6.895766181100883e-07, + "loss": 0.2972, + "step": 28820 + }, + { + "epoch": 0.84, + "grad_norm": 1.5071300761679136, + "learning_rate": 6.893386060820478e-07, + "loss": 0.2726, + "step": 28821 + }, + { + "epoch": 0.84, + "grad_norm": 1.3535756224447653, + "learning_rate": 6.891006320956423e-07, + "loss": 0.2872, + "step": 28822 + }, + { + "epoch": 0.84, + "grad_norm": 1.3732203699303338, + "learning_rate": 6.88862696152971e-07, + "loss": 0.2552, + "step": 28823 + }, + { + "epoch": 0.84, + "grad_norm": 1.1752592059190308, + "learning_rate": 6.886247982561345e-07, + "loss": 0.2603, + "step": 28824 + }, + { + "epoch": 0.84, + "grad_norm": 1.5066331632741954, + "learning_rate": 6.883869384072323e-07, + "loss": 0.269, + "step": 28825 + }, + { + "epoch": 0.84, + "grad_norm": 1.5254969500586346, + "learning_rate": 6.881491166083637e-07, + "loss": 0.2465, + "step": 28826 + }, + { + "epoch": 0.84, + "grad_norm": 1.5089045326933974, + "learning_rate": 6.87911332861626e-07, + "loss": 0.2922, + "step": 28827 + }, + { + "epoch": 0.84, + "grad_norm": 1.2914015303979025, + "learning_rate": 6.876735871691187e-07, + "loss": 0.2682, + "step": 28828 + }, + { + "epoch": 0.84, + "grad_norm": 1.4830795324209758, + "learning_rate": 6.8743587953294e-07, + "loss": 0.2792, + "step": 28829 + }, + { + "epoch": 0.84, + "grad_norm": 1.4894456715515483, + "learning_rate": 6.871982099551878e-07, + "loss": 0.2951, + "step": 28830 + }, + { + "epoch": 0.84, + "grad_norm": 1.37989059960337, + "learning_rate": 6.869605784379585e-07, + "loss": 0.2773, + "step": 28831 + }, + { + "epoch": 0.84, + "grad_norm": 1.2590261518846801, + "learning_rate": 6.867229849833501e-07, + "loss": 0.2538, + "step": 28832 + }, + { + "epoch": 0.84, + "grad_norm": 1.9848952178518666, + "learning_rate": 6.864854295934598e-07, + "loss": 0.2619, + "step": 28833 + }, + { + "epoch": 0.84, + "grad_norm": 1.4791300847918882, + "learning_rate": 6.86247912270383e-07, + "loss": 0.2696, + "step": 28834 + }, + { + "epoch": 0.84, + "grad_norm": 1.008157328925472, + "learning_rate": 6.860104330162171e-07, + "loss": 0.595, + "step": 28835 + }, + { + "epoch": 0.84, + "grad_norm": 1.3082797535448996, + "learning_rate": 6.857729918330563e-07, + "loss": 0.2797, + "step": 28836 + }, + { + "epoch": 0.84, + "grad_norm": 1.3764665782036576, + "learning_rate": 6.855355887229964e-07, + "loss": 0.2909, + "step": 28837 + }, + { + "epoch": 0.84, + "grad_norm": 1.2296661308157906, + "learning_rate": 6.852982236881328e-07, + "loss": 0.2519, + "step": 28838 + }, + { + "epoch": 0.84, + "grad_norm": 1.345976472346628, + "learning_rate": 6.850608967305606e-07, + "loss": 0.279, + "step": 28839 + }, + { + "epoch": 0.84, + "grad_norm": 1.4536842454597976, + "learning_rate": 6.848236078523735e-07, + "loss": 0.2712, + "step": 28840 + }, + { + "epoch": 0.84, + "grad_norm": 1.249730031107578, + "learning_rate": 6.84586357055666e-07, + "loss": 0.288, + "step": 28841 + }, + { + "epoch": 0.84, + "grad_norm": 1.292646680070307, + "learning_rate": 6.843491443425321e-07, + "loss": 0.2636, + "step": 28842 + }, + { + "epoch": 0.84, + "grad_norm": 1.338230325165094, + "learning_rate": 6.841119697150656e-07, + "loss": 0.2609, + "step": 28843 + }, + { + "epoch": 0.84, + "grad_norm": 1.3000135023439436, + "learning_rate": 6.838748331753581e-07, + "loss": 0.2876, + "step": 28844 + }, + { + "epoch": 0.84, + "grad_norm": 1.5887776906633504, + "learning_rate": 6.836377347255035e-07, + "loss": 0.2566, + "step": 28845 + }, + { + "epoch": 0.84, + "grad_norm": 1.329204380423975, + "learning_rate": 6.834006743675936e-07, + "loss": 0.2475, + "step": 28846 + }, + { + "epoch": 0.84, + "grad_norm": 0.9087006476844081, + "learning_rate": 6.8316365210372e-07, + "loss": 0.5757, + "step": 28847 + }, + { + "epoch": 0.84, + "grad_norm": 1.764357007894429, + "learning_rate": 6.829266679359752e-07, + "loss": 0.2828, + "step": 28848 + }, + { + "epoch": 0.84, + "grad_norm": 3.308756184786211, + "learning_rate": 6.826897218664503e-07, + "loss": 0.3057, + "step": 28849 + }, + { + "epoch": 0.84, + "grad_norm": 1.3498764110644572, + "learning_rate": 6.82452813897237e-07, + "loss": 0.2664, + "step": 28850 + }, + { + "epoch": 0.84, + "grad_norm": 1.3098622530765889, + "learning_rate": 6.822159440304249e-07, + "loss": 0.2702, + "step": 28851 + }, + { + "epoch": 0.84, + "grad_norm": 1.6231362231685902, + "learning_rate": 6.819791122681052e-07, + "loss": 0.2834, + "step": 28852 + }, + { + "epoch": 0.84, + "grad_norm": 1.4108750194199782, + "learning_rate": 6.817423186123678e-07, + "loss": 0.283, + "step": 28853 + }, + { + "epoch": 0.84, + "grad_norm": 1.3848634334505954, + "learning_rate": 6.815055630653034e-07, + "loss": 0.2603, + "step": 28854 + }, + { + "epoch": 0.84, + "grad_norm": 1.509066668483601, + "learning_rate": 6.812688456289996e-07, + "loss": 0.2939, + "step": 28855 + }, + { + "epoch": 0.84, + "grad_norm": 1.4722393442794885, + "learning_rate": 6.810321663055452e-07, + "loss": 0.2749, + "step": 28856 + }, + { + "epoch": 0.84, + "grad_norm": 1.3432811875033956, + "learning_rate": 6.807955250970305e-07, + "loss": 0.2746, + "step": 28857 + }, + { + "epoch": 0.84, + "grad_norm": 1.597162297348007, + "learning_rate": 6.805589220055437e-07, + "loss": 0.2845, + "step": 28858 + }, + { + "epoch": 0.84, + "grad_norm": 1.3732675779825692, + "learning_rate": 6.803223570331713e-07, + "loss": 0.2838, + "step": 28859 + }, + { + "epoch": 0.84, + "grad_norm": 1.3684955038267592, + "learning_rate": 6.800858301820029e-07, + "loss": 0.2967, + "step": 28860 + }, + { + "epoch": 0.84, + "grad_norm": 1.4670310507917155, + "learning_rate": 6.798493414541241e-07, + "loss": 0.2822, + "step": 28861 + }, + { + "epoch": 0.84, + "grad_norm": 1.373677223624582, + "learning_rate": 6.796128908516247e-07, + "loss": 0.2524, + "step": 28862 + }, + { + "epoch": 0.84, + "grad_norm": 1.3802567454434576, + "learning_rate": 6.793764783765877e-07, + "loss": 0.2925, + "step": 28863 + }, + { + "epoch": 0.84, + "grad_norm": 1.283190563505586, + "learning_rate": 6.791401040311019e-07, + "loss": 0.2699, + "step": 28864 + }, + { + "epoch": 0.84, + "grad_norm": 1.388271945166765, + "learning_rate": 6.789037678172522e-07, + "loss": 0.303, + "step": 28865 + }, + { + "epoch": 0.84, + "grad_norm": 1.2457754422936187, + "learning_rate": 6.786674697371248e-07, + "loss": 0.2472, + "step": 28866 + }, + { + "epoch": 0.84, + "grad_norm": 1.3718785214720768, + "learning_rate": 6.784312097928048e-07, + "loss": 0.2741, + "step": 28867 + }, + { + "epoch": 0.84, + "grad_norm": 1.3359157752918405, + "learning_rate": 6.781949879863769e-07, + "loss": 0.2674, + "step": 28868 + }, + { + "epoch": 0.84, + "grad_norm": 0.9920321883217694, + "learning_rate": 6.77958804319927e-07, + "loss": 0.5761, + "step": 28869 + }, + { + "epoch": 0.84, + "grad_norm": 1.2626363303975594, + "learning_rate": 6.777226587955382e-07, + "loss": 0.2737, + "step": 28870 + }, + { + "epoch": 0.84, + "grad_norm": 1.4913341386883718, + "learning_rate": 6.774865514152962e-07, + "loss": 0.2774, + "step": 28871 + }, + { + "epoch": 0.84, + "grad_norm": 1.6554420970659345, + "learning_rate": 6.77250482181282e-07, + "loss": 0.2964, + "step": 28872 + }, + { + "epoch": 0.84, + "grad_norm": 1.2970211454111702, + "learning_rate": 6.770144510955812e-07, + "loss": 0.2623, + "step": 28873 + }, + { + "epoch": 0.84, + "grad_norm": 1.3926145712743174, + "learning_rate": 6.767784581602749e-07, + "loss": 0.2746, + "step": 28874 + }, + { + "epoch": 0.84, + "grad_norm": 1.2999176250508708, + "learning_rate": 6.76542503377447e-07, + "loss": 0.2771, + "step": 28875 + }, + { + "epoch": 0.84, + "grad_norm": 1.2021604845977005, + "learning_rate": 6.763065867491791e-07, + "loss": 0.2723, + "step": 28876 + }, + { + "epoch": 0.84, + "grad_norm": 1.4526114178503127, + "learning_rate": 6.760707082775537e-07, + "loss": 0.2745, + "step": 28877 + }, + { + "epoch": 0.84, + "grad_norm": 1.3476429375416075, + "learning_rate": 6.758348679646521e-07, + "loss": 0.2723, + "step": 28878 + }, + { + "epoch": 0.84, + "grad_norm": 1.2440708382095795, + "learning_rate": 6.755990658125556e-07, + "loss": 0.2693, + "step": 28879 + }, + { + "epoch": 0.84, + "grad_norm": 1.4201994420647492, + "learning_rate": 6.753633018233458e-07, + "loss": 0.2589, + "step": 28880 + }, + { + "epoch": 0.84, + "grad_norm": 1.7605067236889052, + "learning_rate": 6.751275759991033e-07, + "loss": 0.2982, + "step": 28881 + }, + { + "epoch": 0.84, + "grad_norm": 1.6384439252363054, + "learning_rate": 6.748918883419076e-07, + "loss": 0.289, + "step": 28882 + }, + { + "epoch": 0.84, + "grad_norm": 1.3479289369425937, + "learning_rate": 6.746562388538381e-07, + "loss": 0.2626, + "step": 28883 + }, + { + "epoch": 0.84, + "grad_norm": 1.433952298646974, + "learning_rate": 6.744206275369758e-07, + "loss": 0.2709, + "step": 28884 + }, + { + "epoch": 0.84, + "grad_norm": 1.3304250482841153, + "learning_rate": 6.741850543933997e-07, + "loss": 0.2561, + "step": 28885 + }, + { + "epoch": 0.84, + "grad_norm": 1.2976419157661203, + "learning_rate": 6.739495194251877e-07, + "loss": 0.2874, + "step": 28886 + }, + { + "epoch": 0.84, + "grad_norm": 2.8532963820041033, + "learning_rate": 6.737140226344197e-07, + "loss": 0.3191, + "step": 28887 + }, + { + "epoch": 0.84, + "grad_norm": 1.2437734457836729, + "learning_rate": 6.734785640231734e-07, + "loss": 0.2642, + "step": 28888 + }, + { + "epoch": 0.84, + "grad_norm": 1.3285425881968127, + "learning_rate": 6.732431435935272e-07, + "loss": 0.2679, + "step": 28889 + }, + { + "epoch": 0.84, + "grad_norm": 1.2997900450896527, + "learning_rate": 6.730077613475588e-07, + "loss": 0.2742, + "step": 28890 + }, + { + "epoch": 0.84, + "grad_norm": 1.418609248671266, + "learning_rate": 6.72772417287344e-07, + "loss": 0.2673, + "step": 28891 + }, + { + "epoch": 0.84, + "grad_norm": 1.4721467485215336, + "learning_rate": 6.725371114149603e-07, + "loss": 0.2702, + "step": 28892 + }, + { + "epoch": 0.84, + "grad_norm": 1.872404979706211, + "learning_rate": 6.723018437324852e-07, + "loss": 0.2628, + "step": 28893 + }, + { + "epoch": 0.84, + "grad_norm": 0.9610737611142014, + "learning_rate": 6.720666142419935e-07, + "loss": 0.5722, + "step": 28894 + }, + { + "epoch": 0.84, + "grad_norm": 1.3484417568226403, + "learning_rate": 6.718314229455625e-07, + "loss": 0.2767, + "step": 28895 + }, + { + "epoch": 0.84, + "grad_norm": 1.3691774876318552, + "learning_rate": 6.715962698452672e-07, + "loss": 0.2781, + "step": 28896 + }, + { + "epoch": 0.84, + "grad_norm": 1.31281535496181, + "learning_rate": 6.713611549431826e-07, + "loss": 0.2773, + "step": 28897 + }, + { + "epoch": 0.84, + "grad_norm": 1.4245402894896853, + "learning_rate": 6.711260782413853e-07, + "loss": 0.2735, + "step": 28898 + }, + { + "epoch": 0.84, + "grad_norm": 1.4213187511928944, + "learning_rate": 6.708910397419472e-07, + "loss": 0.2868, + "step": 28899 + }, + { + "epoch": 0.84, + "grad_norm": 1.3900378883429105, + "learning_rate": 6.706560394469446e-07, + "loss": 0.2799, + "step": 28900 + }, + { + "epoch": 0.84, + "grad_norm": 2.07360834215844, + "learning_rate": 6.704210773584496e-07, + "loss": 0.2787, + "step": 28901 + }, + { + "epoch": 0.84, + "grad_norm": 1.6241672310778916, + "learning_rate": 6.701861534785365e-07, + "loss": 0.2638, + "step": 28902 + }, + { + "epoch": 0.84, + "grad_norm": 1.2972807132646125, + "learning_rate": 6.699512678092784e-07, + "loss": 0.2612, + "step": 28903 + }, + { + "epoch": 0.84, + "grad_norm": 2.8020690390450036, + "learning_rate": 6.697164203527484e-07, + "loss": 0.2917, + "step": 28904 + }, + { + "epoch": 0.84, + "grad_norm": 1.4838650601540861, + "learning_rate": 6.694816111110191e-07, + "loss": 0.2774, + "step": 28905 + }, + { + "epoch": 0.84, + "grad_norm": 1.4232325260288996, + "learning_rate": 6.692468400861629e-07, + "loss": 0.2369, + "step": 28906 + }, + { + "epoch": 0.84, + "grad_norm": 1.2900974830568357, + "learning_rate": 6.690121072802514e-07, + "loss": 0.2638, + "step": 28907 + }, + { + "epoch": 0.84, + "grad_norm": 1.3491072445021797, + "learning_rate": 6.687774126953561e-07, + "loss": 0.272, + "step": 28908 + }, + { + "epoch": 0.84, + "grad_norm": 1.316935353169466, + "learning_rate": 6.685427563335489e-07, + "loss": 0.2431, + "step": 28909 + }, + { + "epoch": 0.84, + "grad_norm": 1.4045182653607449, + "learning_rate": 6.683081381968992e-07, + "loss": 0.2564, + "step": 28910 + }, + { + "epoch": 0.84, + "grad_norm": 1.3547453513735057, + "learning_rate": 6.680735582874781e-07, + "loss": 0.2601, + "step": 28911 + }, + { + "epoch": 0.84, + "grad_norm": 1.6739947996284774, + "learning_rate": 6.678390166073556e-07, + "loss": 0.2752, + "step": 28912 + }, + { + "epoch": 0.84, + "grad_norm": 1.4334161592505141, + "learning_rate": 6.676045131586023e-07, + "loss": 0.2705, + "step": 28913 + }, + { + "epoch": 0.84, + "grad_norm": 1.3417694762147403, + "learning_rate": 6.673700479432871e-07, + "loss": 0.2598, + "step": 28914 + }, + { + "epoch": 0.84, + "grad_norm": 1.3271528231550855, + "learning_rate": 6.671356209634794e-07, + "loss": 0.2628, + "step": 28915 + }, + { + "epoch": 0.84, + "grad_norm": 1.466410719189425, + "learning_rate": 6.669012322212476e-07, + "loss": 0.2821, + "step": 28916 + }, + { + "epoch": 0.84, + "grad_norm": 1.3266019595674001, + "learning_rate": 6.666668817186617e-07, + "loss": 0.2759, + "step": 28917 + }, + { + "epoch": 0.84, + "grad_norm": 1.4924572153622206, + "learning_rate": 6.664325694577872e-07, + "loss": 0.2652, + "step": 28918 + }, + { + "epoch": 0.84, + "grad_norm": 1.4639250041332816, + "learning_rate": 6.661982954406942e-07, + "loss": 0.3089, + "step": 28919 + }, + { + "epoch": 0.84, + "grad_norm": 1.182553792071578, + "learning_rate": 6.659640596694489e-07, + "loss": 0.2689, + "step": 28920 + }, + { + "epoch": 0.84, + "grad_norm": 1.291686513803, + "learning_rate": 6.657298621461183e-07, + "loss": 0.2663, + "step": 28921 + }, + { + "epoch": 0.84, + "grad_norm": 1.564519842189325, + "learning_rate": 6.654957028727705e-07, + "loss": 0.2856, + "step": 28922 + }, + { + "epoch": 0.84, + "grad_norm": 1.376441019490167, + "learning_rate": 6.652615818514707e-07, + "loss": 0.2547, + "step": 28923 + }, + { + "epoch": 0.84, + "grad_norm": 1.7251745048132574, + "learning_rate": 6.650274990842859e-07, + "loss": 0.2587, + "step": 28924 + }, + { + "epoch": 0.84, + "grad_norm": 1.5049410397501433, + "learning_rate": 6.647934545732815e-07, + "loss": 0.2658, + "step": 28925 + }, + { + "epoch": 0.84, + "grad_norm": 1.3002072447041904, + "learning_rate": 6.645594483205242e-07, + "loss": 0.2665, + "step": 28926 + }, + { + "epoch": 0.84, + "grad_norm": 1.3231745108276733, + "learning_rate": 6.643254803280763e-07, + "loss": 0.28, + "step": 28927 + }, + { + "epoch": 0.84, + "grad_norm": 1.518956935783633, + "learning_rate": 6.640915505980055e-07, + "loss": 0.297, + "step": 28928 + }, + { + "epoch": 0.84, + "grad_norm": 1.3666877770390102, + "learning_rate": 6.638576591323736e-07, + "loss": 0.2812, + "step": 28929 + }, + { + "epoch": 0.84, + "grad_norm": 1.648053990255737, + "learning_rate": 6.636238059332461e-07, + "loss": 0.2675, + "step": 28930 + }, + { + "epoch": 0.84, + "grad_norm": 1.3031924638099293, + "learning_rate": 6.633899910026865e-07, + "loss": 0.2608, + "step": 28931 + }, + { + "epoch": 0.84, + "grad_norm": 1.3546271631254647, + "learning_rate": 6.631562143427584e-07, + "loss": 0.2693, + "step": 28932 + }, + { + "epoch": 0.84, + "grad_norm": 1.2375226960134078, + "learning_rate": 6.629224759555253e-07, + "loss": 0.2632, + "step": 28933 + }, + { + "epoch": 0.84, + "grad_norm": 1.5837576732166618, + "learning_rate": 6.626887758430489e-07, + "loss": 0.2805, + "step": 28934 + }, + { + "epoch": 0.84, + "grad_norm": 1.30985947665606, + "learning_rate": 6.624551140073921e-07, + "loss": 0.2649, + "step": 28935 + }, + { + "epoch": 0.84, + "grad_norm": 1.5718691845797104, + "learning_rate": 6.622214904506186e-07, + "loss": 0.2888, + "step": 28936 + }, + { + "epoch": 0.84, + "grad_norm": 1.7198747879759997, + "learning_rate": 6.619879051747869e-07, + "loss": 0.27, + "step": 28937 + }, + { + "epoch": 0.84, + "grad_norm": 1.4083124538120795, + "learning_rate": 6.617543581819602e-07, + "loss": 0.2716, + "step": 28938 + }, + { + "epoch": 0.84, + "grad_norm": 1.6022384376752092, + "learning_rate": 6.615208494741998e-07, + "loss": 0.333, + "step": 28939 + }, + { + "epoch": 0.84, + "grad_norm": 1.2529341622810155, + "learning_rate": 6.612873790535657e-07, + "loss": 0.2787, + "step": 28940 + }, + { + "epoch": 0.84, + "grad_norm": 3.0334349356895025, + "learning_rate": 6.610539469221188e-07, + "loss": 0.2997, + "step": 28941 + }, + { + "epoch": 0.84, + "grad_norm": 0.9533621322667009, + "learning_rate": 6.60820553081919e-07, + "loss": 0.549, + "step": 28942 + }, + { + "epoch": 0.84, + "grad_norm": 1.5903457854627696, + "learning_rate": 6.605871975350253e-07, + "loss": 0.3044, + "step": 28943 + }, + { + "epoch": 0.84, + "grad_norm": 1.486696546826752, + "learning_rate": 6.603538802834986e-07, + "loss": 0.2804, + "step": 28944 + }, + { + "epoch": 0.84, + "grad_norm": 1.4114945223583966, + "learning_rate": 6.601206013293976e-07, + "loss": 0.2937, + "step": 28945 + }, + { + "epoch": 0.84, + "grad_norm": 1.355873431781949, + "learning_rate": 6.598873606747796e-07, + "loss": 0.265, + "step": 28946 + }, + { + "epoch": 0.84, + "grad_norm": 1.2326049859465562, + "learning_rate": 6.596541583217037e-07, + "loss": 0.2451, + "step": 28947 + }, + { + "epoch": 0.84, + "grad_norm": 1.3382838172790437, + "learning_rate": 6.594209942722279e-07, + "loss": 0.2787, + "step": 28948 + }, + { + "epoch": 0.84, + "grad_norm": 1.3516428049350933, + "learning_rate": 6.591878685284103e-07, + "loss": 0.2635, + "step": 28949 + }, + { + "epoch": 0.84, + "grad_norm": 1.3716197494895852, + "learning_rate": 6.589547810923075e-07, + "loss": 0.2746, + "step": 28950 + }, + { + "epoch": 0.84, + "grad_norm": 1.4696449947919095, + "learning_rate": 6.587217319659772e-07, + "loss": 0.2713, + "step": 28951 + }, + { + "epoch": 0.84, + "grad_norm": 1.4657836114128187, + "learning_rate": 6.584887211514756e-07, + "loss": 0.263, + "step": 28952 + }, + { + "epoch": 0.84, + "grad_norm": 2.2342472320621374, + "learning_rate": 6.582557486508606e-07, + "loss": 0.2582, + "step": 28953 + }, + { + "epoch": 0.84, + "grad_norm": 1.3704372407455694, + "learning_rate": 6.580228144661854e-07, + "loss": 0.2964, + "step": 28954 + }, + { + "epoch": 0.84, + "grad_norm": 1.7927803845835788, + "learning_rate": 6.577899185995079e-07, + "loss": 0.2811, + "step": 28955 + }, + { + "epoch": 0.84, + "grad_norm": 1.3763879132422796, + "learning_rate": 6.57557061052882e-07, + "loss": 0.261, + "step": 28956 + }, + { + "epoch": 0.84, + "grad_norm": 1.3149406802304526, + "learning_rate": 6.573242418283632e-07, + "loss": 0.2669, + "step": 28957 + }, + { + "epoch": 0.84, + "grad_norm": 0.9379605151738336, + "learning_rate": 6.570914609280055e-07, + "loss": 0.5483, + "step": 28958 + }, + { + "epoch": 0.84, + "grad_norm": 1.610951584405303, + "learning_rate": 6.568587183538644e-07, + "loss": 0.2678, + "step": 28959 + }, + { + "epoch": 0.84, + "grad_norm": 1.3101130476539833, + "learning_rate": 6.566260141079933e-07, + "loss": 0.2874, + "step": 28960 + }, + { + "epoch": 0.84, + "grad_norm": 1.286348182254625, + "learning_rate": 6.563933481924456e-07, + "loss": 0.2724, + "step": 28961 + }, + { + "epoch": 0.84, + "grad_norm": 1.2816932818750062, + "learning_rate": 6.561607206092746e-07, + "loss": 0.2701, + "step": 28962 + }, + { + "epoch": 0.84, + "grad_norm": 1.2458256835945618, + "learning_rate": 6.559281313605337e-07, + "loss": 0.2655, + "step": 28963 + }, + { + "epoch": 0.84, + "grad_norm": 2.29643171497636, + "learning_rate": 6.556955804482762e-07, + "loss": 0.2856, + "step": 28964 + }, + { + "epoch": 0.84, + "grad_norm": 1.4850908869647501, + "learning_rate": 6.554630678745522e-07, + "loss": 0.2715, + "step": 28965 + }, + { + "epoch": 0.84, + "grad_norm": 1.23759106310019, + "learning_rate": 6.55230593641415e-07, + "loss": 0.282, + "step": 28966 + }, + { + "epoch": 0.84, + "grad_norm": 1.2925471006996085, + "learning_rate": 6.549981577509163e-07, + "loss": 0.2512, + "step": 28967 + }, + { + "epoch": 0.84, + "grad_norm": 1.4231449393989526, + "learning_rate": 6.547657602051067e-07, + "loss": 0.2643, + "step": 28968 + }, + { + "epoch": 0.84, + "grad_norm": 1.3715365405281634, + "learning_rate": 6.545334010060372e-07, + "loss": 0.2654, + "step": 28969 + }, + { + "epoch": 0.84, + "grad_norm": 1.5026336884870017, + "learning_rate": 6.543010801557592e-07, + "loss": 0.2646, + "step": 28970 + }, + { + "epoch": 0.84, + "grad_norm": 2.0498960194662232, + "learning_rate": 6.54068797656322e-07, + "loss": 0.2855, + "step": 28971 + }, + { + "epoch": 0.84, + "grad_norm": 1.4761794106120973, + "learning_rate": 6.538365535097774e-07, + "loss": 0.2701, + "step": 28972 + }, + { + "epoch": 0.84, + "grad_norm": 1.4488014347807403, + "learning_rate": 6.536043477181719e-07, + "loss": 0.2812, + "step": 28973 + }, + { + "epoch": 0.84, + "grad_norm": 1.274306548973276, + "learning_rate": 6.533721802835563e-07, + "loss": 0.2634, + "step": 28974 + }, + { + "epoch": 0.84, + "grad_norm": 1.3900693197119733, + "learning_rate": 6.531400512079794e-07, + "loss": 0.262, + "step": 28975 + }, + { + "epoch": 0.84, + "grad_norm": 1.3352461251818124, + "learning_rate": 6.529079604934902e-07, + "loss": 0.2731, + "step": 28976 + }, + { + "epoch": 0.84, + "grad_norm": 1.685933083762978, + "learning_rate": 6.526759081421363e-07, + "loss": 0.272, + "step": 28977 + }, + { + "epoch": 0.84, + "grad_norm": 1.3538532184846381, + "learning_rate": 6.524438941559652e-07, + "loss": 0.2709, + "step": 28978 + }, + { + "epoch": 0.84, + "grad_norm": 1.2342230428988479, + "learning_rate": 6.522119185370257e-07, + "loss": 0.2878, + "step": 28979 + }, + { + "epoch": 0.84, + "grad_norm": 1.362202078310045, + "learning_rate": 6.51979981287364e-07, + "loss": 0.2603, + "step": 28980 + }, + { + "epoch": 0.84, + "grad_norm": 1.4440125142982776, + "learning_rate": 6.517480824090277e-07, + "loss": 0.2823, + "step": 28981 + }, + { + "epoch": 0.84, + "grad_norm": 1.3705393270159023, + "learning_rate": 6.515162219040627e-07, + "loss": 0.2738, + "step": 28982 + }, + { + "epoch": 0.84, + "grad_norm": 2.3750844253627337, + "learning_rate": 6.512843997745156e-07, + "loss": 0.259, + "step": 28983 + }, + { + "epoch": 0.84, + "grad_norm": 1.4113832191760065, + "learning_rate": 6.510526160224312e-07, + "loss": 0.2742, + "step": 28984 + }, + { + "epoch": 0.84, + "grad_norm": 1.4267109762214658, + "learning_rate": 6.508208706498558e-07, + "loss": 0.2599, + "step": 28985 + }, + { + "epoch": 0.84, + "grad_norm": 1.3035255134429244, + "learning_rate": 6.50589163658834e-07, + "loss": 0.2816, + "step": 28986 + }, + { + "epoch": 0.84, + "grad_norm": 1.2186985133195123, + "learning_rate": 6.503574950514113e-07, + "loss": 0.2551, + "step": 28987 + }, + { + "epoch": 0.84, + "grad_norm": 1.3581061100059815, + "learning_rate": 6.501258648296322e-07, + "loss": 0.2595, + "step": 28988 + }, + { + "epoch": 0.84, + "grad_norm": 1.5160721381006839, + "learning_rate": 6.498942729955405e-07, + "loss": 0.2791, + "step": 28989 + }, + { + "epoch": 0.84, + "grad_norm": 1.3752015540017402, + "learning_rate": 6.496627195511801e-07, + "loss": 0.2776, + "step": 28990 + }, + { + "epoch": 0.84, + "grad_norm": 1.4928848612197534, + "learning_rate": 6.494312044985957e-07, + "loss": 0.2695, + "step": 28991 + }, + { + "epoch": 0.84, + "grad_norm": 1.4427214565686806, + "learning_rate": 6.491997278398277e-07, + "loss": 0.2459, + "step": 28992 + }, + { + "epoch": 0.84, + "grad_norm": 1.3149204477374663, + "learning_rate": 6.489682895769206e-07, + "loss": 0.291, + "step": 28993 + }, + { + "epoch": 0.84, + "grad_norm": 1.324113806778286, + "learning_rate": 6.487368897119162e-07, + "loss": 0.2795, + "step": 28994 + }, + { + "epoch": 0.84, + "grad_norm": 1.5744376155917277, + "learning_rate": 6.485055282468578e-07, + "loss": 0.2644, + "step": 28995 + }, + { + "epoch": 0.84, + "grad_norm": 1.382722466279143, + "learning_rate": 6.48274205183786e-07, + "loss": 0.2993, + "step": 28996 + }, + { + "epoch": 0.84, + "grad_norm": 1.2638487588742993, + "learning_rate": 6.480429205247423e-07, + "loss": 0.3008, + "step": 28997 + }, + { + "epoch": 0.84, + "grad_norm": 1.2855753985314375, + "learning_rate": 6.478116742717683e-07, + "loss": 0.2516, + "step": 28998 + }, + { + "epoch": 0.84, + "grad_norm": 1.4776235043679526, + "learning_rate": 6.475804664269047e-07, + "loss": 0.2681, + "step": 28999 + }, + { + "epoch": 0.84, + "grad_norm": 1.2958853491905962, + "learning_rate": 6.473492969921929e-07, + "loss": 0.2558, + "step": 29000 + }, + { + "epoch": 0.84, + "grad_norm": 1.3273273945449426, + "learning_rate": 6.471181659696707e-07, + "loss": 0.2656, + "step": 29001 + }, + { + "epoch": 0.84, + "grad_norm": 1.3157709740500063, + "learning_rate": 6.468870733613786e-07, + "loss": 0.2771, + "step": 29002 + }, + { + "epoch": 0.84, + "grad_norm": 1.6046315259508135, + "learning_rate": 6.466560191693566e-07, + "loss": 0.2591, + "step": 29003 + }, + { + "epoch": 0.84, + "grad_norm": 1.3402852548664586, + "learning_rate": 6.464250033956437e-07, + "loss": 0.2618, + "step": 29004 + }, + { + "epoch": 0.84, + "grad_norm": 2.5090764469012066, + "learning_rate": 6.461940260422784e-07, + "loss": 0.2692, + "step": 29005 + }, + { + "epoch": 0.84, + "grad_norm": 1.4438551364262346, + "learning_rate": 6.459630871112993e-07, + "loss": 0.2642, + "step": 29006 + }, + { + "epoch": 0.84, + "grad_norm": 1.468375908379046, + "learning_rate": 6.457321866047439e-07, + "loss": 0.2739, + "step": 29007 + }, + { + "epoch": 0.84, + "grad_norm": 1.5868063597947595, + "learning_rate": 6.455013245246516e-07, + "loss": 0.2722, + "step": 29008 + }, + { + "epoch": 0.84, + "grad_norm": 1.6475130208369517, + "learning_rate": 6.45270500873057e-07, + "loss": 0.2887, + "step": 29009 + }, + { + "epoch": 0.84, + "grad_norm": 1.38631407278412, + "learning_rate": 6.450397156519989e-07, + "loss": 0.2625, + "step": 29010 + }, + { + "epoch": 0.84, + "grad_norm": 1.3854795117485514, + "learning_rate": 6.448089688635134e-07, + "loss": 0.2851, + "step": 29011 + }, + { + "epoch": 0.84, + "grad_norm": 2.146564935640815, + "learning_rate": 6.445782605096379e-07, + "loss": 0.2638, + "step": 29012 + }, + { + "epoch": 0.84, + "grad_norm": 1.5047588154881344, + "learning_rate": 6.443475905924069e-07, + "loss": 0.269, + "step": 29013 + }, + { + "epoch": 0.84, + "grad_norm": 1.335207288135684, + "learning_rate": 6.441169591138569e-07, + "loss": 0.2697, + "step": 29014 + }, + { + "epoch": 0.84, + "grad_norm": 1.3566204874636631, + "learning_rate": 6.438863660760225e-07, + "loss": 0.2855, + "step": 29015 + }, + { + "epoch": 0.84, + "grad_norm": 1.3263363063669291, + "learning_rate": 6.436558114809394e-07, + "loss": 0.2798, + "step": 29016 + }, + { + "epoch": 0.84, + "grad_norm": 1.3630904086707913, + "learning_rate": 6.434252953306425e-07, + "loss": 0.2636, + "step": 29017 + }, + { + "epoch": 0.84, + "grad_norm": 1.034283930372813, + "learning_rate": 6.431948176271652e-07, + "loss": 0.5657, + "step": 29018 + }, + { + "epoch": 0.84, + "grad_norm": 1.6517327856911677, + "learning_rate": 6.429643783725437e-07, + "loss": 0.2666, + "step": 29019 + }, + { + "epoch": 0.84, + "grad_norm": 1.396102725004143, + "learning_rate": 6.427339775688079e-07, + "loss": 0.2837, + "step": 29020 + }, + { + "epoch": 0.84, + "grad_norm": 1.2810734314355574, + "learning_rate": 6.425036152179936e-07, + "loss": 0.2808, + "step": 29021 + }, + { + "epoch": 0.84, + "grad_norm": 1.3015175136285049, + "learning_rate": 6.422732913221336e-07, + "loss": 0.2563, + "step": 29022 + }, + { + "epoch": 0.84, + "grad_norm": 0.9376055904666586, + "learning_rate": 6.420430058832594e-07, + "loss": 0.5286, + "step": 29023 + }, + { + "epoch": 0.84, + "grad_norm": 1.3328776541288068, + "learning_rate": 6.418127589034045e-07, + "loss": 0.2614, + "step": 29024 + }, + { + "epoch": 0.84, + "grad_norm": 1.3311449606811585, + "learning_rate": 6.415825503846001e-07, + "loss": 0.2665, + "step": 29025 + }, + { + "epoch": 0.84, + "grad_norm": 1.2799906661797753, + "learning_rate": 6.41352380328878e-07, + "loss": 0.2754, + "step": 29026 + }, + { + "epoch": 0.84, + "grad_norm": 1.3959940813774496, + "learning_rate": 6.411222487382706e-07, + "loss": 0.2535, + "step": 29027 + }, + { + "epoch": 0.84, + "grad_norm": 1.60351054526637, + "learning_rate": 6.408921556148068e-07, + "loss": 0.2797, + "step": 29028 + }, + { + "epoch": 0.84, + "grad_norm": 1.5998556574458365, + "learning_rate": 6.406621009605185e-07, + "loss": 0.2898, + "step": 29029 + }, + { + "epoch": 0.84, + "grad_norm": 1.5562823189080388, + "learning_rate": 6.404320847774348e-07, + "loss": 0.2822, + "step": 29030 + }, + { + "epoch": 0.84, + "grad_norm": 1.2489078052223568, + "learning_rate": 6.402021070675868e-07, + "loss": 0.272, + "step": 29031 + }, + { + "epoch": 0.84, + "grad_norm": 1.3481754588977397, + "learning_rate": 6.399721678330034e-07, + "loss": 0.2622, + "step": 29032 + }, + { + "epoch": 0.84, + "grad_norm": 1.348116062946193, + "learning_rate": 6.397422670757136e-07, + "loss": 0.2636, + "step": 29033 + }, + { + "epoch": 0.84, + "grad_norm": 2.15036024310025, + "learning_rate": 6.395124047977469e-07, + "loss": 0.2705, + "step": 29034 + }, + { + "epoch": 0.84, + "grad_norm": 1.2388305247646045, + "learning_rate": 6.392825810011316e-07, + "loss": 0.2627, + "step": 29035 + }, + { + "epoch": 0.84, + "grad_norm": 1.4253370105675978, + "learning_rate": 6.390527956878973e-07, + "loss": 0.2923, + "step": 29036 + }, + { + "epoch": 0.84, + "grad_norm": 1.9398380618692002, + "learning_rate": 6.388230488600694e-07, + "loss": 0.2595, + "step": 29037 + }, + { + "epoch": 0.84, + "grad_norm": 1.393842540141306, + "learning_rate": 6.385933405196759e-07, + "loss": 0.2641, + "step": 29038 + }, + { + "epoch": 0.84, + "grad_norm": 1.2878998896005338, + "learning_rate": 6.383636706687451e-07, + "loss": 0.2617, + "step": 29039 + }, + { + "epoch": 0.84, + "grad_norm": 1.5628747583578444, + "learning_rate": 6.381340393093038e-07, + "loss": 0.2917, + "step": 29040 + }, + { + "epoch": 0.84, + "grad_norm": 1.2665148803575927, + "learning_rate": 6.379044464433776e-07, + "loss": 0.2721, + "step": 29041 + }, + { + "epoch": 0.84, + "grad_norm": 1.7928239857677832, + "learning_rate": 6.376748920729925e-07, + "loss": 0.2699, + "step": 29042 + }, + { + "epoch": 0.84, + "grad_norm": 1.242730021424516, + "learning_rate": 6.374453762001754e-07, + "loss": 0.2769, + "step": 29043 + }, + { + "epoch": 0.84, + "grad_norm": 0.9530032687111957, + "learning_rate": 6.372158988269505e-07, + "loss": 0.6225, + "step": 29044 + }, + { + "epoch": 0.84, + "grad_norm": 1.663157760083573, + "learning_rate": 6.369864599553443e-07, + "loss": 0.2763, + "step": 29045 + }, + { + "epoch": 0.84, + "grad_norm": 1.3124532586394395, + "learning_rate": 6.367570595873817e-07, + "loss": 0.2397, + "step": 29046 + }, + { + "epoch": 0.84, + "grad_norm": 1.4616844135785187, + "learning_rate": 6.365276977250856e-07, + "loss": 0.2762, + "step": 29047 + }, + { + "epoch": 0.84, + "grad_norm": 1.3323979747249086, + "learning_rate": 6.36298374370481e-07, + "loss": 0.3037, + "step": 29048 + }, + { + "epoch": 0.84, + "grad_norm": 1.4009102079994087, + "learning_rate": 6.360690895255916e-07, + "loss": 0.2913, + "step": 29049 + }, + { + "epoch": 0.84, + "grad_norm": 1.6152964079861196, + "learning_rate": 6.358398431924411e-07, + "loss": 0.2685, + "step": 29050 + }, + { + "epoch": 0.84, + "grad_norm": 1.3333661054190118, + "learning_rate": 6.356106353730523e-07, + "loss": 0.2809, + "step": 29051 + }, + { + "epoch": 0.84, + "grad_norm": 1.454115771035234, + "learning_rate": 6.353814660694479e-07, + "loss": 0.2811, + "step": 29052 + }, + { + "epoch": 0.84, + "grad_norm": 1.2820906606803257, + "learning_rate": 6.351523352836502e-07, + "loss": 0.2613, + "step": 29053 + }, + { + "epoch": 0.84, + "grad_norm": 1.3553881304077857, + "learning_rate": 6.349232430176821e-07, + "loss": 0.274, + "step": 29054 + }, + { + "epoch": 0.84, + "grad_norm": 3.2290853356915448, + "learning_rate": 6.346941892735658e-07, + "loss": 0.2677, + "step": 29055 + }, + { + "epoch": 0.84, + "grad_norm": 1.34821858831726, + "learning_rate": 6.344651740533203e-07, + "loss": 0.2637, + "step": 29056 + }, + { + "epoch": 0.84, + "grad_norm": 2.8447322067617598, + "learning_rate": 6.342361973589683e-07, + "loss": 0.2864, + "step": 29057 + }, + { + "epoch": 0.84, + "grad_norm": 2.2530694375327656, + "learning_rate": 6.3400725919253e-07, + "loss": 0.2699, + "step": 29058 + }, + { + "epoch": 0.84, + "grad_norm": 1.306689585144191, + "learning_rate": 6.337783595560266e-07, + "loss": 0.2935, + "step": 29059 + }, + { + "epoch": 0.84, + "grad_norm": 1.6256362776433553, + "learning_rate": 6.335494984514773e-07, + "loss": 0.2971, + "step": 29060 + }, + { + "epoch": 0.84, + "grad_norm": 1.348718981267119, + "learning_rate": 6.333206758809024e-07, + "loss": 0.2733, + "step": 29061 + }, + { + "epoch": 0.84, + "grad_norm": 1.3918253097036797, + "learning_rate": 6.330918918463202e-07, + "loss": 0.2527, + "step": 29062 + }, + { + "epoch": 0.84, + "grad_norm": 0.9743749007283445, + "learning_rate": 6.328631463497526e-07, + "loss": 0.5849, + "step": 29063 + }, + { + "epoch": 0.84, + "grad_norm": 2.342724464011402, + "learning_rate": 6.326344393932144e-07, + "loss": 0.3209, + "step": 29064 + }, + { + "epoch": 0.84, + "grad_norm": 1.3752839153175775, + "learning_rate": 6.324057709787257e-07, + "loss": 0.2828, + "step": 29065 + }, + { + "epoch": 0.84, + "grad_norm": 1.3032104180746098, + "learning_rate": 6.32177141108305e-07, + "loss": 0.2716, + "step": 29066 + }, + { + "epoch": 0.84, + "grad_norm": 1.3409878804959319, + "learning_rate": 6.319485497839695e-07, + "loss": 0.2767, + "step": 29067 + }, + { + "epoch": 0.84, + "grad_norm": 1.372985368311293, + "learning_rate": 6.31719997007737e-07, + "loss": 0.2741, + "step": 29068 + }, + { + "epoch": 0.84, + "grad_norm": 1.424002383232055, + "learning_rate": 6.314914827816232e-07, + "loss": 0.2497, + "step": 29069 + }, + { + "epoch": 0.84, + "grad_norm": 1.2700637486262105, + "learning_rate": 6.312630071076459e-07, + "loss": 0.3071, + "step": 29070 + }, + { + "epoch": 0.84, + "grad_norm": 1.4386809963558111, + "learning_rate": 6.310345699878206e-07, + "loss": 0.2784, + "step": 29071 + }, + { + "epoch": 0.84, + "grad_norm": 1.3473993911738165, + "learning_rate": 6.308061714241637e-07, + "loss": 0.2688, + "step": 29072 + }, + { + "epoch": 0.84, + "grad_norm": 1.372716780819572, + "learning_rate": 6.305778114186917e-07, + "loss": 0.2642, + "step": 29073 + }, + { + "epoch": 0.84, + "grad_norm": 1.6017655274064506, + "learning_rate": 6.303494899734181e-07, + "loss": 0.2784, + "step": 29074 + }, + { + "epoch": 0.84, + "grad_norm": 1.2974499262837096, + "learning_rate": 6.30121207090359e-07, + "loss": 0.2725, + "step": 29075 + }, + { + "epoch": 0.84, + "grad_norm": 1.9078456703724564, + "learning_rate": 6.298929627715283e-07, + "loss": 0.2716, + "step": 29076 + }, + { + "epoch": 0.84, + "grad_norm": 1.967193274724382, + "learning_rate": 6.296647570189413e-07, + "loss": 0.2784, + "step": 29077 + }, + { + "epoch": 0.84, + "grad_norm": 1.3791210681871804, + "learning_rate": 6.294365898346105e-07, + "loss": 0.2909, + "step": 29078 + }, + { + "epoch": 0.84, + "grad_norm": 1.7696259641806098, + "learning_rate": 6.292084612205507e-07, + "loss": 0.2659, + "step": 29079 + }, + { + "epoch": 0.84, + "grad_norm": 1.6648879880904937, + "learning_rate": 6.289803711787751e-07, + "loss": 0.2801, + "step": 29080 + }, + { + "epoch": 0.84, + "grad_norm": 1.2420787831532016, + "learning_rate": 6.287523197112955e-07, + "loss": 0.2671, + "step": 29081 + }, + { + "epoch": 0.84, + "grad_norm": 1.562154582237175, + "learning_rate": 6.285243068201269e-07, + "loss": 0.2521, + "step": 29082 + }, + { + "epoch": 0.84, + "grad_norm": 1.485558418203654, + "learning_rate": 6.282963325072783e-07, + "loss": 0.2654, + "step": 29083 + }, + { + "epoch": 0.84, + "grad_norm": 1.9922611596099327, + "learning_rate": 6.28068396774763e-07, + "loss": 0.2671, + "step": 29084 + }, + { + "epoch": 0.84, + "grad_norm": 1.3011100135677578, + "learning_rate": 6.278404996245924e-07, + "loss": 0.267, + "step": 29085 + }, + { + "epoch": 0.84, + "grad_norm": 1.5046250123379945, + "learning_rate": 6.276126410587785e-07, + "loss": 0.2605, + "step": 29086 + }, + { + "epoch": 0.84, + "grad_norm": 1.336244623186566, + "learning_rate": 6.27384821079331e-07, + "loss": 0.2751, + "step": 29087 + }, + { + "epoch": 0.84, + "grad_norm": 1.4530701966371646, + "learning_rate": 6.271570396882615e-07, + "loss": 0.2851, + "step": 29088 + }, + { + "epoch": 0.84, + "grad_norm": 1.3620855798909794, + "learning_rate": 6.269292968875795e-07, + "loss": 0.2745, + "step": 29089 + }, + { + "epoch": 0.84, + "grad_norm": 1.317641365120285, + "learning_rate": 6.267015926792946e-07, + "loss": 0.265, + "step": 29090 + }, + { + "epoch": 0.84, + "grad_norm": 1.4239605360140875, + "learning_rate": 6.26473927065418e-07, + "loss": 0.2544, + "step": 29091 + }, + { + "epoch": 0.84, + "grad_norm": 1.4733848076089378, + "learning_rate": 6.262463000479563e-07, + "loss": 0.2867, + "step": 29092 + }, + { + "epoch": 0.84, + "grad_norm": 1.9959964589009167, + "learning_rate": 6.260187116289196e-07, + "loss": 0.2691, + "step": 29093 + }, + { + "epoch": 0.84, + "grad_norm": 1.2249397732111873, + "learning_rate": 6.257911618103163e-07, + "loss": 0.244, + "step": 29094 + }, + { + "epoch": 0.84, + "grad_norm": 1.2943593377148426, + "learning_rate": 6.255636505941548e-07, + "loss": 0.2605, + "step": 29095 + }, + { + "epoch": 0.84, + "grad_norm": 1.2940925396759164, + "learning_rate": 6.253361779824435e-07, + "loss": 0.2662, + "step": 29096 + }, + { + "epoch": 0.84, + "grad_norm": 1.396300616314452, + "learning_rate": 6.251087439771875e-07, + "loss": 0.2937, + "step": 29097 + }, + { + "epoch": 0.84, + "grad_norm": 1.3824465630691072, + "learning_rate": 6.248813485803956e-07, + "loss": 0.257, + "step": 29098 + }, + { + "epoch": 0.84, + "grad_norm": 1.3635560970612879, + "learning_rate": 6.246539917940748e-07, + "loss": 0.284, + "step": 29099 + }, + { + "epoch": 0.84, + "grad_norm": 1.4266183759965025, + "learning_rate": 6.244266736202303e-07, + "loss": 0.2567, + "step": 29100 + }, + { + "epoch": 0.84, + "grad_norm": 1.4331591697625, + "learning_rate": 6.241993940608703e-07, + "loss": 0.2752, + "step": 29101 + }, + { + "epoch": 0.84, + "grad_norm": 1.908717053905215, + "learning_rate": 6.239721531179982e-07, + "loss": 0.2589, + "step": 29102 + }, + { + "epoch": 0.84, + "grad_norm": 1.3413633386860633, + "learning_rate": 6.237449507936205e-07, + "loss": 0.2913, + "step": 29103 + }, + { + "epoch": 0.84, + "grad_norm": 1.4593635496610413, + "learning_rate": 6.235177870897424e-07, + "loss": 0.254, + "step": 29104 + }, + { + "epoch": 0.84, + "grad_norm": 1.6658407188348388, + "learning_rate": 6.232906620083679e-07, + "loss": 0.2587, + "step": 29105 + }, + { + "epoch": 0.84, + "grad_norm": 1.315246471419832, + "learning_rate": 6.230635755515019e-07, + "loss": 0.3009, + "step": 29106 + }, + { + "epoch": 0.84, + "grad_norm": 1.291898356873911, + "learning_rate": 6.228365277211484e-07, + "loss": 0.2621, + "step": 29107 + }, + { + "epoch": 0.84, + "grad_norm": 1.3817949102823022, + "learning_rate": 6.226095185193115e-07, + "loss": 0.2733, + "step": 29108 + }, + { + "epoch": 0.84, + "grad_norm": 1.3872685573539099, + "learning_rate": 6.223825479479945e-07, + "loss": 0.2775, + "step": 29109 + }, + { + "epoch": 0.84, + "grad_norm": 1.310757740915696, + "learning_rate": 6.221556160091996e-07, + "loss": 0.2645, + "step": 29110 + }, + { + "epoch": 0.84, + "grad_norm": 1.3529256846499833, + "learning_rate": 6.219287227049298e-07, + "loss": 0.266, + "step": 29111 + }, + { + "epoch": 0.84, + "grad_norm": 1.8393608443765193, + "learning_rate": 6.217018680371878e-07, + "loss": 0.265, + "step": 29112 + }, + { + "epoch": 0.84, + "grad_norm": 1.322320338107661, + "learning_rate": 6.214750520079748e-07, + "loss": 0.2895, + "step": 29113 + }, + { + "epoch": 0.84, + "grad_norm": 1.6292304517069585, + "learning_rate": 6.212482746192938e-07, + "loss": 0.2754, + "step": 29114 + }, + { + "epoch": 0.84, + "grad_norm": 1.3359890161745673, + "learning_rate": 6.210215358731447e-07, + "loss": 0.2607, + "step": 29115 + }, + { + "epoch": 0.84, + "grad_norm": 1.8059152507920955, + "learning_rate": 6.207948357715293e-07, + "loss": 0.311, + "step": 29116 + }, + { + "epoch": 0.84, + "grad_norm": 1.4592263969821841, + "learning_rate": 6.205681743164482e-07, + "loss": 0.2855, + "step": 29117 + }, + { + "epoch": 0.84, + "grad_norm": 1.3256134912464912, + "learning_rate": 6.203415515099026e-07, + "loss": 0.2651, + "step": 29118 + }, + { + "epoch": 0.84, + "grad_norm": 3.232311967448909, + "learning_rate": 6.201149673538903e-07, + "loss": 0.2538, + "step": 29119 + }, + { + "epoch": 0.84, + "grad_norm": 1.4519781993253387, + "learning_rate": 6.19888421850412e-07, + "loss": 0.3122, + "step": 29120 + }, + { + "epoch": 0.84, + "grad_norm": 1.3279695613407267, + "learning_rate": 6.196619150014665e-07, + "loss": 0.2801, + "step": 29121 + }, + { + "epoch": 0.84, + "grad_norm": 0.9388399774501851, + "learning_rate": 6.194354468090541e-07, + "loss": 0.5894, + "step": 29122 + }, + { + "epoch": 0.84, + "grad_norm": 2.0834625921650516, + "learning_rate": 6.192090172751719e-07, + "loss": 0.273, + "step": 29123 + }, + { + "epoch": 0.84, + "grad_norm": 1.46261345592939, + "learning_rate": 6.1898262640182e-07, + "loss": 0.2673, + "step": 29124 + }, + { + "epoch": 0.84, + "grad_norm": 1.7870573133436736, + "learning_rate": 6.187562741909936e-07, + "loss": 0.2708, + "step": 29125 + }, + { + "epoch": 0.84, + "grad_norm": 1.7921604042226638, + "learning_rate": 6.185299606446921e-07, + "loss": 0.2547, + "step": 29126 + }, + { + "epoch": 0.84, + "grad_norm": 1.2897163465946477, + "learning_rate": 6.183036857649121e-07, + "loss": 0.2766, + "step": 29127 + }, + { + "epoch": 0.84, + "grad_norm": 1.3704143734878973, + "learning_rate": 6.180774495536518e-07, + "loss": 0.2656, + "step": 29128 + }, + { + "epoch": 0.84, + "grad_norm": 1.4098924204065082, + "learning_rate": 6.178512520129054e-07, + "loss": 0.2748, + "step": 29129 + }, + { + "epoch": 0.84, + "grad_norm": 1.389756514062196, + "learning_rate": 6.176250931446704e-07, + "loss": 0.2856, + "step": 29130 + }, + { + "epoch": 0.84, + "grad_norm": 1.3164612067024017, + "learning_rate": 6.173989729509428e-07, + "loss": 0.2815, + "step": 29131 + }, + { + "epoch": 0.84, + "grad_norm": 1.308923686178053, + "learning_rate": 6.171728914337177e-07, + "loss": 0.2701, + "step": 29132 + }, + { + "epoch": 0.84, + "grad_norm": 1.5558890183290375, + "learning_rate": 6.169468485949903e-07, + "loss": 0.2608, + "step": 29133 + }, + { + "epoch": 0.85, + "grad_norm": 1.767876527724186, + "learning_rate": 6.167208444367556e-07, + "loss": 0.2737, + "step": 29134 + }, + { + "epoch": 0.85, + "grad_norm": 1.3408920827420234, + "learning_rate": 6.164948789610086e-07, + "loss": 0.2711, + "step": 29135 + }, + { + "epoch": 0.85, + "grad_norm": 1.4935580654152418, + "learning_rate": 6.162689521697424e-07, + "loss": 0.3054, + "step": 29136 + }, + { + "epoch": 0.85, + "grad_norm": 1.5749592400599992, + "learning_rate": 6.160430640649523e-07, + "loss": 0.3079, + "step": 29137 + }, + { + "epoch": 0.85, + "grad_norm": 0.9518350902719901, + "learning_rate": 6.158172146486302e-07, + "loss": 0.5996, + "step": 29138 + }, + { + "epoch": 0.85, + "grad_norm": 1.2896539989453173, + "learning_rate": 6.155914039227695e-07, + "loss": 0.2665, + "step": 29139 + }, + { + "epoch": 0.85, + "grad_norm": 1.4815350018531455, + "learning_rate": 6.153656318893636e-07, + "loss": 0.2945, + "step": 29140 + }, + { + "epoch": 0.85, + "grad_norm": 1.336910299375436, + "learning_rate": 6.151398985504043e-07, + "loss": 0.2588, + "step": 29141 + }, + { + "epoch": 0.85, + "grad_norm": 1.407644359351832, + "learning_rate": 6.149142039078842e-07, + "loss": 0.3125, + "step": 29142 + }, + { + "epoch": 0.85, + "grad_norm": 1.5548445780444489, + "learning_rate": 6.146885479637948e-07, + "loss": 0.2668, + "step": 29143 + }, + { + "epoch": 0.85, + "grad_norm": 1.37781062230979, + "learning_rate": 6.144629307201277e-07, + "loss": 0.2525, + "step": 29144 + }, + { + "epoch": 0.85, + "grad_norm": 1.3631668012765064, + "learning_rate": 6.142373521788747e-07, + "loss": 0.2623, + "step": 29145 + }, + { + "epoch": 0.85, + "grad_norm": 1.3210476672706946, + "learning_rate": 6.140118123420246e-07, + "loss": 0.2651, + "step": 29146 + }, + { + "epoch": 0.85, + "grad_norm": 1.5394295468009942, + "learning_rate": 6.137863112115694e-07, + "loss": 0.2665, + "step": 29147 + }, + { + "epoch": 0.85, + "grad_norm": 1.3688054240395306, + "learning_rate": 6.135608487894984e-07, + "loss": 0.2589, + "step": 29148 + }, + { + "epoch": 0.85, + "grad_norm": 1.291941037545616, + "learning_rate": 6.133354250778018e-07, + "loss": 0.2754, + "step": 29149 + }, + { + "epoch": 0.85, + "grad_norm": 1.6625732456296838, + "learning_rate": 6.131100400784684e-07, + "loss": 0.2764, + "step": 29150 + }, + { + "epoch": 0.85, + "grad_norm": 1.3219982847065592, + "learning_rate": 6.128846937934879e-07, + "loss": 0.2707, + "step": 29151 + }, + { + "epoch": 0.85, + "grad_norm": 1.4193823199886744, + "learning_rate": 6.126593862248493e-07, + "loss": 0.2698, + "step": 29152 + }, + { + "epoch": 0.85, + "grad_norm": 1.3714784908781226, + "learning_rate": 6.124341173745396e-07, + "loss": 0.28, + "step": 29153 + }, + { + "epoch": 0.85, + "grad_norm": 1.2907727487301344, + "learning_rate": 6.12208887244547e-07, + "loss": 0.2753, + "step": 29154 + }, + { + "epoch": 0.85, + "grad_norm": 1.3697611813373245, + "learning_rate": 6.119836958368603e-07, + "loss": 0.2647, + "step": 29155 + }, + { + "epoch": 0.85, + "grad_norm": 1.5513940719274169, + "learning_rate": 6.117585431534667e-07, + "loss": 0.2812, + "step": 29156 + }, + { + "epoch": 0.85, + "grad_norm": 1.354538227345987, + "learning_rate": 6.115334291963521e-07, + "loss": 0.3068, + "step": 29157 + }, + { + "epoch": 0.85, + "grad_norm": 1.3405152284268411, + "learning_rate": 6.113083539675029e-07, + "loss": 0.2774, + "step": 29158 + }, + { + "epoch": 0.85, + "grad_norm": 1.671649336039552, + "learning_rate": 6.110833174689069e-07, + "loss": 0.2812, + "step": 29159 + }, + { + "epoch": 0.85, + "grad_norm": 1.3321463597837524, + "learning_rate": 6.108583197025492e-07, + "loss": 0.2638, + "step": 29160 + }, + { + "epoch": 0.85, + "grad_norm": 2.265063831582158, + "learning_rate": 6.106333606704152e-07, + "loss": 0.2586, + "step": 29161 + }, + { + "epoch": 0.85, + "grad_norm": 1.2887811554549506, + "learning_rate": 6.104084403744903e-07, + "loss": 0.2602, + "step": 29162 + }, + { + "epoch": 0.85, + "grad_norm": 3.8143850505677053, + "learning_rate": 6.101835588167598e-07, + "loss": 0.2619, + "step": 29163 + }, + { + "epoch": 0.85, + "grad_norm": 0.9679254708045326, + "learning_rate": 6.099587159992099e-07, + "loss": 0.5796, + "step": 29164 + }, + { + "epoch": 0.85, + "grad_norm": 1.3220920100251798, + "learning_rate": 6.097339119238211e-07, + "loss": 0.2713, + "step": 29165 + }, + { + "epoch": 0.85, + "grad_norm": 1.2575739149219933, + "learning_rate": 6.095091465925796e-07, + "loss": 0.2908, + "step": 29166 + }, + { + "epoch": 0.85, + "grad_norm": 1.3079958759517356, + "learning_rate": 6.092844200074682e-07, + "loss": 0.2628, + "step": 29167 + }, + { + "epoch": 0.85, + "grad_norm": 0.999778220658395, + "learning_rate": 6.090597321704711e-07, + "loss": 0.541, + "step": 29168 + }, + { + "epoch": 0.85, + "grad_norm": 3.9521752828580605, + "learning_rate": 6.088350830835699e-07, + "loss": 0.2598, + "step": 29169 + }, + { + "epoch": 0.85, + "grad_norm": 1.5074304664250826, + "learning_rate": 6.086104727487485e-07, + "loss": 0.2685, + "step": 29170 + }, + { + "epoch": 0.85, + "grad_norm": 1.2296725814567886, + "learning_rate": 6.083859011679877e-07, + "loss": 0.2567, + "step": 29171 + }, + { + "epoch": 0.85, + "grad_norm": 1.5291564165055131, + "learning_rate": 6.081613683432703e-07, + "loss": 0.281, + "step": 29172 + }, + { + "epoch": 0.85, + "grad_norm": 1.316702185578261, + "learning_rate": 6.07936874276579e-07, + "loss": 0.2664, + "step": 29173 + }, + { + "epoch": 0.85, + "grad_norm": 1.5422709836267054, + "learning_rate": 6.077124189698919e-07, + "loss": 0.2718, + "step": 29174 + }, + { + "epoch": 0.85, + "grad_norm": 1.4998942915426172, + "learning_rate": 6.074880024251917e-07, + "loss": 0.2761, + "step": 29175 + }, + { + "epoch": 0.85, + "grad_norm": 1.3361535155094668, + "learning_rate": 6.072636246444585e-07, + "loss": 0.2781, + "step": 29176 + }, + { + "epoch": 0.85, + "grad_norm": 1.3435500764168649, + "learning_rate": 6.070392856296725e-07, + "loss": 0.2893, + "step": 29177 + }, + { + "epoch": 0.85, + "grad_norm": 2.3760975766428887, + "learning_rate": 6.068149853828137e-07, + "loss": 0.2773, + "step": 29178 + }, + { + "epoch": 0.85, + "grad_norm": 1.530889573756886, + "learning_rate": 6.065907239058616e-07, + "loss": 0.2671, + "step": 29179 + }, + { + "epoch": 0.85, + "grad_norm": 1.6512294219869457, + "learning_rate": 6.063665012007958e-07, + "loss": 0.273, + "step": 29180 + }, + { + "epoch": 0.85, + "grad_norm": 1.4451938468608172, + "learning_rate": 6.061423172695935e-07, + "loss": 0.2744, + "step": 29181 + }, + { + "epoch": 0.85, + "grad_norm": 1.2339870204904144, + "learning_rate": 6.059181721142338e-07, + "loss": 0.2686, + "step": 29182 + }, + { + "epoch": 0.85, + "grad_norm": 1.7897700116190285, + "learning_rate": 6.05694065736696e-07, + "loss": 0.2903, + "step": 29183 + }, + { + "epoch": 0.85, + "grad_norm": 1.2842811806169818, + "learning_rate": 6.054699981389562e-07, + "loss": 0.2856, + "step": 29184 + }, + { + "epoch": 0.85, + "grad_norm": 1.5140185915170756, + "learning_rate": 6.052459693229917e-07, + "loss": 0.2781, + "step": 29185 + }, + { + "epoch": 0.85, + "grad_norm": 1.781437834899125, + "learning_rate": 6.050219792907807e-07, + "loss": 0.302, + "step": 29186 + }, + { + "epoch": 0.85, + "grad_norm": 1.2283345000130943, + "learning_rate": 6.047980280443e-07, + "loss": 0.2648, + "step": 29187 + }, + { + "epoch": 0.85, + "grad_norm": 0.953962535505458, + "learning_rate": 6.045741155855245e-07, + "loss": 0.554, + "step": 29188 + }, + { + "epoch": 0.85, + "grad_norm": 1.4844122959297223, + "learning_rate": 6.043502419164315e-07, + "loss": 0.2846, + "step": 29189 + }, + { + "epoch": 0.85, + "grad_norm": 1.4381443589165166, + "learning_rate": 6.041264070389968e-07, + "loss": 0.2642, + "step": 29190 + }, + { + "epoch": 0.85, + "grad_norm": 1.3416260482406641, + "learning_rate": 6.039026109551949e-07, + "loss": 0.2554, + "step": 29191 + }, + { + "epoch": 0.85, + "grad_norm": 1.41190166518506, + "learning_rate": 6.036788536670024e-07, + "loss": 0.3154, + "step": 29192 + }, + { + "epoch": 0.85, + "grad_norm": 2.76878525102711, + "learning_rate": 6.034551351763918e-07, + "loss": 0.2736, + "step": 29193 + }, + { + "epoch": 0.85, + "grad_norm": 1.3172487051075106, + "learning_rate": 6.032314554853385e-07, + "loss": 0.2661, + "step": 29194 + }, + { + "epoch": 0.85, + "grad_norm": 1.475234456966493, + "learning_rate": 6.030078145958163e-07, + "loss": 0.2634, + "step": 29195 + }, + { + "epoch": 0.85, + "grad_norm": 1.377252022176153, + "learning_rate": 6.027842125097988e-07, + "loss": 0.2648, + "step": 29196 + }, + { + "epoch": 0.85, + "grad_norm": 1.443500048525451, + "learning_rate": 6.02560649229259e-07, + "loss": 0.2732, + "step": 29197 + }, + { + "epoch": 0.85, + "grad_norm": 1.4332715427422544, + "learning_rate": 6.02337124756171e-07, + "loss": 0.2585, + "step": 29198 + }, + { + "epoch": 0.85, + "grad_norm": 1.3412212829440813, + "learning_rate": 6.021136390925064e-07, + "loss": 0.2852, + "step": 29199 + }, + { + "epoch": 0.85, + "grad_norm": 1.2854067787890346, + "learning_rate": 6.018901922402387e-07, + "loss": 0.2566, + "step": 29200 + }, + { + "epoch": 0.85, + "grad_norm": 1.4825146712067145, + "learning_rate": 6.016667842013379e-07, + "loss": 0.2639, + "step": 29201 + }, + { + "epoch": 0.85, + "grad_norm": 1.2721659486272294, + "learning_rate": 6.014434149777765e-07, + "loss": 0.2707, + "step": 29202 + }, + { + "epoch": 0.85, + "grad_norm": 1.614976884228006, + "learning_rate": 6.012200845715255e-07, + "loss": 0.2782, + "step": 29203 + }, + { + "epoch": 0.85, + "grad_norm": 1.5000338126358452, + "learning_rate": 6.009967929845561e-07, + "loss": 0.2757, + "step": 29204 + }, + { + "epoch": 0.85, + "grad_norm": 1.3463270290865739, + "learning_rate": 6.007735402188392e-07, + "loss": 0.254, + "step": 29205 + }, + { + "epoch": 0.85, + "grad_norm": 1.3575862312609885, + "learning_rate": 6.005503262763445e-07, + "loss": 0.2598, + "step": 29206 + }, + { + "epoch": 0.85, + "grad_norm": 1.2526955179114783, + "learning_rate": 6.003271511590414e-07, + "loss": 0.2872, + "step": 29207 + }, + { + "epoch": 0.85, + "grad_norm": 1.3191451877161295, + "learning_rate": 6.001040148689019e-07, + "loss": 0.2957, + "step": 29208 + }, + { + "epoch": 0.85, + "grad_norm": 1.3237674542227889, + "learning_rate": 5.998809174078918e-07, + "loss": 0.2679, + "step": 29209 + }, + { + "epoch": 0.85, + "grad_norm": 1.6806668794315942, + "learning_rate": 5.996578587779811e-07, + "loss": 0.2679, + "step": 29210 + }, + { + "epoch": 0.85, + "grad_norm": 1.3171829914197184, + "learning_rate": 5.9943483898114e-07, + "loss": 0.2686, + "step": 29211 + }, + { + "epoch": 0.85, + "grad_norm": 4.338421543301251, + "learning_rate": 5.992118580193345e-07, + "loss": 0.2657, + "step": 29212 + }, + { + "epoch": 0.85, + "grad_norm": 1.327584432309659, + "learning_rate": 5.989889158945328e-07, + "loss": 0.3137, + "step": 29213 + }, + { + "epoch": 0.85, + "grad_norm": 1.931307373259877, + "learning_rate": 5.98766012608703e-07, + "loss": 0.3043, + "step": 29214 + }, + { + "epoch": 0.85, + "grad_norm": 1.380302265543001, + "learning_rate": 5.985431481638115e-07, + "loss": 0.2685, + "step": 29215 + }, + { + "epoch": 0.85, + "grad_norm": 1.4063440616847576, + "learning_rate": 5.983203225618261e-07, + "loss": 0.2748, + "step": 29216 + }, + { + "epoch": 0.85, + "grad_norm": 3.3108320929749198, + "learning_rate": 5.98097535804712e-07, + "loss": 0.2576, + "step": 29217 + }, + { + "epoch": 0.85, + "grad_norm": 2.2730939750093606, + "learning_rate": 5.978747878944369e-07, + "loss": 0.2708, + "step": 29218 + }, + { + "epoch": 0.85, + "grad_norm": 1.530463556102181, + "learning_rate": 5.97652078832966e-07, + "loss": 0.2847, + "step": 29219 + }, + { + "epoch": 0.85, + "grad_norm": 1.3646290158851362, + "learning_rate": 5.974294086222632e-07, + "loss": 0.2854, + "step": 29220 + }, + { + "epoch": 0.85, + "grad_norm": 1.470076269756523, + "learning_rate": 5.972067772642953e-07, + "loss": 0.2943, + "step": 29221 + }, + { + "epoch": 0.85, + "grad_norm": 1.3290086587263392, + "learning_rate": 5.969841847610258e-07, + "loss": 0.2663, + "step": 29222 + }, + { + "epoch": 0.85, + "grad_norm": 2.2220323101502566, + "learning_rate": 5.967616311144203e-07, + "loss": 0.3003, + "step": 29223 + }, + { + "epoch": 0.85, + "grad_norm": 1.3925676441970103, + "learning_rate": 5.965391163264417e-07, + "loss": 0.2647, + "step": 29224 + }, + { + "epoch": 0.85, + "grad_norm": 1.258342748500372, + "learning_rate": 5.963166403990545e-07, + "loss": 0.2713, + "step": 29225 + }, + { + "epoch": 0.85, + "grad_norm": 1.3020919351275893, + "learning_rate": 5.960942033342221e-07, + "loss": 0.2492, + "step": 29226 + }, + { + "epoch": 0.85, + "grad_norm": 1.4146809694448863, + "learning_rate": 5.958718051339069e-07, + "loss": 0.3033, + "step": 29227 + }, + { + "epoch": 0.85, + "grad_norm": 1.536507310327679, + "learning_rate": 5.956494458000728e-07, + "loss": 0.2827, + "step": 29228 + }, + { + "epoch": 0.85, + "grad_norm": 1.4382081264780542, + "learning_rate": 5.954271253346805e-07, + "loss": 0.2764, + "step": 29229 + }, + { + "epoch": 0.85, + "grad_norm": 1.4129439066171219, + "learning_rate": 5.952048437396923e-07, + "loss": 0.2878, + "step": 29230 + }, + { + "epoch": 0.85, + "grad_norm": 1.54789843628552, + "learning_rate": 5.949826010170706e-07, + "loss": 0.2813, + "step": 29231 + }, + { + "epoch": 0.85, + "grad_norm": 1.5264502571775505, + "learning_rate": 5.94760397168776e-07, + "loss": 0.2763, + "step": 29232 + }, + { + "epoch": 0.85, + "grad_norm": 1.9041359062310228, + "learning_rate": 5.945382321967696e-07, + "loss": 0.2538, + "step": 29233 + }, + { + "epoch": 0.85, + "grad_norm": 1.2759524500706703, + "learning_rate": 5.943161061030128e-07, + "loss": 0.2576, + "step": 29234 + }, + { + "epoch": 0.85, + "grad_norm": 1.4308658829700167, + "learning_rate": 5.94094018889465e-07, + "loss": 0.2828, + "step": 29235 + }, + { + "epoch": 0.85, + "grad_norm": 1.2353144002231393, + "learning_rate": 5.938719705580869e-07, + "loss": 0.2879, + "step": 29236 + }, + { + "epoch": 0.85, + "grad_norm": 1.4093980954010366, + "learning_rate": 5.936499611108371e-07, + "loss": 0.2989, + "step": 29237 + }, + { + "epoch": 0.85, + "grad_norm": 1.3931815522799464, + "learning_rate": 5.934279905496759e-07, + "loss": 0.2566, + "step": 29238 + }, + { + "epoch": 0.85, + "grad_norm": 3.6705041962550484, + "learning_rate": 5.932060588765609e-07, + "loss": 0.2646, + "step": 29239 + }, + { + "epoch": 0.85, + "grad_norm": 1.6883549695116589, + "learning_rate": 5.92984166093451e-07, + "loss": 0.3036, + "step": 29240 + }, + { + "epoch": 0.85, + "grad_norm": 1.2387039962790583, + "learning_rate": 5.927623122023052e-07, + "loss": 0.2626, + "step": 29241 + }, + { + "epoch": 0.85, + "grad_norm": 1.392590275781973, + "learning_rate": 5.925404972050802e-07, + "loss": 0.2768, + "step": 29242 + }, + { + "epoch": 0.85, + "grad_norm": 1.3781893448695537, + "learning_rate": 5.923187211037346e-07, + "loss": 0.2655, + "step": 29243 + }, + { + "epoch": 0.85, + "grad_norm": 1.4744149212674276, + "learning_rate": 5.920969839002255e-07, + "loss": 0.2611, + "step": 29244 + }, + { + "epoch": 0.85, + "grad_norm": 1.3360892500938932, + "learning_rate": 5.918752855965088e-07, + "loss": 0.2815, + "step": 29245 + }, + { + "epoch": 0.85, + "grad_norm": 1.3139826531940726, + "learning_rate": 5.916536261945422e-07, + "loss": 0.2928, + "step": 29246 + }, + { + "epoch": 0.85, + "grad_norm": 1.3138519661993395, + "learning_rate": 5.914320056962819e-07, + "loss": 0.278, + "step": 29247 + }, + { + "epoch": 0.85, + "grad_norm": 1.6941360889925658, + "learning_rate": 5.91210424103682e-07, + "loss": 0.2785, + "step": 29248 + }, + { + "epoch": 0.85, + "grad_norm": 1.3040401462858733, + "learning_rate": 5.909888814186992e-07, + "loss": 0.2741, + "step": 29249 + }, + { + "epoch": 0.85, + "grad_norm": 2.337507390565161, + "learning_rate": 5.907673776432882e-07, + "loss": 0.2745, + "step": 29250 + }, + { + "epoch": 0.85, + "grad_norm": 1.3560127559082966, + "learning_rate": 5.905459127794038e-07, + "loss": 0.2732, + "step": 29251 + }, + { + "epoch": 0.85, + "grad_norm": 1.3233477244405396, + "learning_rate": 5.903244868290015e-07, + "loss": 0.2817, + "step": 29252 + }, + { + "epoch": 0.85, + "grad_norm": 1.3972393020316825, + "learning_rate": 5.901030997940338e-07, + "loss": 0.2594, + "step": 29253 + }, + { + "epoch": 0.85, + "grad_norm": 1.3422100034492992, + "learning_rate": 5.898817516764555e-07, + "loss": 0.2906, + "step": 29254 + }, + { + "epoch": 0.85, + "grad_norm": 1.3692084774817264, + "learning_rate": 5.896604424782204e-07, + "loss": 0.2701, + "step": 29255 + }, + { + "epoch": 0.85, + "grad_norm": 1.415508125464748, + "learning_rate": 5.894391722012798e-07, + "loss": 0.2735, + "step": 29256 + }, + { + "epoch": 0.85, + "grad_norm": 1.4066782265222888, + "learning_rate": 5.892179408475878e-07, + "loss": 0.261, + "step": 29257 + }, + { + "epoch": 0.85, + "grad_norm": 1.9508917913706447, + "learning_rate": 5.889967484190962e-07, + "loss": 0.2523, + "step": 29258 + }, + { + "epoch": 0.85, + "grad_norm": 1.3512669908805757, + "learning_rate": 5.887755949177571e-07, + "loss": 0.2578, + "step": 29259 + }, + { + "epoch": 0.85, + "grad_norm": 1.351465918914757, + "learning_rate": 5.885544803455223e-07, + "loss": 0.2817, + "step": 29260 + }, + { + "epoch": 0.85, + "grad_norm": 1.4970715549275826, + "learning_rate": 5.883334047043431e-07, + "loss": 0.2826, + "step": 29261 + }, + { + "epoch": 0.85, + "grad_norm": 1.327713613930285, + "learning_rate": 5.881123679961709e-07, + "loss": 0.2937, + "step": 29262 + }, + { + "epoch": 0.85, + "grad_norm": 1.4271162040663137, + "learning_rate": 5.878913702229561e-07, + "loss": 0.2761, + "step": 29263 + }, + { + "epoch": 0.85, + "grad_norm": 1.4963607792818379, + "learning_rate": 5.876704113866494e-07, + "loss": 0.3113, + "step": 29264 + }, + { + "epoch": 0.85, + "grad_norm": 1.3296771259039823, + "learning_rate": 5.874494914891993e-07, + "loss": 0.264, + "step": 29265 + }, + { + "epoch": 0.85, + "grad_norm": 1.450403645690295, + "learning_rate": 5.872286105325581e-07, + "loss": 0.2769, + "step": 29266 + }, + { + "epoch": 0.85, + "grad_norm": 1.7329549777057311, + "learning_rate": 5.870077685186715e-07, + "loss": 0.2703, + "step": 29267 + }, + { + "epoch": 0.85, + "grad_norm": 1.2422680610984802, + "learning_rate": 5.867869654494906e-07, + "loss": 0.2652, + "step": 29268 + }, + { + "epoch": 0.85, + "grad_norm": 2.267114761149417, + "learning_rate": 5.865662013269641e-07, + "loss": 0.2831, + "step": 29269 + }, + { + "epoch": 0.85, + "grad_norm": 1.4392355317210732, + "learning_rate": 5.863454761530391e-07, + "loss": 0.2551, + "step": 29270 + }, + { + "epoch": 0.85, + "grad_norm": 0.9731891180149385, + "learning_rate": 5.861247899296646e-07, + "loss": 0.5928, + "step": 29271 + }, + { + "epoch": 0.85, + "grad_norm": 1.3397529534864032, + "learning_rate": 5.85904142658788e-07, + "loss": 0.2835, + "step": 29272 + }, + { + "epoch": 0.85, + "grad_norm": 1.3778123905951198, + "learning_rate": 5.856835343423562e-07, + "loss": 0.272, + "step": 29273 + }, + { + "epoch": 0.85, + "grad_norm": 1.4856307865842073, + "learning_rate": 5.854629649823168e-07, + "loss": 0.2771, + "step": 29274 + }, + { + "epoch": 0.85, + "grad_norm": 1.5554884529177022, + "learning_rate": 5.852424345806152e-07, + "loss": 0.2667, + "step": 29275 + }, + { + "epoch": 0.85, + "grad_norm": 2.366241921236422, + "learning_rate": 5.850219431391979e-07, + "loss": 0.2689, + "step": 29276 + }, + { + "epoch": 0.85, + "grad_norm": 1.3760532978270723, + "learning_rate": 5.848014906600108e-07, + "loss": 0.2671, + "step": 29277 + }, + { + "epoch": 0.85, + "grad_norm": 1.380147689895391, + "learning_rate": 5.845810771449995e-07, + "loss": 0.272, + "step": 29278 + }, + { + "epoch": 0.85, + "grad_norm": 1.3684349586910536, + "learning_rate": 5.84360702596109e-07, + "loss": 0.2679, + "step": 29279 + }, + { + "epoch": 0.85, + "grad_norm": 1.4382917211320356, + "learning_rate": 5.841403670152845e-07, + "loss": 0.2839, + "step": 29280 + }, + { + "epoch": 0.85, + "grad_norm": 1.4240839950248885, + "learning_rate": 5.839200704044706e-07, + "loss": 0.2796, + "step": 29281 + }, + { + "epoch": 0.85, + "grad_norm": 1.3504774675611038, + "learning_rate": 5.836998127656107e-07, + "loss": 0.2852, + "step": 29282 + }, + { + "epoch": 0.85, + "grad_norm": 2.002918567290369, + "learning_rate": 5.834795941006505e-07, + "loss": 0.2558, + "step": 29283 + }, + { + "epoch": 0.85, + "grad_norm": 1.4245561731650411, + "learning_rate": 5.8325941441153e-07, + "loss": 0.3009, + "step": 29284 + }, + { + "epoch": 0.85, + "grad_norm": 5.847558526087226, + "learning_rate": 5.830392737001944e-07, + "loss": 0.3244, + "step": 29285 + }, + { + "epoch": 0.85, + "grad_norm": 1.5917423427360315, + "learning_rate": 5.828191719685867e-07, + "loss": 0.248, + "step": 29286 + }, + { + "epoch": 0.85, + "grad_norm": 1.5841875413153417, + "learning_rate": 5.825991092186484e-07, + "loss": 0.2751, + "step": 29287 + }, + { + "epoch": 0.85, + "grad_norm": 1.2802007534572517, + "learning_rate": 5.82379085452322e-07, + "loss": 0.2944, + "step": 29288 + }, + { + "epoch": 0.85, + "grad_norm": 1.5642217814187374, + "learning_rate": 5.821591006715493e-07, + "loss": 0.2471, + "step": 29289 + }, + { + "epoch": 0.85, + "grad_norm": 1.5695380800417311, + "learning_rate": 5.819391548782716e-07, + "loss": 0.2783, + "step": 29290 + }, + { + "epoch": 0.85, + "grad_norm": 1.510956365369447, + "learning_rate": 5.81719248074431e-07, + "loss": 0.2475, + "step": 29291 + }, + { + "epoch": 0.85, + "grad_norm": 1.8288043420583844, + "learning_rate": 5.814993802619656e-07, + "loss": 0.2488, + "step": 29292 + }, + { + "epoch": 0.85, + "grad_norm": 1.5534678177214485, + "learning_rate": 5.812795514428182e-07, + "loss": 0.2812, + "step": 29293 + }, + { + "epoch": 0.85, + "grad_norm": 1.4938028234186151, + "learning_rate": 5.810597616189273e-07, + "loss": 0.2617, + "step": 29294 + }, + { + "epoch": 0.85, + "grad_norm": 1.306305253209463, + "learning_rate": 5.808400107922324e-07, + "loss": 0.2512, + "step": 29295 + }, + { + "epoch": 0.85, + "grad_norm": 1.254723099672241, + "learning_rate": 5.806202989646737e-07, + "loss": 0.2752, + "step": 29296 + }, + { + "epoch": 0.85, + "grad_norm": 1.276068191658126, + "learning_rate": 5.804006261381901e-07, + "loss": 0.2601, + "step": 29297 + }, + { + "epoch": 0.85, + "grad_norm": 1.3304459816852736, + "learning_rate": 5.8018099231472e-07, + "loss": 0.2664, + "step": 29298 + }, + { + "epoch": 0.85, + "grad_norm": 1.3810900111416895, + "learning_rate": 5.799613974962015e-07, + "loss": 0.2663, + "step": 29299 + }, + { + "epoch": 0.85, + "grad_norm": 1.4706547249307622, + "learning_rate": 5.797418416845723e-07, + "loss": 0.2679, + "step": 29300 + }, + { + "epoch": 0.85, + "grad_norm": 1.2913173760240744, + "learning_rate": 5.79522324881771e-07, + "loss": 0.3041, + "step": 29301 + }, + { + "epoch": 0.85, + "grad_norm": 1.7997756379839678, + "learning_rate": 5.793028470897349e-07, + "loss": 0.2798, + "step": 29302 + }, + { + "epoch": 0.85, + "grad_norm": 1.304473633056696, + "learning_rate": 5.790834083103991e-07, + "loss": 0.2569, + "step": 29303 + }, + { + "epoch": 0.85, + "grad_norm": 1.372911031342338, + "learning_rate": 5.788640085457015e-07, + "loss": 0.2682, + "step": 29304 + }, + { + "epoch": 0.85, + "grad_norm": 1.4185782121563735, + "learning_rate": 5.786446477975782e-07, + "loss": 0.2536, + "step": 29305 + }, + { + "epoch": 0.85, + "grad_norm": 1.5502132581184722, + "learning_rate": 5.784253260679651e-07, + "loss": 0.274, + "step": 29306 + }, + { + "epoch": 0.85, + "grad_norm": 1.569551036811528, + "learning_rate": 5.782060433587972e-07, + "loss": 0.2532, + "step": 29307 + }, + { + "epoch": 0.85, + "grad_norm": 1.800534060083042, + "learning_rate": 5.779867996720096e-07, + "loss": 0.2648, + "step": 29308 + }, + { + "epoch": 0.85, + "grad_norm": 1.2689106552952343, + "learning_rate": 5.77767595009538e-07, + "loss": 0.252, + "step": 29309 + }, + { + "epoch": 0.85, + "grad_norm": 1.2542707708460932, + "learning_rate": 5.775484293733175e-07, + "loss": 0.2569, + "step": 29310 + }, + { + "epoch": 0.85, + "grad_norm": 1.686762703456752, + "learning_rate": 5.773293027652805e-07, + "loss": 0.268, + "step": 29311 + }, + { + "epoch": 0.85, + "grad_norm": 1.3334096981576167, + "learning_rate": 5.771102151873609e-07, + "loss": 0.2543, + "step": 29312 + }, + { + "epoch": 0.85, + "grad_norm": 0.9791340590382419, + "learning_rate": 5.768911666414929e-07, + "loss": 0.6225, + "step": 29313 + }, + { + "epoch": 0.85, + "grad_norm": 1.7104482034797284, + "learning_rate": 5.766721571296091e-07, + "loss": 0.2814, + "step": 29314 + }, + { + "epoch": 0.85, + "grad_norm": 2.6947580211972495, + "learning_rate": 5.764531866536433e-07, + "loss": 0.2731, + "step": 29315 + }, + { + "epoch": 0.85, + "grad_norm": 1.3431232306581324, + "learning_rate": 5.762342552155264e-07, + "loss": 0.267, + "step": 29316 + }, + { + "epoch": 0.85, + "grad_norm": 1.4666259896368534, + "learning_rate": 5.760153628171917e-07, + "loss": 0.255, + "step": 29317 + }, + { + "epoch": 0.85, + "grad_norm": 1.5015796833205846, + "learning_rate": 5.757965094605706e-07, + "loss": 0.3028, + "step": 29318 + }, + { + "epoch": 0.85, + "grad_norm": 1.3651736839811168, + "learning_rate": 5.75577695147595e-07, + "loss": 0.2712, + "step": 29319 + }, + { + "epoch": 0.85, + "grad_norm": 1.8407310990091914, + "learning_rate": 5.753589198801946e-07, + "loss": 0.2778, + "step": 29320 + }, + { + "epoch": 0.85, + "grad_norm": 4.237356725689757, + "learning_rate": 5.751401836603021e-07, + "loss": 0.2596, + "step": 29321 + }, + { + "epoch": 0.85, + "grad_norm": 1.6906711841445456, + "learning_rate": 5.74921486489845e-07, + "loss": 0.287, + "step": 29322 + }, + { + "epoch": 0.85, + "grad_norm": 1.5310387377135795, + "learning_rate": 5.747028283707551e-07, + "loss": 0.2433, + "step": 29323 + }, + { + "epoch": 0.85, + "grad_norm": 2.700420517605755, + "learning_rate": 5.744842093049619e-07, + "loss": 0.296, + "step": 29324 + }, + { + "epoch": 0.85, + "grad_norm": 1.3288629324912922, + "learning_rate": 5.742656292943943e-07, + "loss": 0.2604, + "step": 29325 + }, + { + "epoch": 0.85, + "grad_norm": 1.4893448512541076, + "learning_rate": 5.740470883409821e-07, + "loss": 0.2743, + "step": 29326 + }, + { + "epoch": 0.85, + "grad_norm": 1.3123865670691284, + "learning_rate": 5.738285864466537e-07, + "loss": 0.2713, + "step": 29327 + }, + { + "epoch": 0.85, + "grad_norm": 2.6131763904328724, + "learning_rate": 5.736101236133362e-07, + "loss": 0.2644, + "step": 29328 + }, + { + "epoch": 0.85, + "grad_norm": 1.3858401399909668, + "learning_rate": 5.7339169984296e-07, + "loss": 0.2743, + "step": 29329 + }, + { + "epoch": 0.85, + "grad_norm": 1.3840848378155923, + "learning_rate": 5.731733151374502e-07, + "loss": 0.2706, + "step": 29330 + }, + { + "epoch": 0.85, + "grad_norm": 1.6329485882146155, + "learning_rate": 5.729549694987347e-07, + "loss": 0.2978, + "step": 29331 + }, + { + "epoch": 0.85, + "grad_norm": 1.3078658533098506, + "learning_rate": 5.727366629287412e-07, + "loss": 0.2719, + "step": 29332 + }, + { + "epoch": 0.85, + "grad_norm": 1.3834662116012877, + "learning_rate": 5.725183954293956e-07, + "loss": 0.2829, + "step": 29333 + }, + { + "epoch": 0.85, + "grad_norm": 1.522155867351708, + "learning_rate": 5.72300167002624e-07, + "loss": 0.2727, + "step": 29334 + }, + { + "epoch": 0.85, + "grad_norm": 1.5730818639920503, + "learning_rate": 5.720819776503528e-07, + "loss": 0.2771, + "step": 29335 + }, + { + "epoch": 0.85, + "grad_norm": 1.3819815283563535, + "learning_rate": 5.71863827374507e-07, + "loss": 0.2578, + "step": 29336 + }, + { + "epoch": 0.85, + "grad_norm": 1.9121430049365433, + "learning_rate": 5.716457161770123e-07, + "loss": 0.2783, + "step": 29337 + }, + { + "epoch": 0.85, + "grad_norm": 1.4178994212200942, + "learning_rate": 5.714276440597938e-07, + "loss": 0.2917, + "step": 29338 + }, + { + "epoch": 0.85, + "grad_norm": 0.9903868032027224, + "learning_rate": 5.71209611024775e-07, + "loss": 0.5876, + "step": 29339 + }, + { + "epoch": 0.85, + "grad_norm": 1.3102588192194649, + "learning_rate": 5.709916170738805e-07, + "loss": 0.2601, + "step": 29340 + }, + { + "epoch": 0.85, + "grad_norm": 1.3894754085815806, + "learning_rate": 5.707736622090337e-07, + "loss": 0.2723, + "step": 29341 + }, + { + "epoch": 0.85, + "grad_norm": 1.5548899446198627, + "learning_rate": 5.705557464321587e-07, + "loss": 0.2665, + "step": 29342 + }, + { + "epoch": 0.85, + "grad_norm": 1.5249662566203144, + "learning_rate": 5.703378697451789e-07, + "loss": 0.2854, + "step": 29343 + }, + { + "epoch": 0.85, + "grad_norm": 1.3283352044004466, + "learning_rate": 5.701200321500161e-07, + "loss": 0.2555, + "step": 29344 + }, + { + "epoch": 0.85, + "grad_norm": 1.2879129125506914, + "learning_rate": 5.699022336485927e-07, + "loss": 0.2585, + "step": 29345 + }, + { + "epoch": 0.85, + "grad_norm": 1.352519255078921, + "learning_rate": 5.696844742428332e-07, + "loss": 0.2984, + "step": 29346 + }, + { + "epoch": 0.85, + "grad_norm": 1.317687649016138, + "learning_rate": 5.694667539346554e-07, + "loss": 0.265, + "step": 29347 + }, + { + "epoch": 0.85, + "grad_norm": 1.2765066136542425, + "learning_rate": 5.692490727259831e-07, + "loss": 0.2654, + "step": 29348 + }, + { + "epoch": 0.85, + "grad_norm": 1.3602529512936439, + "learning_rate": 5.690314306187378e-07, + "loss": 0.2637, + "step": 29349 + }, + { + "epoch": 0.85, + "grad_norm": 2.0126827657144055, + "learning_rate": 5.688138276148386e-07, + "loss": 0.2693, + "step": 29350 + }, + { + "epoch": 0.85, + "grad_norm": 1.442109777130624, + "learning_rate": 5.685962637162062e-07, + "loss": 0.2721, + "step": 29351 + }, + { + "epoch": 0.85, + "grad_norm": 1.2012709196712192, + "learning_rate": 5.683787389247608e-07, + "loss": 0.2517, + "step": 29352 + }, + { + "epoch": 0.85, + "grad_norm": 1.2961666548374573, + "learning_rate": 5.681612532424225e-07, + "loss": 0.247, + "step": 29353 + }, + { + "epoch": 0.85, + "grad_norm": 1.6042326148215535, + "learning_rate": 5.679438066711101e-07, + "loss": 0.255, + "step": 29354 + }, + { + "epoch": 0.85, + "grad_norm": 1.3768672249347602, + "learning_rate": 5.677263992127429e-07, + "loss": 0.2702, + "step": 29355 + }, + { + "epoch": 0.85, + "grad_norm": 1.3920688088476652, + "learning_rate": 5.675090308692394e-07, + "loss": 0.2608, + "step": 29356 + }, + { + "epoch": 0.85, + "grad_norm": 1.306930605491594, + "learning_rate": 5.672917016425189e-07, + "loss": 0.2585, + "step": 29357 + }, + { + "epoch": 0.85, + "grad_norm": 1.5689487723475555, + "learning_rate": 5.670744115344972e-07, + "loss": 0.2777, + "step": 29358 + }, + { + "epoch": 0.85, + "grad_norm": 1.381467988011028, + "learning_rate": 5.668571605470929e-07, + "loss": 0.279, + "step": 29359 + }, + { + "epoch": 0.85, + "grad_norm": 1.287665212797208, + "learning_rate": 5.666399486822233e-07, + "loss": 0.2547, + "step": 29360 + }, + { + "epoch": 0.85, + "grad_norm": 3.963504542616841, + "learning_rate": 5.664227759418056e-07, + "loss": 0.2739, + "step": 29361 + }, + { + "epoch": 0.85, + "grad_norm": 2.510268160092855, + "learning_rate": 5.662056423277562e-07, + "loss": 0.2916, + "step": 29362 + }, + { + "epoch": 0.85, + "grad_norm": 1.5424195525918698, + "learning_rate": 5.659885478419908e-07, + "loss": 0.3061, + "step": 29363 + }, + { + "epoch": 0.85, + "grad_norm": 1.2699628750783793, + "learning_rate": 5.65771492486426e-07, + "loss": 0.2836, + "step": 29364 + }, + { + "epoch": 0.85, + "grad_norm": 1.3160618105172308, + "learning_rate": 5.655544762629778e-07, + "loss": 0.2721, + "step": 29365 + }, + { + "epoch": 0.85, + "grad_norm": 1.4141993743492531, + "learning_rate": 5.653374991735594e-07, + "loss": 0.2638, + "step": 29366 + }, + { + "epoch": 0.85, + "grad_norm": 1.7453266494329176, + "learning_rate": 5.651205612200872e-07, + "loss": 0.2839, + "step": 29367 + }, + { + "epoch": 0.85, + "grad_norm": 1.3043459781398707, + "learning_rate": 5.649036624044746e-07, + "loss": 0.2797, + "step": 29368 + }, + { + "epoch": 0.85, + "grad_norm": 1.4165443253416152, + "learning_rate": 5.646868027286367e-07, + "loss": 0.2537, + "step": 29369 + }, + { + "epoch": 0.85, + "grad_norm": 1.353703664195557, + "learning_rate": 5.644699821944871e-07, + "loss": 0.2847, + "step": 29370 + }, + { + "epoch": 0.85, + "grad_norm": 1.280775308693643, + "learning_rate": 5.642532008039392e-07, + "loss": 0.2698, + "step": 29371 + }, + { + "epoch": 0.85, + "grad_norm": 7.073082682194295, + "learning_rate": 5.640364585589058e-07, + "loss": 0.2727, + "step": 29372 + }, + { + "epoch": 0.85, + "grad_norm": 1.4365333202958521, + "learning_rate": 5.638197554612995e-07, + "loss": 0.2805, + "step": 29373 + }, + { + "epoch": 0.85, + "grad_norm": 1.3645382548912703, + "learning_rate": 5.636030915130347e-07, + "loss": 0.2939, + "step": 29374 + }, + { + "epoch": 0.85, + "grad_norm": 1.37873596911712, + "learning_rate": 5.633864667160205e-07, + "loss": 0.289, + "step": 29375 + }, + { + "epoch": 0.85, + "grad_norm": 1.3085854484120836, + "learning_rate": 5.631698810721702e-07, + "loss": 0.2686, + "step": 29376 + }, + { + "epoch": 0.85, + "grad_norm": 1.4991278819715819, + "learning_rate": 5.629533345833954e-07, + "loss": 0.2841, + "step": 29377 + }, + { + "epoch": 0.85, + "grad_norm": 1.222043397444021, + "learning_rate": 5.627368272516065e-07, + "loss": 0.2439, + "step": 29378 + }, + { + "epoch": 0.85, + "grad_norm": 1.8343438416227755, + "learning_rate": 5.625203590787137e-07, + "loss": 0.3002, + "step": 29379 + }, + { + "epoch": 0.85, + "grad_norm": 1.613192460283711, + "learning_rate": 5.623039300666278e-07, + "loss": 0.2693, + "step": 29380 + }, + { + "epoch": 0.85, + "grad_norm": 1.408544724894528, + "learning_rate": 5.620875402172593e-07, + "loss": 0.2937, + "step": 29381 + }, + { + "epoch": 0.85, + "grad_norm": 1.3101320094609394, + "learning_rate": 5.618711895325174e-07, + "loss": 0.2562, + "step": 29382 + }, + { + "epoch": 0.85, + "grad_norm": 1.322580180101866, + "learning_rate": 5.616548780143116e-07, + "loss": 0.2894, + "step": 29383 + }, + { + "epoch": 0.85, + "grad_norm": 1.3974211394993241, + "learning_rate": 5.614386056645516e-07, + "loss": 0.2651, + "step": 29384 + }, + { + "epoch": 0.85, + "grad_norm": 2.2531830862073017, + "learning_rate": 5.61222372485144e-07, + "loss": 0.2534, + "step": 29385 + }, + { + "epoch": 0.85, + "grad_norm": 1.8377418852900356, + "learning_rate": 5.610061784779986e-07, + "loss": 0.2867, + "step": 29386 + }, + { + "epoch": 0.85, + "grad_norm": 1.4405264109616762, + "learning_rate": 5.607900236450226e-07, + "loss": 0.2749, + "step": 29387 + }, + { + "epoch": 0.85, + "grad_norm": 1.4512088943447272, + "learning_rate": 5.60573907988124e-07, + "loss": 0.2798, + "step": 29388 + }, + { + "epoch": 0.85, + "grad_norm": 1.9606732502001054, + "learning_rate": 5.6035783150921e-07, + "loss": 0.2659, + "step": 29389 + }, + { + "epoch": 0.85, + "grad_norm": 1.400062846154378, + "learning_rate": 5.601417942101872e-07, + "loss": 0.2966, + "step": 29390 + }, + { + "epoch": 0.85, + "grad_norm": 1.4046757131828256, + "learning_rate": 5.599257960929627e-07, + "loss": 0.2756, + "step": 29391 + }, + { + "epoch": 0.85, + "grad_norm": 1.2949109059841772, + "learning_rate": 5.597098371594417e-07, + "loss": 0.2828, + "step": 29392 + }, + { + "epoch": 0.85, + "grad_norm": 1.508782837963352, + "learning_rate": 5.594939174115322e-07, + "loss": 0.2634, + "step": 29393 + }, + { + "epoch": 0.85, + "grad_norm": 1.4333524299484885, + "learning_rate": 5.592780368511369e-07, + "loss": 0.2833, + "step": 29394 + }, + { + "epoch": 0.85, + "grad_norm": 1.455491175429515, + "learning_rate": 5.590621954801623e-07, + "loss": 0.2734, + "step": 29395 + }, + { + "epoch": 0.85, + "grad_norm": 1.3677537490800988, + "learning_rate": 5.588463933005128e-07, + "loss": 0.259, + "step": 29396 + }, + { + "epoch": 0.85, + "grad_norm": 1.4856861685626916, + "learning_rate": 5.586306303140931e-07, + "loss": 0.2803, + "step": 29397 + }, + { + "epoch": 0.85, + "grad_norm": 1.43140807092977, + "learning_rate": 5.584149065228078e-07, + "loss": 0.2882, + "step": 29398 + }, + { + "epoch": 0.85, + "grad_norm": 1.3299333275878458, + "learning_rate": 5.581992219285598e-07, + "loss": 0.2686, + "step": 29399 + }, + { + "epoch": 0.85, + "grad_norm": 3.365897984078076, + "learning_rate": 5.579835765332536e-07, + "loss": 0.2674, + "step": 29400 + }, + { + "epoch": 0.85, + "grad_norm": 1.4517931003037863, + "learning_rate": 5.577679703387917e-07, + "loss": 0.289, + "step": 29401 + }, + { + "epoch": 0.85, + "grad_norm": 2.759973200789558, + "learning_rate": 5.575524033470764e-07, + "loss": 0.3017, + "step": 29402 + }, + { + "epoch": 0.85, + "grad_norm": 1.3748054792716065, + "learning_rate": 5.5733687556001e-07, + "loss": 0.2839, + "step": 29403 + }, + { + "epoch": 0.85, + "grad_norm": 1.2429005430492834, + "learning_rate": 5.571213869794956e-07, + "loss": 0.2724, + "step": 29404 + }, + { + "epoch": 0.85, + "grad_norm": 1.8127771192231668, + "learning_rate": 5.569059376074348e-07, + "loss": 0.2572, + "step": 29405 + }, + { + "epoch": 0.85, + "grad_norm": 0.9710739570871557, + "learning_rate": 5.566905274457274e-07, + "loss": 0.5585, + "step": 29406 + }, + { + "epoch": 0.85, + "grad_norm": 1.293952989146746, + "learning_rate": 5.564751564962756e-07, + "loss": 0.2743, + "step": 29407 + }, + { + "epoch": 0.85, + "grad_norm": 1.4234159733949734, + "learning_rate": 5.562598247609796e-07, + "loss": 0.2931, + "step": 29408 + }, + { + "epoch": 0.85, + "grad_norm": 1.8609167004791243, + "learning_rate": 5.560445322417396e-07, + "loss": 0.2725, + "step": 29409 + }, + { + "epoch": 0.85, + "grad_norm": 1.319027130331858, + "learning_rate": 5.558292789404563e-07, + "loss": 0.2527, + "step": 29410 + }, + { + "epoch": 0.85, + "grad_norm": 2.8218843788140995, + "learning_rate": 5.556140648590302e-07, + "loss": 0.2841, + "step": 29411 + }, + { + "epoch": 0.85, + "grad_norm": 1.3708155196686445, + "learning_rate": 5.553988899993579e-07, + "loss": 0.2771, + "step": 29412 + }, + { + "epoch": 0.85, + "grad_norm": 1.419784856221228, + "learning_rate": 5.551837543633398e-07, + "loss": 0.2904, + "step": 29413 + }, + { + "epoch": 0.85, + "grad_norm": 1.448479897534117, + "learning_rate": 5.549686579528746e-07, + "loss": 0.2522, + "step": 29414 + }, + { + "epoch": 0.85, + "grad_norm": 1.430219255877035, + "learning_rate": 5.547536007698601e-07, + "loss": 0.2727, + "step": 29415 + }, + { + "epoch": 0.85, + "grad_norm": 8.276635657548004, + "learning_rate": 5.545385828161942e-07, + "loss": 0.283, + "step": 29416 + }, + { + "epoch": 0.85, + "grad_norm": 1.5088434053940059, + "learning_rate": 5.543236040937744e-07, + "loss": 0.259, + "step": 29417 + }, + { + "epoch": 0.85, + "grad_norm": 1.2700420799820005, + "learning_rate": 5.541086646044985e-07, + "loss": 0.2885, + "step": 29418 + }, + { + "epoch": 0.85, + "grad_norm": 1.3857130808844256, + "learning_rate": 5.538937643502629e-07, + "loss": 0.2654, + "step": 29419 + }, + { + "epoch": 0.85, + "grad_norm": 1.4187479956437594, + "learning_rate": 5.536789033329654e-07, + "loss": 0.2654, + "step": 29420 + }, + { + "epoch": 0.85, + "grad_norm": 1.3389899383545667, + "learning_rate": 5.534640815544994e-07, + "loss": 0.2513, + "step": 29421 + }, + { + "epoch": 0.85, + "grad_norm": 1.34762030706331, + "learning_rate": 5.532492990167626e-07, + "loss": 0.268, + "step": 29422 + }, + { + "epoch": 0.85, + "grad_norm": 1.3575191251684697, + "learning_rate": 5.530345557216504e-07, + "loss": 0.2735, + "step": 29423 + }, + { + "epoch": 0.85, + "grad_norm": 1.3665875247415213, + "learning_rate": 5.52819851671057e-07, + "loss": 0.2684, + "step": 29424 + }, + { + "epoch": 0.85, + "grad_norm": 1.4895772091047597, + "learning_rate": 5.526051868668781e-07, + "loss": 0.265, + "step": 29425 + }, + { + "epoch": 0.85, + "grad_norm": 1.2105616989123744, + "learning_rate": 5.523905613110076e-07, + "loss": 0.2468, + "step": 29426 + }, + { + "epoch": 0.85, + "grad_norm": 1.3207534423458587, + "learning_rate": 5.521759750053396e-07, + "loss": 0.2522, + "step": 29427 + }, + { + "epoch": 0.85, + "grad_norm": 1.4230056899055443, + "learning_rate": 5.519614279517676e-07, + "loss": 0.2564, + "step": 29428 + }, + { + "epoch": 0.85, + "grad_norm": 1.220682048949825, + "learning_rate": 5.517469201521869e-07, + "loss": 0.2567, + "step": 29429 + }, + { + "epoch": 0.85, + "grad_norm": 1.3984182457679115, + "learning_rate": 5.51532451608488e-07, + "loss": 0.2772, + "step": 29430 + }, + { + "epoch": 0.85, + "grad_norm": 1.298363571164761, + "learning_rate": 5.513180223225645e-07, + "loss": 0.2722, + "step": 29431 + }, + { + "epoch": 0.85, + "grad_norm": 1.2989545633868522, + "learning_rate": 5.511036322963087e-07, + "loss": 0.2609, + "step": 29432 + }, + { + "epoch": 0.85, + "grad_norm": 1.4474599912403447, + "learning_rate": 5.508892815316136e-07, + "loss": 0.2848, + "step": 29433 + }, + { + "epoch": 0.85, + "grad_norm": 1.361959113560904, + "learning_rate": 5.506749700303693e-07, + "loss": 0.3056, + "step": 29434 + }, + { + "epoch": 0.85, + "grad_norm": 1.30320504976911, + "learning_rate": 5.504606977944677e-07, + "loss": 0.2582, + "step": 29435 + }, + { + "epoch": 0.85, + "grad_norm": 1.3574962083394757, + "learning_rate": 5.502464648257993e-07, + "loss": 0.2847, + "step": 29436 + }, + { + "epoch": 0.85, + "grad_norm": 1.402697066422074, + "learning_rate": 5.500322711262556e-07, + "loss": 0.2729, + "step": 29437 + }, + { + "epoch": 0.85, + "grad_norm": 1.3426946938655409, + "learning_rate": 5.498181166977262e-07, + "loss": 0.2723, + "step": 29438 + }, + { + "epoch": 0.85, + "grad_norm": 1.2303173060094246, + "learning_rate": 5.496040015421028e-07, + "loss": 0.2584, + "step": 29439 + }, + { + "epoch": 0.85, + "grad_norm": 1.2127258104946004, + "learning_rate": 5.493899256612722e-07, + "loss": 0.2712, + "step": 29440 + }, + { + "epoch": 0.85, + "grad_norm": 1.3957640982248039, + "learning_rate": 5.491758890571247e-07, + "loss": 0.2679, + "step": 29441 + }, + { + "epoch": 0.85, + "grad_norm": 1.3545208681125962, + "learning_rate": 5.489618917315498e-07, + "loss": 0.2539, + "step": 29442 + }, + { + "epoch": 0.85, + "grad_norm": 1.546834008655472, + "learning_rate": 5.487479336864348e-07, + "loss": 0.2668, + "step": 29443 + }, + { + "epoch": 0.85, + "grad_norm": 1.5491097709098667, + "learning_rate": 5.485340149236696e-07, + "loss": 0.2513, + "step": 29444 + }, + { + "epoch": 0.85, + "grad_norm": 1.4908731913352047, + "learning_rate": 5.483201354451406e-07, + "loss": 0.2795, + "step": 29445 + }, + { + "epoch": 0.85, + "grad_norm": 1.2876679978254393, + "learning_rate": 5.481062952527355e-07, + "loss": 0.2877, + "step": 29446 + }, + { + "epoch": 0.85, + "grad_norm": 1.753083397815016, + "learning_rate": 5.478924943483432e-07, + "loss": 0.3163, + "step": 29447 + }, + { + "epoch": 0.85, + "grad_norm": 1.4364158080654639, + "learning_rate": 5.476787327338478e-07, + "loss": 0.259, + "step": 29448 + }, + { + "epoch": 0.85, + "grad_norm": 1.5166505938210892, + "learning_rate": 5.474650104111373e-07, + "loss": 0.2816, + "step": 29449 + }, + { + "epoch": 0.85, + "grad_norm": 1.3363034186941234, + "learning_rate": 5.472513273820973e-07, + "loss": 0.2538, + "step": 29450 + }, + { + "epoch": 0.85, + "grad_norm": 1.2962497268259847, + "learning_rate": 5.470376836486135e-07, + "loss": 0.2629, + "step": 29451 + }, + { + "epoch": 0.85, + "grad_norm": 1.2537758721326429, + "learning_rate": 5.468240792125717e-07, + "loss": 0.2647, + "step": 29452 + }, + { + "epoch": 0.85, + "grad_norm": 1.4015648818283224, + "learning_rate": 5.466105140758571e-07, + "loss": 0.2554, + "step": 29453 + }, + { + "epoch": 0.85, + "grad_norm": 1.4278566273812099, + "learning_rate": 5.463969882403536e-07, + "loss": 0.2687, + "step": 29454 + }, + { + "epoch": 0.85, + "grad_norm": 1.3116350292517718, + "learning_rate": 5.461835017079465e-07, + "loss": 0.3083, + "step": 29455 + }, + { + "epoch": 0.85, + "grad_norm": 1.3987491968567287, + "learning_rate": 5.459700544805203e-07, + "loss": 0.2509, + "step": 29456 + }, + { + "epoch": 0.85, + "grad_norm": 1.6603912310179751, + "learning_rate": 5.457566465599562e-07, + "loss": 0.2751, + "step": 29457 + }, + { + "epoch": 0.85, + "grad_norm": 1.4507946267742613, + "learning_rate": 5.455432779481401e-07, + "loss": 0.2753, + "step": 29458 + }, + { + "epoch": 0.85, + "grad_norm": 1.5617595144470668, + "learning_rate": 5.453299486469532e-07, + "loss": 0.3009, + "step": 29459 + }, + { + "epoch": 0.85, + "grad_norm": 2.447523110108493, + "learning_rate": 5.45116658658279e-07, + "loss": 0.2611, + "step": 29460 + }, + { + "epoch": 0.85, + "grad_norm": 1.3581414146167001, + "learning_rate": 5.44903407984001e-07, + "loss": 0.2602, + "step": 29461 + }, + { + "epoch": 0.85, + "grad_norm": 1.2844269877811763, + "learning_rate": 5.446901966259987e-07, + "loss": 0.2734, + "step": 29462 + }, + { + "epoch": 0.85, + "grad_norm": 1.7402729944665098, + "learning_rate": 5.444770245861553e-07, + "loss": 0.2874, + "step": 29463 + }, + { + "epoch": 0.85, + "grad_norm": 1.388692635399056, + "learning_rate": 5.442638918663512e-07, + "loss": 0.2672, + "step": 29464 + }, + { + "epoch": 0.85, + "grad_norm": 2.027446185790789, + "learning_rate": 5.440507984684673e-07, + "loss": 0.2617, + "step": 29465 + }, + { + "epoch": 0.85, + "grad_norm": 1.4566003410752115, + "learning_rate": 5.43837744394386e-07, + "loss": 0.2781, + "step": 29466 + }, + { + "epoch": 0.85, + "grad_norm": 1.4056302251835242, + "learning_rate": 5.436247296459851e-07, + "loss": 0.2749, + "step": 29467 + }, + { + "epoch": 0.85, + "grad_norm": 1.5052265362863262, + "learning_rate": 5.434117542251455e-07, + "loss": 0.2527, + "step": 29468 + }, + { + "epoch": 0.85, + "grad_norm": 1.2810772136677342, + "learning_rate": 5.431988181337461e-07, + "loss": 0.2676, + "step": 29469 + }, + { + "epoch": 0.85, + "grad_norm": 1.5647709852855936, + "learning_rate": 5.429859213736671e-07, + "loss": 0.29, + "step": 29470 + }, + { + "epoch": 0.85, + "grad_norm": 2.142587241001323, + "learning_rate": 5.427730639467866e-07, + "loss": 0.2715, + "step": 29471 + }, + { + "epoch": 0.85, + "grad_norm": 1.457159630582793, + "learning_rate": 5.425602458549828e-07, + "loss": 0.2739, + "step": 29472 + }, + { + "epoch": 0.85, + "grad_norm": 1.8906009730168243, + "learning_rate": 5.42347467100135e-07, + "loss": 0.2759, + "step": 29473 + }, + { + "epoch": 0.85, + "grad_norm": 1.6592090655690892, + "learning_rate": 5.421347276841199e-07, + "loss": 0.278, + "step": 29474 + }, + { + "epoch": 0.85, + "grad_norm": 1.5723011192513587, + "learning_rate": 5.419220276088161e-07, + "loss": 0.2616, + "step": 29475 + }, + { + "epoch": 0.85, + "grad_norm": 1.3032545564025597, + "learning_rate": 5.417093668760992e-07, + "loss": 0.2682, + "step": 29476 + }, + { + "epoch": 0.85, + "grad_norm": 2.1984492678229026, + "learning_rate": 5.414967454878462e-07, + "loss": 0.273, + "step": 29477 + }, + { + "epoch": 0.86, + "grad_norm": 1.3161135932974408, + "learning_rate": 5.412841634459343e-07, + "loss": 0.2514, + "step": 29478 + }, + { + "epoch": 0.86, + "grad_norm": 1.9645041751523349, + "learning_rate": 5.410716207522387e-07, + "loss": 0.2892, + "step": 29479 + }, + { + "epoch": 0.86, + "grad_norm": 1.4145124038170322, + "learning_rate": 5.408591174086358e-07, + "loss": 0.2716, + "step": 29480 + }, + { + "epoch": 0.86, + "grad_norm": 1.3639023925811835, + "learning_rate": 5.406466534170008e-07, + "loss": 0.2778, + "step": 29481 + }, + { + "epoch": 0.86, + "grad_norm": 0.9092022747216927, + "learning_rate": 5.404342287792081e-07, + "loss": 0.5453, + "step": 29482 + }, + { + "epoch": 0.86, + "grad_norm": 1.6720362223576488, + "learning_rate": 5.40221843497134e-07, + "loss": 0.2641, + "step": 29483 + }, + { + "epoch": 0.86, + "grad_norm": 1.4074701030261323, + "learning_rate": 5.400094975726511e-07, + "loss": 0.2549, + "step": 29484 + }, + { + "epoch": 0.86, + "grad_norm": 1.522696119735161, + "learning_rate": 5.397971910076333e-07, + "loss": 0.2954, + "step": 29485 + }, + { + "epoch": 0.86, + "grad_norm": 1.7947218326396395, + "learning_rate": 5.395849238039547e-07, + "loss": 0.2628, + "step": 29486 + }, + { + "epoch": 0.86, + "grad_norm": 1.468557441767362, + "learning_rate": 5.393726959634888e-07, + "loss": 0.2883, + "step": 29487 + }, + { + "epoch": 0.86, + "grad_norm": 1.432892689703916, + "learning_rate": 5.391605074881084e-07, + "loss": 0.2542, + "step": 29488 + }, + { + "epoch": 0.86, + "grad_norm": 1.3671477156454388, + "learning_rate": 5.389483583796873e-07, + "loss": 0.2655, + "step": 29489 + }, + { + "epoch": 0.86, + "grad_norm": 1.3954863275136122, + "learning_rate": 5.387362486400949e-07, + "loss": 0.2762, + "step": 29490 + }, + { + "epoch": 0.86, + "grad_norm": 1.2564351306732806, + "learning_rate": 5.385241782712048e-07, + "loss": 0.285, + "step": 29491 + }, + { + "epoch": 0.86, + "grad_norm": 1.2721837401085248, + "learning_rate": 5.383121472748881e-07, + "loss": 0.2642, + "step": 29492 + }, + { + "epoch": 0.86, + "grad_norm": 1.2796470733419512, + "learning_rate": 5.381001556530169e-07, + "loss": 0.275, + "step": 29493 + }, + { + "epoch": 0.86, + "grad_norm": 1.2577550231885732, + "learning_rate": 5.378882034074618e-07, + "loss": 0.2736, + "step": 29494 + }, + { + "epoch": 0.86, + "grad_norm": 1.3329555756294789, + "learning_rate": 5.376762905400917e-07, + "loss": 0.2721, + "step": 29495 + }, + { + "epoch": 0.86, + "grad_norm": 1.3774355258977795, + "learning_rate": 5.374644170527782e-07, + "loss": 0.2692, + "step": 29496 + }, + { + "epoch": 0.86, + "grad_norm": 1.4501174270712942, + "learning_rate": 5.372525829473901e-07, + "loss": 0.2927, + "step": 29497 + }, + { + "epoch": 0.86, + "grad_norm": 1.4005422866624735, + "learning_rate": 5.370407882257983e-07, + "loss": 0.2711, + "step": 29498 + }, + { + "epoch": 0.86, + "grad_norm": 1.373410578587539, + "learning_rate": 5.368290328898706e-07, + "loss": 0.2845, + "step": 29499 + }, + { + "epoch": 0.86, + "grad_norm": 1.994491689889052, + "learning_rate": 5.366173169414763e-07, + "loss": 0.2682, + "step": 29500 + }, + { + "epoch": 0.86, + "grad_norm": 1.4355261856442705, + "learning_rate": 5.364056403824835e-07, + "loss": 0.273, + "step": 29501 + }, + { + "epoch": 0.86, + "grad_norm": 1.3958532103814987, + "learning_rate": 5.361940032147617e-07, + "loss": 0.2821, + "step": 29502 + }, + { + "epoch": 0.86, + "grad_norm": 1.2872782984388702, + "learning_rate": 5.359824054401758e-07, + "loss": 0.2701, + "step": 29503 + }, + { + "epoch": 0.86, + "grad_norm": 1.3471282940291474, + "learning_rate": 5.357708470605955e-07, + "loss": 0.2722, + "step": 29504 + }, + { + "epoch": 0.86, + "grad_norm": 1.3644658977499615, + "learning_rate": 5.355593280778864e-07, + "loss": 0.2763, + "step": 29505 + }, + { + "epoch": 0.86, + "grad_norm": 1.3456475605761908, + "learning_rate": 5.353478484939156e-07, + "loss": 0.267, + "step": 29506 + }, + { + "epoch": 0.86, + "grad_norm": 1.3021210191186043, + "learning_rate": 5.351364083105503e-07, + "loss": 0.2858, + "step": 29507 + }, + { + "epoch": 0.86, + "grad_norm": 1.3231218247833312, + "learning_rate": 5.349250075296552e-07, + "loss": 0.269, + "step": 29508 + }, + { + "epoch": 0.86, + "grad_norm": 0.949200796278084, + "learning_rate": 5.347136461530966e-07, + "loss": 0.5695, + "step": 29509 + }, + { + "epoch": 0.86, + "grad_norm": 1.4988600662448723, + "learning_rate": 5.345023241827396e-07, + "loss": 0.2567, + "step": 29510 + }, + { + "epoch": 0.86, + "grad_norm": 1.3868968162445787, + "learning_rate": 5.3429104162045e-07, + "loss": 0.2817, + "step": 29511 + }, + { + "epoch": 0.86, + "grad_norm": 1.410634773345574, + "learning_rate": 5.340797984680906e-07, + "loss": 0.2901, + "step": 29512 + }, + { + "epoch": 0.86, + "grad_norm": 1.4418522346763603, + "learning_rate": 5.338685947275269e-07, + "loss": 0.2883, + "step": 29513 + }, + { + "epoch": 0.86, + "grad_norm": 3.136588278711423, + "learning_rate": 5.33657430400622e-07, + "loss": 0.3106, + "step": 29514 + }, + { + "epoch": 0.86, + "grad_norm": 1.3264116834615483, + "learning_rate": 5.334463054892397e-07, + "loss": 0.2828, + "step": 29515 + }, + { + "epoch": 0.86, + "grad_norm": 0.9287739025046509, + "learning_rate": 5.332352199952434e-07, + "loss": 0.5678, + "step": 29516 + }, + { + "epoch": 0.86, + "grad_norm": 1.3352245187973015, + "learning_rate": 5.330241739204967e-07, + "loss": 0.288, + "step": 29517 + }, + { + "epoch": 0.86, + "grad_norm": 1.5980120630382908, + "learning_rate": 5.328131672668607e-07, + "loss": 0.2433, + "step": 29518 + }, + { + "epoch": 0.86, + "grad_norm": 1.4820409772080905, + "learning_rate": 5.326022000361975e-07, + "loss": 0.266, + "step": 29519 + }, + { + "epoch": 0.86, + "grad_norm": 3.175939586117147, + "learning_rate": 5.323912722303697e-07, + "loss": 0.2881, + "step": 29520 + }, + { + "epoch": 0.86, + "grad_norm": 3.2535605783237536, + "learning_rate": 5.321803838512396e-07, + "loss": 0.2802, + "step": 29521 + }, + { + "epoch": 0.86, + "grad_norm": 0.9823605039445825, + "learning_rate": 5.31969534900666e-07, + "loss": 0.5598, + "step": 29522 + }, + { + "epoch": 0.86, + "grad_norm": 1.2798795020236997, + "learning_rate": 5.317587253805113e-07, + "loss": 0.2667, + "step": 29523 + }, + { + "epoch": 0.86, + "grad_norm": 1.194574776275654, + "learning_rate": 5.315479552926344e-07, + "loss": 0.2895, + "step": 29524 + }, + { + "epoch": 0.86, + "grad_norm": 4.055215075733647, + "learning_rate": 5.313372246388971e-07, + "loss": 0.2593, + "step": 29525 + }, + { + "epoch": 0.86, + "grad_norm": 1.4710569025341589, + "learning_rate": 5.311265334211585e-07, + "loss": 0.2665, + "step": 29526 + }, + { + "epoch": 0.86, + "grad_norm": 1.3660118566989896, + "learning_rate": 5.309158816412774e-07, + "loss": 0.2651, + "step": 29527 + }, + { + "epoch": 0.86, + "grad_norm": 1.3701864564646784, + "learning_rate": 5.307052693011133e-07, + "loss": 0.2814, + "step": 29528 + }, + { + "epoch": 0.86, + "grad_norm": 1.3161772998154067, + "learning_rate": 5.304946964025248e-07, + "loss": 0.2761, + "step": 29529 + }, + { + "epoch": 0.86, + "grad_norm": 1.3680039069974506, + "learning_rate": 5.302841629473715e-07, + "loss": 0.2831, + "step": 29530 + }, + { + "epoch": 0.86, + "grad_norm": 1.26843442480274, + "learning_rate": 5.300736689375086e-07, + "loss": 0.2631, + "step": 29531 + }, + { + "epoch": 0.86, + "grad_norm": 1.381357807386149, + "learning_rate": 5.298632143747956e-07, + "loss": 0.2766, + "step": 29532 + }, + { + "epoch": 0.86, + "grad_norm": 1.5286650912594337, + "learning_rate": 5.296527992610889e-07, + "loss": 0.2879, + "step": 29533 + }, + { + "epoch": 0.86, + "grad_norm": 1.3545053471011643, + "learning_rate": 5.294424235982459e-07, + "loss": 0.2673, + "step": 29534 + }, + { + "epoch": 0.86, + "grad_norm": 1.3252084823443544, + "learning_rate": 5.292320873881235e-07, + "loss": 0.2794, + "step": 29535 + }, + { + "epoch": 0.86, + "grad_norm": 1.3322683382881058, + "learning_rate": 5.290217906325773e-07, + "loss": 0.2711, + "step": 29536 + }, + { + "epoch": 0.86, + "grad_norm": 1.3517150255996602, + "learning_rate": 5.288115333334631e-07, + "loss": 0.2659, + "step": 29537 + }, + { + "epoch": 0.86, + "grad_norm": 1.7244343654297083, + "learning_rate": 5.286013154926379e-07, + "loss": 0.27, + "step": 29538 + }, + { + "epoch": 0.86, + "grad_norm": 1.3448435532466958, + "learning_rate": 5.283911371119549e-07, + "loss": 0.272, + "step": 29539 + }, + { + "epoch": 0.86, + "grad_norm": 1.279260099721594, + "learning_rate": 5.281809981932695e-07, + "loss": 0.2764, + "step": 29540 + }, + { + "epoch": 0.86, + "grad_norm": 1.3849435767003628, + "learning_rate": 5.279708987384363e-07, + "loss": 0.2745, + "step": 29541 + }, + { + "epoch": 0.86, + "grad_norm": 1.3413681737823244, + "learning_rate": 5.2776083874931e-07, + "loss": 0.2698, + "step": 29542 + }, + { + "epoch": 0.86, + "grad_norm": 1.3237455259706488, + "learning_rate": 5.275508182277434e-07, + "loss": 0.2748, + "step": 29543 + }, + { + "epoch": 0.86, + "grad_norm": 1.2747149752852978, + "learning_rate": 5.273408371755906e-07, + "loss": 0.3281, + "step": 29544 + }, + { + "epoch": 0.86, + "grad_norm": 1.9229421863586984, + "learning_rate": 5.271308955947046e-07, + "loss": 0.2625, + "step": 29545 + }, + { + "epoch": 0.86, + "grad_norm": 1.3985989829825811, + "learning_rate": 5.26920993486939e-07, + "loss": 0.2781, + "step": 29546 + }, + { + "epoch": 0.86, + "grad_norm": 1.285848684401022, + "learning_rate": 5.267111308541445e-07, + "loss": 0.2611, + "step": 29547 + }, + { + "epoch": 0.86, + "grad_norm": 1.5626079042936056, + "learning_rate": 5.265013076981734e-07, + "loss": 0.2486, + "step": 29548 + }, + { + "epoch": 0.86, + "grad_norm": 1.6077103141661258, + "learning_rate": 5.262915240208794e-07, + "loss": 0.2904, + "step": 29549 + }, + { + "epoch": 0.86, + "grad_norm": 1.3050225024288535, + "learning_rate": 5.260817798241114e-07, + "loss": 0.28, + "step": 29550 + }, + { + "epoch": 0.86, + "grad_norm": 1.5295193946947847, + "learning_rate": 5.258720751097207e-07, + "loss": 0.2854, + "step": 29551 + }, + { + "epoch": 0.86, + "grad_norm": 3.630699333380374, + "learning_rate": 5.256624098795587e-07, + "loss": 0.2559, + "step": 29552 + }, + { + "epoch": 0.86, + "grad_norm": 1.4977318263090869, + "learning_rate": 5.254527841354762e-07, + "loss": 0.2581, + "step": 29553 + }, + { + "epoch": 0.86, + "grad_norm": 2.093274505205371, + "learning_rate": 5.252431978793221e-07, + "loss": 0.2683, + "step": 29554 + }, + { + "epoch": 0.86, + "grad_norm": 1.261438241654434, + "learning_rate": 5.250336511129462e-07, + "loss": 0.2728, + "step": 29555 + }, + { + "epoch": 0.86, + "grad_norm": 1.5339684662485666, + "learning_rate": 5.248241438381985e-07, + "loss": 0.2664, + "step": 29556 + }, + { + "epoch": 0.86, + "grad_norm": 1.934220239638883, + "learning_rate": 5.246146760569276e-07, + "loss": 0.2688, + "step": 29557 + }, + { + "epoch": 0.86, + "grad_norm": 1.4060690356623773, + "learning_rate": 5.244052477709816e-07, + "loss": 0.2767, + "step": 29558 + }, + { + "epoch": 0.86, + "grad_norm": 1.2203406024587498, + "learning_rate": 5.241958589822083e-07, + "loss": 0.2664, + "step": 29559 + }, + { + "epoch": 0.86, + "grad_norm": 1.3179027773941705, + "learning_rate": 5.239865096924568e-07, + "loss": 0.2817, + "step": 29560 + }, + { + "epoch": 0.86, + "grad_norm": 1.448310719469509, + "learning_rate": 5.237771999035734e-07, + "loss": 0.2624, + "step": 29561 + }, + { + "epoch": 0.86, + "grad_norm": 1.4411871354733967, + "learning_rate": 5.235679296174062e-07, + "loss": 0.2655, + "step": 29562 + }, + { + "epoch": 0.86, + "grad_norm": 1.6386831314748913, + "learning_rate": 5.23358698835802e-07, + "loss": 0.286, + "step": 29563 + }, + { + "epoch": 0.86, + "grad_norm": 1.2883702658740341, + "learning_rate": 5.231495075606064e-07, + "loss": 0.2555, + "step": 29564 + }, + { + "epoch": 0.86, + "grad_norm": 1.390031602867809, + "learning_rate": 5.229403557936663e-07, + "loss": 0.2668, + "step": 29565 + }, + { + "epoch": 0.86, + "grad_norm": 1.3681223689269133, + "learning_rate": 5.227312435368281e-07, + "loss": 0.2767, + "step": 29566 + }, + { + "epoch": 0.86, + "grad_norm": 1.3903040076417599, + "learning_rate": 5.225221707919359e-07, + "loss": 0.2614, + "step": 29567 + }, + { + "epoch": 0.86, + "grad_norm": 1.6037889303167954, + "learning_rate": 5.223131375608348e-07, + "loss": 0.2787, + "step": 29568 + }, + { + "epoch": 0.86, + "grad_norm": 1.3025562779276847, + "learning_rate": 5.221041438453694e-07, + "loss": 0.299, + "step": 29569 + }, + { + "epoch": 0.86, + "grad_norm": 1.4547983312322945, + "learning_rate": 5.218951896473856e-07, + "loss": 0.2908, + "step": 29570 + }, + { + "epoch": 0.86, + "grad_norm": 2.363116949928613, + "learning_rate": 5.216862749687257e-07, + "loss": 0.2712, + "step": 29571 + }, + { + "epoch": 0.86, + "grad_norm": 1.3118001069320049, + "learning_rate": 5.214773998112343e-07, + "loss": 0.2736, + "step": 29572 + }, + { + "epoch": 0.86, + "grad_norm": 0.9264480176390926, + "learning_rate": 5.212685641767545e-07, + "loss": 0.5915, + "step": 29573 + }, + { + "epoch": 0.86, + "grad_norm": 1.9932469965433492, + "learning_rate": 5.210597680671304e-07, + "loss": 0.2804, + "step": 29574 + }, + { + "epoch": 0.86, + "grad_norm": 1.4721663888470093, + "learning_rate": 5.208510114842025e-07, + "loss": 0.2888, + "step": 29575 + }, + { + "epoch": 0.86, + "grad_norm": 1.3212760357358586, + "learning_rate": 5.206422944298151e-07, + "loss": 0.2741, + "step": 29576 + }, + { + "epoch": 0.86, + "grad_norm": 1.6522069093834388, + "learning_rate": 5.204336169058082e-07, + "loss": 0.2521, + "step": 29577 + }, + { + "epoch": 0.86, + "grad_norm": 1.352488605240972, + "learning_rate": 5.202249789140246e-07, + "loss": 0.2672, + "step": 29578 + }, + { + "epoch": 0.86, + "grad_norm": 1.3934880618430732, + "learning_rate": 5.200163804563046e-07, + "loss": 0.2624, + "step": 29579 + }, + { + "epoch": 0.86, + "grad_norm": 1.2386366472456374, + "learning_rate": 5.198078215344904e-07, + "loss": 0.2804, + "step": 29580 + }, + { + "epoch": 0.86, + "grad_norm": 1.3835763668044783, + "learning_rate": 5.195993021504214e-07, + "loss": 0.2888, + "step": 29581 + }, + { + "epoch": 0.86, + "grad_norm": 1.2940441604312665, + "learning_rate": 5.193908223059385e-07, + "loss": 0.2656, + "step": 29582 + }, + { + "epoch": 0.86, + "grad_norm": 1.3331727109596876, + "learning_rate": 5.191823820028813e-07, + "loss": 0.2739, + "step": 29583 + }, + { + "epoch": 0.86, + "grad_norm": 1.250577039993021, + "learning_rate": 5.189739812430888e-07, + "loss": 0.2698, + "step": 29584 + }, + { + "epoch": 0.86, + "grad_norm": 1.4959873852858465, + "learning_rate": 5.187656200284019e-07, + "loss": 0.2696, + "step": 29585 + }, + { + "epoch": 0.86, + "grad_norm": 1.2756472596009691, + "learning_rate": 5.185572983606574e-07, + "loss": 0.251, + "step": 29586 + }, + { + "epoch": 0.86, + "grad_norm": 1.556250528840067, + "learning_rate": 5.183490162416943e-07, + "loss": 0.2763, + "step": 29587 + }, + { + "epoch": 0.86, + "grad_norm": 2.236767937824272, + "learning_rate": 5.181407736733507e-07, + "loss": 0.2726, + "step": 29588 + }, + { + "epoch": 0.86, + "grad_norm": 1.339190678751568, + "learning_rate": 5.17932570657465e-07, + "loss": 0.2735, + "step": 29589 + }, + { + "epoch": 0.86, + "grad_norm": 1.5087275149345143, + "learning_rate": 5.177244071958737e-07, + "loss": 0.2603, + "step": 29590 + }, + { + "epoch": 0.86, + "grad_norm": 1.3338614769080692, + "learning_rate": 5.17516283290414e-07, + "loss": 0.2746, + "step": 29591 + }, + { + "epoch": 0.86, + "grad_norm": 1.3915830274902006, + "learning_rate": 5.173081989429235e-07, + "loss": 0.2629, + "step": 29592 + }, + { + "epoch": 0.86, + "grad_norm": 1.424655854394042, + "learning_rate": 5.171001541552384e-07, + "loss": 0.2874, + "step": 29593 + }, + { + "epoch": 0.86, + "grad_norm": 1.3836625123324497, + "learning_rate": 5.168921489291928e-07, + "loss": 0.279, + "step": 29594 + }, + { + "epoch": 0.86, + "grad_norm": 1.3354570864151865, + "learning_rate": 5.166841832666241e-07, + "loss": 0.2684, + "step": 29595 + }, + { + "epoch": 0.86, + "grad_norm": 1.3914207922408028, + "learning_rate": 5.164762571693676e-07, + "loss": 0.3093, + "step": 29596 + }, + { + "epoch": 0.86, + "grad_norm": 1.337699712757885, + "learning_rate": 5.162683706392574e-07, + "loss": 0.2845, + "step": 29597 + }, + { + "epoch": 0.86, + "grad_norm": 1.372511831242864, + "learning_rate": 5.160605236781285e-07, + "loss": 0.2741, + "step": 29598 + }, + { + "epoch": 0.86, + "grad_norm": 1.4707966940987403, + "learning_rate": 5.158527162878158e-07, + "loss": 0.2671, + "step": 29599 + }, + { + "epoch": 0.86, + "grad_norm": 1.3108562667449717, + "learning_rate": 5.15644948470152e-07, + "loss": 0.2941, + "step": 29600 + }, + { + "epoch": 0.86, + "grad_norm": 1.3865919254352013, + "learning_rate": 5.15437220226972e-07, + "loss": 0.2731, + "step": 29601 + }, + { + "epoch": 0.86, + "grad_norm": 1.51033901076793, + "learning_rate": 5.152295315601086e-07, + "loss": 0.2663, + "step": 29602 + }, + { + "epoch": 0.86, + "grad_norm": 1.368731347944783, + "learning_rate": 5.150218824713932e-07, + "loss": 0.3259, + "step": 29603 + }, + { + "epoch": 0.86, + "grad_norm": 1.5021770044165934, + "learning_rate": 5.148142729626609e-07, + "loss": 0.2845, + "step": 29604 + }, + { + "epoch": 0.86, + "grad_norm": 1.3945001558221684, + "learning_rate": 5.146067030357416e-07, + "loss": 0.2672, + "step": 29605 + }, + { + "epoch": 0.86, + "grad_norm": 1.4748402022520515, + "learning_rate": 5.143991726924674e-07, + "loss": 0.2726, + "step": 29606 + }, + { + "epoch": 0.86, + "grad_norm": 1.4254582732000018, + "learning_rate": 5.141916819346703e-07, + "loss": 0.2832, + "step": 29607 + }, + { + "epoch": 0.86, + "grad_norm": 1.513717092306676, + "learning_rate": 5.139842307641818e-07, + "loss": 0.2605, + "step": 29608 + }, + { + "epoch": 0.86, + "grad_norm": 1.4265721264126652, + "learning_rate": 5.137768191828319e-07, + "loss": 0.2915, + "step": 29609 + }, + { + "epoch": 0.86, + "grad_norm": 1.6139988743823663, + "learning_rate": 5.135694471924513e-07, + "loss": 0.2714, + "step": 29610 + }, + { + "epoch": 0.86, + "grad_norm": 1.4793730904616957, + "learning_rate": 5.133621147948703e-07, + "loss": 0.2682, + "step": 29611 + }, + { + "epoch": 0.86, + "grad_norm": 2.007991883661837, + "learning_rate": 5.131548219919191e-07, + "loss": 0.276, + "step": 29612 + }, + { + "epoch": 0.86, + "grad_norm": 1.331759668479034, + "learning_rate": 5.129475687854258e-07, + "loss": 0.2727, + "step": 29613 + }, + { + "epoch": 0.86, + "grad_norm": 1.6172788766244892, + "learning_rate": 5.127403551772192e-07, + "loss": 0.2583, + "step": 29614 + }, + { + "epoch": 0.86, + "grad_norm": 1.3067764741904941, + "learning_rate": 5.125331811691298e-07, + "loss": 0.2835, + "step": 29615 + }, + { + "epoch": 0.86, + "grad_norm": 1.2266357840308375, + "learning_rate": 5.123260467629843e-07, + "loss": 0.2641, + "step": 29616 + }, + { + "epoch": 0.86, + "grad_norm": 1.5158959636457954, + "learning_rate": 5.121189519606112e-07, + "loss": 0.2766, + "step": 29617 + }, + { + "epoch": 0.86, + "grad_norm": 1.5366981959479091, + "learning_rate": 5.119118967638381e-07, + "loss": 0.2809, + "step": 29618 + }, + { + "epoch": 0.86, + "grad_norm": 1.3824395787359924, + "learning_rate": 5.117048811744923e-07, + "loss": 0.2781, + "step": 29619 + }, + { + "epoch": 0.86, + "grad_norm": 1.2728136316777605, + "learning_rate": 5.114979051944007e-07, + "loss": 0.293, + "step": 29620 + }, + { + "epoch": 0.86, + "grad_norm": 1.815949586967186, + "learning_rate": 5.112909688253914e-07, + "loss": 0.2729, + "step": 29621 + }, + { + "epoch": 0.86, + "grad_norm": 1.4252982575806343, + "learning_rate": 5.110840720692872e-07, + "loss": 0.2682, + "step": 29622 + }, + { + "epoch": 0.86, + "grad_norm": 1.4607458866317446, + "learning_rate": 5.108772149279167e-07, + "loss": 0.2824, + "step": 29623 + }, + { + "epoch": 0.86, + "grad_norm": 1.652890048194805, + "learning_rate": 5.106703974031041e-07, + "loss": 0.2714, + "step": 29624 + }, + { + "epoch": 0.86, + "grad_norm": 0.9119434969809197, + "learning_rate": 5.10463619496675e-07, + "loss": 0.5492, + "step": 29625 + }, + { + "epoch": 0.86, + "grad_norm": 1.46243032703519, + "learning_rate": 5.102568812104547e-07, + "loss": 0.2856, + "step": 29626 + }, + { + "epoch": 0.86, + "grad_norm": 1.3815003055657347, + "learning_rate": 5.100501825462667e-07, + "loss": 0.2662, + "step": 29627 + }, + { + "epoch": 0.86, + "grad_norm": 1.351652433182212, + "learning_rate": 5.098435235059363e-07, + "loss": 0.2762, + "step": 29628 + }, + { + "epoch": 0.86, + "grad_norm": 2.2821343600512636, + "learning_rate": 5.096369040912869e-07, + "loss": 0.2859, + "step": 29629 + }, + { + "epoch": 0.86, + "grad_norm": 1.6355548839641414, + "learning_rate": 5.09430324304141e-07, + "loss": 0.2667, + "step": 29630 + }, + { + "epoch": 0.86, + "grad_norm": 1.2855671500264119, + "learning_rate": 5.092237841463233e-07, + "loss": 0.279, + "step": 29631 + }, + { + "epoch": 0.86, + "grad_norm": 1.8783117316880509, + "learning_rate": 5.090172836196544e-07, + "loss": 0.291, + "step": 29632 + }, + { + "epoch": 0.86, + "grad_norm": 1.3638196167325347, + "learning_rate": 5.08810822725958e-07, + "loss": 0.2623, + "step": 29633 + }, + { + "epoch": 0.86, + "grad_norm": 1.4767511236324846, + "learning_rate": 5.08604401467056e-07, + "loss": 0.2663, + "step": 29634 + }, + { + "epoch": 0.86, + "grad_norm": 1.3056572694772939, + "learning_rate": 5.083980198447697e-07, + "loss": 0.2524, + "step": 29635 + }, + { + "epoch": 0.86, + "grad_norm": 1.905562443968886, + "learning_rate": 5.081916778609208e-07, + "loss": 0.3057, + "step": 29636 + }, + { + "epoch": 0.86, + "grad_norm": 1.4482882203021876, + "learning_rate": 5.079853755173298e-07, + "loss": 0.2769, + "step": 29637 + }, + { + "epoch": 0.86, + "grad_norm": 1.4934569464945906, + "learning_rate": 5.077791128158183e-07, + "loss": 0.292, + "step": 29638 + }, + { + "epoch": 0.86, + "grad_norm": 1.3418530446813892, + "learning_rate": 5.075728897582055e-07, + "loss": 0.2823, + "step": 29639 + }, + { + "epoch": 0.86, + "grad_norm": 1.4301807388649201, + "learning_rate": 5.073667063463134e-07, + "loss": 0.2932, + "step": 29640 + }, + { + "epoch": 0.86, + "grad_norm": 1.5294404409242381, + "learning_rate": 5.071605625819581e-07, + "loss": 0.2868, + "step": 29641 + }, + { + "epoch": 0.86, + "grad_norm": 1.5628646213980846, + "learning_rate": 5.069544584669612e-07, + "loss": 0.2577, + "step": 29642 + }, + { + "epoch": 0.86, + "grad_norm": 1.7042358987953805, + "learning_rate": 5.067483940031409e-07, + "loss": 0.2652, + "step": 29643 + }, + { + "epoch": 0.86, + "grad_norm": 1.273856070684045, + "learning_rate": 5.065423691923161e-07, + "loss": 0.2696, + "step": 29644 + }, + { + "epoch": 0.86, + "grad_norm": 1.360816151691372, + "learning_rate": 5.063363840363044e-07, + "loss": 0.2686, + "step": 29645 + }, + { + "epoch": 0.86, + "grad_norm": 1.444672039930165, + "learning_rate": 5.061304385369242e-07, + "loss": 0.28, + "step": 29646 + }, + { + "epoch": 0.86, + "grad_norm": 1.3304703556456077, + "learning_rate": 5.059245326959927e-07, + "loss": 0.2682, + "step": 29647 + }, + { + "epoch": 0.86, + "grad_norm": 1.4093534296706498, + "learning_rate": 5.05718666515328e-07, + "loss": 0.265, + "step": 29648 + }, + { + "epoch": 0.86, + "grad_norm": 1.3427621286020728, + "learning_rate": 5.055128399967451e-07, + "loss": 0.2718, + "step": 29649 + }, + { + "epoch": 0.86, + "grad_norm": 1.3216653414551223, + "learning_rate": 5.053070531420612e-07, + "loss": 0.2626, + "step": 29650 + }, + { + "epoch": 0.86, + "grad_norm": 0.8692473906830692, + "learning_rate": 5.051013059530924e-07, + "loss": 0.5765, + "step": 29651 + }, + { + "epoch": 0.86, + "grad_norm": 1.6620802809340567, + "learning_rate": 5.04895598431654e-07, + "loss": 0.27, + "step": 29652 + }, + { + "epoch": 0.86, + "grad_norm": 1.4946294015866324, + "learning_rate": 5.046899305795622e-07, + "loss": 0.2744, + "step": 29653 + }, + { + "epoch": 0.86, + "grad_norm": 1.3721665902630118, + "learning_rate": 5.044843023986318e-07, + "loss": 0.284, + "step": 29654 + }, + { + "epoch": 0.86, + "grad_norm": 1.2422523279296223, + "learning_rate": 5.042787138906768e-07, + "loss": 0.2542, + "step": 29655 + }, + { + "epoch": 0.86, + "grad_norm": 1.3122688528201722, + "learning_rate": 5.040731650575126e-07, + "loss": 0.2623, + "step": 29656 + }, + { + "epoch": 0.86, + "grad_norm": 1.3882612398638166, + "learning_rate": 5.038676559009531e-07, + "loss": 0.2797, + "step": 29657 + }, + { + "epoch": 0.86, + "grad_norm": 1.3142822499442597, + "learning_rate": 5.036621864228109e-07, + "loss": 0.2736, + "step": 29658 + }, + { + "epoch": 0.86, + "grad_norm": 1.324487684690546, + "learning_rate": 5.034567566249004e-07, + "loss": 0.2778, + "step": 29659 + }, + { + "epoch": 0.86, + "grad_norm": 1.3213119703382283, + "learning_rate": 5.032513665090327e-07, + "loss": 0.2717, + "step": 29660 + }, + { + "epoch": 0.86, + "grad_norm": 1.6061056931434308, + "learning_rate": 5.030460160770223e-07, + "loss": 0.2659, + "step": 29661 + }, + { + "epoch": 0.86, + "grad_norm": 1.322458551857199, + "learning_rate": 5.0284070533068e-07, + "loss": 0.2651, + "step": 29662 + }, + { + "epoch": 0.86, + "grad_norm": 1.796022749131908, + "learning_rate": 5.026354342718188e-07, + "loss": 0.278, + "step": 29663 + }, + { + "epoch": 0.86, + "grad_norm": 1.6406194802433294, + "learning_rate": 5.024302029022499e-07, + "loss": 0.2703, + "step": 29664 + }, + { + "epoch": 0.86, + "grad_norm": 1.7246144905691445, + "learning_rate": 5.022250112237837e-07, + "loss": 0.2522, + "step": 29665 + }, + { + "epoch": 0.86, + "grad_norm": 1.4017475225054443, + "learning_rate": 5.020198592382319e-07, + "loss": 0.2902, + "step": 29666 + }, + { + "epoch": 0.86, + "grad_norm": 1.3819724967043425, + "learning_rate": 5.018147469474061e-07, + "loss": 0.2662, + "step": 29667 + }, + { + "epoch": 0.86, + "grad_norm": 1.882773484469915, + "learning_rate": 5.01609674353114e-07, + "loss": 0.2862, + "step": 29668 + }, + { + "epoch": 0.86, + "grad_norm": 1.5870329670972143, + "learning_rate": 5.014046414571661e-07, + "loss": 0.2861, + "step": 29669 + }, + { + "epoch": 0.86, + "grad_norm": 1.4812902962410566, + "learning_rate": 5.011996482613723e-07, + "loss": 0.2813, + "step": 29670 + }, + { + "epoch": 0.86, + "grad_norm": 11.840146816984443, + "learning_rate": 5.009946947675415e-07, + "loss": 0.2615, + "step": 29671 + }, + { + "epoch": 0.86, + "grad_norm": 1.425435502138968, + "learning_rate": 5.007897809774825e-07, + "loss": 0.2578, + "step": 29672 + }, + { + "epoch": 0.86, + "grad_norm": 1.3242998003565507, + "learning_rate": 5.005849068930036e-07, + "loss": 0.286, + "step": 29673 + }, + { + "epoch": 0.86, + "grad_norm": 1.4985500620554837, + "learning_rate": 5.00380072515913e-07, + "loss": 0.2786, + "step": 29674 + }, + { + "epoch": 0.86, + "grad_norm": 1.413299149811484, + "learning_rate": 5.001752778480179e-07, + "loss": 0.2811, + "step": 29675 + }, + { + "epoch": 0.86, + "grad_norm": 1.2380219301113122, + "learning_rate": 4.999705228911267e-07, + "loss": 0.2596, + "step": 29676 + }, + { + "epoch": 0.86, + "grad_norm": 1.4712699547573471, + "learning_rate": 4.997658076470452e-07, + "loss": 0.2651, + "step": 29677 + }, + { + "epoch": 0.86, + "grad_norm": 1.284694097447552, + "learning_rate": 4.995611321175797e-07, + "loss": 0.2786, + "step": 29678 + }, + { + "epoch": 0.86, + "grad_norm": 1.3075262897164375, + "learning_rate": 4.993564963045372e-07, + "loss": 0.2744, + "step": 29679 + }, + { + "epoch": 0.86, + "grad_norm": 1.3034333587537201, + "learning_rate": 4.991519002097239e-07, + "loss": 0.2608, + "step": 29680 + }, + { + "epoch": 0.86, + "grad_norm": 1.8766089894614295, + "learning_rate": 4.989473438349452e-07, + "loss": 0.2673, + "step": 29681 + }, + { + "epoch": 0.86, + "grad_norm": 1.292397057563738, + "learning_rate": 4.987428271820055e-07, + "loss": 0.2701, + "step": 29682 + }, + { + "epoch": 0.86, + "grad_norm": 1.2920908375614497, + "learning_rate": 4.985383502527108e-07, + "loss": 0.2624, + "step": 29683 + }, + { + "epoch": 0.86, + "grad_norm": 1.4061608592157178, + "learning_rate": 4.983339130488657e-07, + "loss": 0.3085, + "step": 29684 + }, + { + "epoch": 0.86, + "grad_norm": 1.3081083034806444, + "learning_rate": 4.981295155722726e-07, + "loss": 0.2823, + "step": 29685 + }, + { + "epoch": 0.86, + "grad_norm": 1.8465413983784928, + "learning_rate": 4.979251578247379e-07, + "loss": 0.2597, + "step": 29686 + }, + { + "epoch": 0.86, + "grad_norm": 1.3161684584499922, + "learning_rate": 4.977208398080625e-07, + "loss": 0.2722, + "step": 29687 + }, + { + "epoch": 0.86, + "grad_norm": 1.315575508176875, + "learning_rate": 4.975165615240508e-07, + "loss": 0.2729, + "step": 29688 + }, + { + "epoch": 0.86, + "grad_norm": 1.6286510769133513, + "learning_rate": 4.973123229745053e-07, + "loss": 0.2815, + "step": 29689 + }, + { + "epoch": 0.86, + "grad_norm": 1.6011515907186258, + "learning_rate": 4.971081241612286e-07, + "loss": 0.2617, + "step": 29690 + }, + { + "epoch": 0.86, + "grad_norm": 1.3861737807871226, + "learning_rate": 4.969039650860225e-07, + "loss": 0.2713, + "step": 29691 + }, + { + "epoch": 0.86, + "grad_norm": 1.3983462856692312, + "learning_rate": 4.966998457506889e-07, + "loss": 0.2956, + "step": 29692 + }, + { + "epoch": 0.86, + "grad_norm": 1.2509606393953052, + "learning_rate": 4.964957661570285e-07, + "loss": 0.2698, + "step": 29693 + }, + { + "epoch": 0.86, + "grad_norm": 1.2406361804423938, + "learning_rate": 4.96291726306844e-07, + "loss": 0.2738, + "step": 29694 + }, + { + "epoch": 0.86, + "grad_norm": 1.5962808755282778, + "learning_rate": 4.960877262019354e-07, + "loss": 0.2954, + "step": 29695 + }, + { + "epoch": 0.86, + "grad_norm": 1.3123532814101868, + "learning_rate": 4.958837658441012e-07, + "loss": 0.2633, + "step": 29696 + }, + { + "epoch": 0.86, + "grad_norm": 1.4769510863640651, + "learning_rate": 4.956798452351436e-07, + "loss": 0.2867, + "step": 29697 + }, + { + "epoch": 0.86, + "grad_norm": 1.3099557739906817, + "learning_rate": 4.954759643768609e-07, + "loss": 0.2695, + "step": 29698 + }, + { + "epoch": 0.86, + "grad_norm": 1.5029933028122922, + "learning_rate": 4.952721232710528e-07, + "loss": 0.2781, + "step": 29699 + }, + { + "epoch": 0.86, + "grad_norm": 1.5363709575804703, + "learning_rate": 4.950683219195179e-07, + "loss": 0.2808, + "step": 29700 + }, + { + "epoch": 0.86, + "grad_norm": 1.3796916650173427, + "learning_rate": 4.948645603240554e-07, + "loss": 0.2827, + "step": 29701 + }, + { + "epoch": 0.86, + "grad_norm": 1.3636652540505332, + "learning_rate": 4.946608384864632e-07, + "loss": 0.2371, + "step": 29702 + }, + { + "epoch": 0.86, + "grad_norm": 2.0012439468554026, + "learning_rate": 4.944571564085399e-07, + "loss": 0.2621, + "step": 29703 + }, + { + "epoch": 0.86, + "grad_norm": 1.3756105776916263, + "learning_rate": 4.942535140920812e-07, + "loss": 0.2724, + "step": 29704 + }, + { + "epoch": 0.86, + "grad_norm": 1.3317550213924425, + "learning_rate": 4.940499115388853e-07, + "loss": 0.2558, + "step": 29705 + }, + { + "epoch": 0.86, + "grad_norm": 1.519669343195745, + "learning_rate": 4.938463487507488e-07, + "loss": 0.2712, + "step": 29706 + }, + { + "epoch": 0.86, + "grad_norm": 1.3657077205763937, + "learning_rate": 4.936428257294684e-07, + "loss": 0.2631, + "step": 29707 + }, + { + "epoch": 0.86, + "grad_norm": 1.633605502042415, + "learning_rate": 4.934393424768396e-07, + "loss": 0.2562, + "step": 29708 + }, + { + "epoch": 0.86, + "grad_norm": 1.8002939687706092, + "learning_rate": 4.932358989946596e-07, + "loss": 0.2726, + "step": 29709 + }, + { + "epoch": 0.86, + "grad_norm": 1.352192732694051, + "learning_rate": 4.930324952847221e-07, + "loss": 0.3223, + "step": 29710 + }, + { + "epoch": 0.86, + "grad_norm": 1.2683436092048979, + "learning_rate": 4.928291313488226e-07, + "loss": 0.2603, + "step": 29711 + }, + { + "epoch": 0.86, + "grad_norm": 1.3840246079757756, + "learning_rate": 4.926258071887574e-07, + "loss": 0.2808, + "step": 29712 + }, + { + "epoch": 0.86, + "grad_norm": 2.3230703248767197, + "learning_rate": 4.924225228063184e-07, + "loss": 0.3122, + "step": 29713 + }, + { + "epoch": 0.86, + "grad_norm": 1.4940992261546313, + "learning_rate": 4.922192782033014e-07, + "loss": 0.2533, + "step": 29714 + }, + { + "epoch": 0.86, + "grad_norm": 1.1781526495550265, + "learning_rate": 4.920160733814982e-07, + "loss": 0.2493, + "step": 29715 + }, + { + "epoch": 0.86, + "grad_norm": 1.887698146663634, + "learning_rate": 4.918129083427037e-07, + "loss": 0.2657, + "step": 29716 + }, + { + "epoch": 0.86, + "grad_norm": 1.4041238342928881, + "learning_rate": 4.916097830887101e-07, + "loss": 0.2916, + "step": 29717 + }, + { + "epoch": 0.86, + "grad_norm": 1.3885627013542978, + "learning_rate": 4.9140669762131e-07, + "loss": 0.2702, + "step": 29718 + }, + { + "epoch": 0.86, + "grad_norm": 1.5029775392680071, + "learning_rate": 4.912036519422958e-07, + "loss": 0.2628, + "step": 29719 + }, + { + "epoch": 0.86, + "grad_norm": 1.6166258645896954, + "learning_rate": 4.910006460534594e-07, + "loss": 0.2812, + "step": 29720 + }, + { + "epoch": 0.86, + "grad_norm": 1.407259929060467, + "learning_rate": 4.907976799565928e-07, + "loss": 0.2556, + "step": 29721 + }, + { + "epoch": 0.86, + "grad_norm": 1.497051187743165, + "learning_rate": 4.905947536534872e-07, + "loss": 0.2763, + "step": 29722 + }, + { + "epoch": 0.86, + "grad_norm": 1.230476937464151, + "learning_rate": 4.903918671459317e-07, + "loss": 0.2585, + "step": 29723 + }, + { + "epoch": 0.86, + "grad_norm": 1.5800589946873282, + "learning_rate": 4.901890204357185e-07, + "loss": 0.2736, + "step": 29724 + }, + { + "epoch": 0.86, + "grad_norm": 1.321990973568711, + "learning_rate": 4.899862135246375e-07, + "loss": 0.2679, + "step": 29725 + }, + { + "epoch": 0.86, + "grad_norm": 1.740043218677247, + "learning_rate": 4.897834464144774e-07, + "loss": 0.2765, + "step": 29726 + }, + { + "epoch": 0.86, + "grad_norm": 1.4798283984032659, + "learning_rate": 4.89580719107029e-07, + "loss": 0.266, + "step": 29727 + }, + { + "epoch": 0.86, + "grad_norm": 1.2966478750460462, + "learning_rate": 4.893780316040802e-07, + "loss": 0.2786, + "step": 29728 + }, + { + "epoch": 0.86, + "grad_norm": 1.382913095519281, + "learning_rate": 4.89175383907421e-07, + "loss": 0.2952, + "step": 29729 + }, + { + "epoch": 0.86, + "grad_norm": 1.2368868244851705, + "learning_rate": 4.88972776018839e-07, + "loss": 0.2504, + "step": 29730 + }, + { + "epoch": 0.86, + "grad_norm": 1.4453404395390947, + "learning_rate": 4.887702079401229e-07, + "loss": 0.2734, + "step": 29731 + }, + { + "epoch": 0.86, + "grad_norm": 1.2681754856935419, + "learning_rate": 4.885676796730588e-07, + "loss": 0.257, + "step": 29732 + }, + { + "epoch": 0.86, + "grad_norm": 1.486123131143053, + "learning_rate": 4.883651912194348e-07, + "loss": 0.2775, + "step": 29733 + }, + { + "epoch": 0.86, + "grad_norm": 1.3085609374679588, + "learning_rate": 4.881627425810387e-07, + "loss": 0.2944, + "step": 29734 + }, + { + "epoch": 0.86, + "grad_norm": 1.3011919510287264, + "learning_rate": 4.879603337596561e-07, + "loss": 0.2506, + "step": 29735 + }, + { + "epoch": 0.86, + "grad_norm": 1.4112502988223534, + "learning_rate": 4.877579647570735e-07, + "loss": 0.305, + "step": 29736 + }, + { + "epoch": 0.86, + "grad_norm": 1.359418107991758, + "learning_rate": 4.875556355750771e-07, + "loss": 0.2834, + "step": 29737 + }, + { + "epoch": 0.86, + "grad_norm": 2.047378698367009, + "learning_rate": 4.873533462154517e-07, + "loss": 0.286, + "step": 29738 + }, + { + "epoch": 0.86, + "grad_norm": 1.472398875792801, + "learning_rate": 4.871510966799847e-07, + "loss": 0.264, + "step": 29739 + }, + { + "epoch": 0.86, + "grad_norm": 1.3810791644442981, + "learning_rate": 4.869488869704581e-07, + "loss": 0.2714, + "step": 29740 + }, + { + "epoch": 0.86, + "grad_norm": 1.397768661432949, + "learning_rate": 4.867467170886581e-07, + "loss": 0.25, + "step": 29741 + }, + { + "epoch": 0.86, + "grad_norm": 1.3626459664047061, + "learning_rate": 4.865445870363688e-07, + "loss": 0.2782, + "step": 29742 + }, + { + "epoch": 0.86, + "grad_norm": 1.4007508780522593, + "learning_rate": 4.86342496815373e-07, + "loss": 0.2572, + "step": 29743 + }, + { + "epoch": 0.86, + "grad_norm": 1.4024420029314562, + "learning_rate": 4.861404464274544e-07, + "loss": 0.2586, + "step": 29744 + }, + { + "epoch": 0.86, + "grad_norm": 1.2960251872906254, + "learning_rate": 4.859384358743963e-07, + "loss": 0.2653, + "step": 29745 + }, + { + "epoch": 0.86, + "grad_norm": 1.3464177745799781, + "learning_rate": 4.857364651579821e-07, + "loss": 0.2536, + "step": 29746 + }, + { + "epoch": 0.86, + "grad_norm": 1.2268560603197347, + "learning_rate": 4.855345342799933e-07, + "loss": 0.261, + "step": 29747 + }, + { + "epoch": 0.86, + "grad_norm": 1.508512116569694, + "learning_rate": 4.853326432422123e-07, + "loss": 0.2792, + "step": 29748 + }, + { + "epoch": 0.86, + "grad_norm": 1.3827509162060392, + "learning_rate": 4.851307920464221e-07, + "loss": 0.2898, + "step": 29749 + }, + { + "epoch": 0.86, + "grad_norm": 1.3249880257293298, + "learning_rate": 4.849289806944018e-07, + "loss": 0.2658, + "step": 29750 + }, + { + "epoch": 0.86, + "grad_norm": 1.7422678439353854, + "learning_rate": 4.847272091879329e-07, + "loss": 0.2774, + "step": 29751 + }, + { + "epoch": 0.86, + "grad_norm": 1.4575374643792323, + "learning_rate": 4.845254775287966e-07, + "loss": 0.2839, + "step": 29752 + }, + { + "epoch": 0.86, + "grad_norm": 1.2343695062516045, + "learning_rate": 4.843237857187733e-07, + "loss": 0.2549, + "step": 29753 + }, + { + "epoch": 0.86, + "grad_norm": 1.551058548988941, + "learning_rate": 4.841221337596424e-07, + "loss": 0.2692, + "step": 29754 + }, + { + "epoch": 0.86, + "grad_norm": 1.5643675086961288, + "learning_rate": 4.839205216531839e-07, + "loss": 0.2776, + "step": 29755 + }, + { + "epoch": 0.86, + "grad_norm": 1.9987405915882919, + "learning_rate": 4.837189494011774e-07, + "loss": 0.2817, + "step": 29756 + }, + { + "epoch": 0.86, + "grad_norm": 1.3865052114975658, + "learning_rate": 4.835174170054002e-07, + "loss": 0.2857, + "step": 29757 + }, + { + "epoch": 0.86, + "grad_norm": 4.176961800262627, + "learning_rate": 4.833159244676339e-07, + "loss": 0.256, + "step": 29758 + }, + { + "epoch": 0.86, + "grad_norm": 1.4027669933951288, + "learning_rate": 4.83114471789653e-07, + "loss": 0.289, + "step": 29759 + }, + { + "epoch": 0.86, + "grad_norm": 1.0442182103098336, + "learning_rate": 4.829130589732373e-07, + "loss": 0.5781, + "step": 29760 + }, + { + "epoch": 0.86, + "grad_norm": 1.4328501258124942, + "learning_rate": 4.827116860201636e-07, + "loss": 0.2665, + "step": 29761 + }, + { + "epoch": 0.86, + "grad_norm": 1.3743983294586823, + "learning_rate": 4.825103529322101e-07, + "loss": 0.275, + "step": 29762 + }, + { + "epoch": 0.86, + "grad_norm": 1.2277685018651066, + "learning_rate": 4.823090597111524e-07, + "loss": 0.2826, + "step": 29763 + }, + { + "epoch": 0.86, + "grad_norm": 1.3940217265876458, + "learning_rate": 4.821078063587675e-07, + "loss": 0.2628, + "step": 29764 + }, + { + "epoch": 0.86, + "grad_norm": 1.605666527799218, + "learning_rate": 4.819065928768313e-07, + "loss": 0.2805, + "step": 29765 + }, + { + "epoch": 0.86, + "grad_norm": 1.345074358731372, + "learning_rate": 4.817054192671194e-07, + "loss": 0.2655, + "step": 29766 + }, + { + "epoch": 0.86, + "grad_norm": 1.3636589391170602, + "learning_rate": 4.815042855314084e-07, + "loss": 0.2671, + "step": 29767 + }, + { + "epoch": 0.86, + "grad_norm": 1.4192492530471692, + "learning_rate": 4.813031916714711e-07, + "loss": 0.2681, + "step": 29768 + }, + { + "epoch": 0.86, + "grad_norm": 0.8962108391442349, + "learning_rate": 4.811021376890829e-07, + "loss": 0.5346, + "step": 29769 + }, + { + "epoch": 0.86, + "grad_norm": 1.2239527686150393, + "learning_rate": 4.8090112358602e-07, + "loss": 0.2583, + "step": 29770 + }, + { + "epoch": 0.86, + "grad_norm": 0.9645492866221388, + "learning_rate": 4.807001493640539e-07, + "loss": 0.5491, + "step": 29771 + }, + { + "epoch": 0.86, + "grad_norm": 1.3703448431722434, + "learning_rate": 4.804992150249588e-07, + "loss": 0.2784, + "step": 29772 + }, + { + "epoch": 0.86, + "grad_norm": 1.475425533930551, + "learning_rate": 4.802983205705086e-07, + "loss": 0.286, + "step": 29773 + }, + { + "epoch": 0.86, + "grad_norm": 1.3137771328115868, + "learning_rate": 4.800974660024754e-07, + "loss": 0.253, + "step": 29774 + }, + { + "epoch": 0.86, + "grad_norm": 1.6856418280443544, + "learning_rate": 4.798966513226327e-07, + "loss": 0.2622, + "step": 29775 + }, + { + "epoch": 0.86, + "grad_norm": 1.4074508411155342, + "learning_rate": 4.79695876532752e-07, + "loss": 0.2912, + "step": 29776 + }, + { + "epoch": 0.86, + "grad_norm": 1.3917835396500398, + "learning_rate": 4.794951416346061e-07, + "loss": 0.273, + "step": 29777 + }, + { + "epoch": 0.86, + "grad_norm": 1.383721882864752, + "learning_rate": 4.792944466299649e-07, + "loss": 0.2621, + "step": 29778 + }, + { + "epoch": 0.86, + "grad_norm": 1.341208842531556, + "learning_rate": 4.790937915206001e-07, + "loss": 0.2517, + "step": 29779 + }, + { + "epoch": 0.86, + "grad_norm": 1.4957530725358865, + "learning_rate": 4.788931763082833e-07, + "loss": 0.2733, + "step": 29780 + }, + { + "epoch": 0.86, + "grad_norm": 1.33124179071731, + "learning_rate": 4.78692600994784e-07, + "loss": 0.2683, + "step": 29781 + }, + { + "epoch": 0.86, + "grad_norm": 1.538140280984938, + "learning_rate": 4.784920655818725e-07, + "loss": 0.3034, + "step": 29782 + }, + { + "epoch": 0.86, + "grad_norm": 1.4392639240962914, + "learning_rate": 4.782915700713192e-07, + "loss": 0.2637, + "step": 29783 + }, + { + "epoch": 0.86, + "grad_norm": 1.2732174893724388, + "learning_rate": 4.780911144648925e-07, + "loss": 0.2641, + "step": 29784 + }, + { + "epoch": 0.86, + "grad_norm": 1.4515775483862745, + "learning_rate": 4.778906987643633e-07, + "loss": 0.258, + "step": 29785 + }, + { + "epoch": 0.86, + "grad_norm": 1.7384230926871687, + "learning_rate": 4.776903229714974e-07, + "loss": 0.2777, + "step": 29786 + }, + { + "epoch": 0.86, + "grad_norm": 1.566632402982056, + "learning_rate": 4.77489987088065e-07, + "loss": 0.2944, + "step": 29787 + }, + { + "epoch": 0.86, + "grad_norm": 1.2888992620709527, + "learning_rate": 4.772896911158331e-07, + "loss": 0.2663, + "step": 29788 + }, + { + "epoch": 0.86, + "grad_norm": 1.5680949426491724, + "learning_rate": 4.770894350565702e-07, + "loss": 0.2708, + "step": 29789 + }, + { + "epoch": 0.86, + "grad_norm": 1.4563297064424856, + "learning_rate": 4.768892189120434e-07, + "loss": 0.2546, + "step": 29790 + }, + { + "epoch": 0.86, + "grad_norm": 1.367819484264982, + "learning_rate": 4.7668904268401927e-07, + "loss": 0.27, + "step": 29791 + }, + { + "epoch": 0.86, + "grad_norm": 1.231341734476396, + "learning_rate": 4.764889063742645e-07, + "loss": 0.2821, + "step": 29792 + }, + { + "epoch": 0.86, + "grad_norm": 1.3158702874620154, + "learning_rate": 4.7628880998454473e-07, + "loss": 0.2784, + "step": 29793 + }, + { + "epoch": 0.86, + "grad_norm": 1.6312421717167014, + "learning_rate": 4.760887535166281e-07, + "loss": 0.2755, + "step": 29794 + }, + { + "epoch": 0.86, + "grad_norm": 1.2898821213220752, + "learning_rate": 4.75888736972277e-07, + "loss": 0.2792, + "step": 29795 + }, + { + "epoch": 0.86, + "grad_norm": 1.4710452048076108, + "learning_rate": 4.7568876035325837e-07, + "loss": 0.3134, + "step": 29796 + }, + { + "epoch": 0.86, + "grad_norm": 1.3942667055392142, + "learning_rate": 4.7548882366133643e-07, + "loss": 0.2705, + "step": 29797 + }, + { + "epoch": 0.86, + "grad_norm": 1.2749137581457346, + "learning_rate": 4.752889268982769e-07, + "loss": 0.2399, + "step": 29798 + }, + { + "epoch": 0.86, + "grad_norm": 1.5425054955528659, + "learning_rate": 4.750890700658417e-07, + "loss": 0.2655, + "step": 29799 + }, + { + "epoch": 0.86, + "grad_norm": 1.4184125995694676, + "learning_rate": 4.748892531657956e-07, + "loss": 0.2932, + "step": 29800 + }, + { + "epoch": 0.86, + "grad_norm": 1.509589918472809, + "learning_rate": 4.7468947619990213e-07, + "loss": 0.2646, + "step": 29801 + }, + { + "epoch": 0.86, + "grad_norm": 1.5534710695961433, + "learning_rate": 4.7448973916992435e-07, + "loss": 0.2703, + "step": 29802 + }, + { + "epoch": 0.86, + "grad_norm": 1.4677993150608224, + "learning_rate": 4.7429004207762473e-07, + "loss": 0.291, + "step": 29803 + }, + { + "epoch": 0.86, + "grad_norm": 0.9726074176237306, + "learning_rate": 4.740903849247663e-07, + "loss": 0.5824, + "step": 29804 + }, + { + "epoch": 0.86, + "grad_norm": 1.261483172751202, + "learning_rate": 4.7389076771310994e-07, + "loss": 0.2411, + "step": 29805 + }, + { + "epoch": 0.86, + "grad_norm": 1.460053328737801, + "learning_rate": 4.7369119044441747e-07, + "loss": 0.2726, + "step": 29806 + }, + { + "epoch": 0.86, + "grad_norm": 1.9337542023079666, + "learning_rate": 4.734916531204509e-07, + "loss": 0.279, + "step": 29807 + }, + { + "epoch": 0.86, + "grad_norm": 1.2630855986552685, + "learning_rate": 4.732921557429704e-07, + "loss": 0.2618, + "step": 29808 + }, + { + "epoch": 0.86, + "grad_norm": 1.446297934474838, + "learning_rate": 4.730926983137368e-07, + "loss": 0.2718, + "step": 29809 + }, + { + "epoch": 0.86, + "grad_norm": 1.3725279060579791, + "learning_rate": 4.7289328083451045e-07, + "loss": 0.2698, + "step": 29810 + }, + { + "epoch": 0.86, + "grad_norm": 1.381162664616086, + "learning_rate": 4.72693903307051e-07, + "loss": 0.2879, + "step": 29811 + }, + { + "epoch": 0.86, + "grad_norm": 1.1946266851660667, + "learning_rate": 4.7249456573311814e-07, + "loss": 0.2551, + "step": 29812 + }, + { + "epoch": 0.86, + "grad_norm": 1.5163151195812627, + "learning_rate": 4.722952681144716e-07, + "loss": 0.2633, + "step": 29813 + }, + { + "epoch": 0.86, + "grad_norm": 1.2500004945543828, + "learning_rate": 4.7209601045286937e-07, + "loss": 0.2568, + "step": 29814 + }, + { + "epoch": 0.86, + "grad_norm": 1.2626275816121804, + "learning_rate": 4.718967927500695e-07, + "loss": 0.2532, + "step": 29815 + }, + { + "epoch": 0.86, + "grad_norm": 1.46646231392797, + "learning_rate": 4.716976150078306e-07, + "loss": 0.2528, + "step": 29816 + }, + { + "epoch": 0.86, + "grad_norm": 1.53146828594487, + "learning_rate": 4.7149847722791077e-07, + "loss": 0.2603, + "step": 29817 + }, + { + "epoch": 0.86, + "grad_norm": 1.6021448434911933, + "learning_rate": 4.7129937941206683e-07, + "loss": 0.2834, + "step": 29818 + }, + { + "epoch": 0.86, + "grad_norm": 1.3396384583935959, + "learning_rate": 4.711003215620563e-07, + "loss": 0.2831, + "step": 29819 + }, + { + "epoch": 0.86, + "grad_norm": 1.3333402680928155, + "learning_rate": 4.709013036796356e-07, + "loss": 0.2699, + "step": 29820 + }, + { + "epoch": 0.86, + "grad_norm": 0.9200689465972087, + "learning_rate": 4.707023257665622e-07, + "loss": 0.5219, + "step": 29821 + }, + { + "epoch": 0.86, + "grad_norm": 1.4580880726833825, + "learning_rate": 4.7050338782459016e-07, + "loss": 0.2632, + "step": 29822 + }, + { + "epoch": 0.87, + "grad_norm": 1.447998446534, + "learning_rate": 4.7030448985547596e-07, + "loss": 0.2896, + "step": 29823 + }, + { + "epoch": 0.87, + "grad_norm": 1.275270420799808, + "learning_rate": 4.701056318609748e-07, + "loss": 0.2636, + "step": 29824 + }, + { + "epoch": 0.87, + "grad_norm": 1.589461540633514, + "learning_rate": 4.6990681384284144e-07, + "loss": 0.2741, + "step": 29825 + }, + { + "epoch": 0.87, + "grad_norm": 1.5323796119279596, + "learning_rate": 4.6970803580283107e-07, + "loss": 0.2842, + "step": 29826 + }, + { + "epoch": 0.87, + "grad_norm": 1.3494932451953077, + "learning_rate": 4.6950929774269846e-07, + "loss": 0.2616, + "step": 29827 + }, + { + "epoch": 0.87, + "grad_norm": 1.6689310142432223, + "learning_rate": 4.69310599664195e-07, + "loss": 0.2798, + "step": 29828 + }, + { + "epoch": 0.87, + "grad_norm": 1.4586026304173585, + "learning_rate": 4.691119415690765e-07, + "loss": 0.257, + "step": 29829 + }, + { + "epoch": 0.87, + "grad_norm": 1.7442685520924104, + "learning_rate": 4.6891332345909537e-07, + "loss": 0.2611, + "step": 29830 + }, + { + "epoch": 0.87, + "grad_norm": 1.4515896457941664, + "learning_rate": 4.6871474533600413e-07, + "loss": 0.2704, + "step": 29831 + }, + { + "epoch": 0.87, + "grad_norm": 1.3852876459259953, + "learning_rate": 4.6851620720155643e-07, + "loss": 0.2709, + "step": 29832 + }, + { + "epoch": 0.87, + "grad_norm": 1.7014049966499756, + "learning_rate": 4.6831770905750253e-07, + "loss": 0.27, + "step": 29833 + }, + { + "epoch": 0.87, + "grad_norm": 1.3804620104291834, + "learning_rate": 4.681192509055954e-07, + "loss": 0.2583, + "step": 29834 + }, + { + "epoch": 0.87, + "grad_norm": 1.5649376593370665, + "learning_rate": 4.679208327475859e-07, + "loss": 0.2676, + "step": 29835 + }, + { + "epoch": 0.87, + "grad_norm": 1.7840611808502342, + "learning_rate": 4.6772245458522546e-07, + "loss": 0.2689, + "step": 29836 + }, + { + "epoch": 0.87, + "grad_norm": 1.3180016703610025, + "learning_rate": 4.675241164202643e-07, + "loss": 0.2604, + "step": 29837 + }, + { + "epoch": 0.87, + "grad_norm": 1.3907472699179746, + "learning_rate": 4.673258182544532e-07, + "loss": 0.2673, + "step": 29838 + }, + { + "epoch": 0.87, + "grad_norm": 1.3816376436121143, + "learning_rate": 4.671275600895425e-07, + "loss": 0.2725, + "step": 29839 + }, + { + "epoch": 0.87, + "grad_norm": 1.8381522042281628, + "learning_rate": 4.669293419272819e-07, + "loss": 0.2682, + "step": 29840 + }, + { + "epoch": 0.87, + "grad_norm": 1.3208382949811979, + "learning_rate": 4.667311637694194e-07, + "loss": 0.2709, + "step": 29841 + }, + { + "epoch": 0.87, + "grad_norm": 1.442161621395561, + "learning_rate": 4.6653302561770476e-07, + "loss": 0.2753, + "step": 29842 + }, + { + "epoch": 0.87, + "grad_norm": 1.3527411612302764, + "learning_rate": 4.66334927473886e-07, + "loss": 0.2668, + "step": 29843 + }, + { + "epoch": 0.87, + "grad_norm": 1.411861838020957, + "learning_rate": 4.6613686933971224e-07, + "loss": 0.2714, + "step": 29844 + }, + { + "epoch": 0.87, + "grad_norm": 1.3603363588758166, + "learning_rate": 4.659388512169305e-07, + "loss": 0.2673, + "step": 29845 + }, + { + "epoch": 0.87, + "grad_norm": 1.3179987874790693, + "learning_rate": 4.6574087310728877e-07, + "loss": 0.2764, + "step": 29846 + }, + { + "epoch": 0.87, + "grad_norm": 1.2682143415699627, + "learning_rate": 4.655429350125346e-07, + "loss": 0.28, + "step": 29847 + }, + { + "epoch": 0.87, + "grad_norm": 1.6871530439289297, + "learning_rate": 4.6534503693441426e-07, + "loss": 0.2953, + "step": 29848 + }, + { + "epoch": 0.87, + "grad_norm": 1.4932499802374575, + "learning_rate": 4.6514717887467474e-07, + "loss": 0.2555, + "step": 29849 + }, + { + "epoch": 0.87, + "grad_norm": 1.1856888375994685, + "learning_rate": 4.649493608350608e-07, + "loss": 0.2677, + "step": 29850 + }, + { + "epoch": 0.87, + "grad_norm": 1.790753379363996, + "learning_rate": 4.6475158281731935e-07, + "loss": 0.2659, + "step": 29851 + }, + { + "epoch": 0.87, + "grad_norm": 1.4990561603353842, + "learning_rate": 4.645538448231957e-07, + "loss": 0.2618, + "step": 29852 + }, + { + "epoch": 0.87, + "grad_norm": 1.3097892201975283, + "learning_rate": 4.6435614685443385e-07, + "loss": 0.2582, + "step": 29853 + }, + { + "epoch": 0.87, + "grad_norm": 1.3539455085121688, + "learning_rate": 4.641584889127798e-07, + "loss": 0.2739, + "step": 29854 + }, + { + "epoch": 0.87, + "grad_norm": 2.0258641029964877, + "learning_rate": 4.639608709999788e-07, + "loss": 0.2545, + "step": 29855 + }, + { + "epoch": 0.87, + "grad_norm": 1.2395649518135092, + "learning_rate": 4.6376329311777215e-07, + "loss": 0.2575, + "step": 29856 + }, + { + "epoch": 0.87, + "grad_norm": 1.3415929038513952, + "learning_rate": 4.635657552679046e-07, + "loss": 0.2562, + "step": 29857 + }, + { + "epoch": 0.87, + "grad_norm": 1.3125154953819382, + "learning_rate": 4.633682574521198e-07, + "loss": 0.288, + "step": 29858 + }, + { + "epoch": 0.87, + "grad_norm": 1.3674563300949323, + "learning_rate": 4.6317079967216127e-07, + "loss": 0.2737, + "step": 29859 + }, + { + "epoch": 0.87, + "grad_norm": 1.3880251315415968, + "learning_rate": 4.629733819297694e-07, + "loss": 0.2782, + "step": 29860 + }, + { + "epoch": 0.87, + "grad_norm": 1.2978074390902226, + "learning_rate": 4.6277600422668823e-07, + "loss": 0.2757, + "step": 29861 + }, + { + "epoch": 0.87, + "grad_norm": 1.8553742828825555, + "learning_rate": 4.625786665646592e-07, + "loss": 0.2366, + "step": 29862 + }, + { + "epoch": 0.87, + "grad_norm": 1.343598640068186, + "learning_rate": 4.6238136894542374e-07, + "loss": 0.2847, + "step": 29863 + }, + { + "epoch": 0.87, + "grad_norm": 1.649842399734744, + "learning_rate": 4.6218411137072314e-07, + "loss": 0.2925, + "step": 29864 + }, + { + "epoch": 0.87, + "grad_norm": 1.3100205128954612, + "learning_rate": 4.619868938422978e-07, + "loss": 0.2707, + "step": 29865 + }, + { + "epoch": 0.87, + "grad_norm": 0.9351845814983424, + "learning_rate": 4.617897163618884e-07, + "loss": 0.5188, + "step": 29866 + }, + { + "epoch": 0.87, + "grad_norm": 1.2570838790450385, + "learning_rate": 4.615925789312353e-07, + "loss": 0.2505, + "step": 29867 + }, + { + "epoch": 0.87, + "grad_norm": 1.404586765532476, + "learning_rate": 4.6139548155207935e-07, + "loss": 0.2775, + "step": 29868 + }, + { + "epoch": 0.87, + "grad_norm": 1.3925139301931613, + "learning_rate": 4.611984242261569e-07, + "loss": 0.2889, + "step": 29869 + }, + { + "epoch": 0.87, + "grad_norm": 1.480701285453632, + "learning_rate": 4.6100140695520935e-07, + "loss": 0.2678, + "step": 29870 + }, + { + "epoch": 0.87, + "grad_norm": 1.3387138666241747, + "learning_rate": 4.608044297409742e-07, + "loss": 0.2749, + "step": 29871 + }, + { + "epoch": 0.87, + "grad_norm": 1.7033599791913827, + "learning_rate": 4.6060749258519055e-07, + "loss": 0.2735, + "step": 29872 + }, + { + "epoch": 0.87, + "grad_norm": 1.3941576578631065, + "learning_rate": 4.6041059548959655e-07, + "loss": 0.2468, + "step": 29873 + }, + { + "epoch": 0.87, + "grad_norm": 1.7927590593990765, + "learning_rate": 4.602137384559291e-07, + "loss": 0.2642, + "step": 29874 + }, + { + "epoch": 0.87, + "grad_norm": 1.3187516018381062, + "learning_rate": 4.600169214859257e-07, + "loss": 0.2622, + "step": 29875 + }, + { + "epoch": 0.87, + "grad_norm": 1.2614963380785418, + "learning_rate": 4.598201445813244e-07, + "loss": 0.267, + "step": 29876 + }, + { + "epoch": 0.87, + "grad_norm": 1.3374868066943757, + "learning_rate": 4.5962340774385936e-07, + "loss": 0.2662, + "step": 29877 + }, + { + "epoch": 0.87, + "grad_norm": 1.7570217354021853, + "learning_rate": 4.594267109752687e-07, + "loss": 0.2748, + "step": 29878 + }, + { + "epoch": 0.87, + "grad_norm": 0.9432636538518633, + "learning_rate": 4.592300542772871e-07, + "loss": 0.5628, + "step": 29879 + }, + { + "epoch": 0.87, + "grad_norm": 1.5923703928797615, + "learning_rate": 4.5903343765165085e-07, + "loss": 0.2909, + "step": 29880 + }, + { + "epoch": 0.87, + "grad_norm": 1.3863777335490897, + "learning_rate": 4.5883686110009483e-07, + "loss": 0.2746, + "step": 29881 + }, + { + "epoch": 0.87, + "grad_norm": 1.254855544896902, + "learning_rate": 4.586403246243543e-07, + "loss": 0.275, + "step": 29882 + }, + { + "epoch": 0.87, + "grad_norm": 1.4799766354900006, + "learning_rate": 4.5844382822616387e-07, + "loss": 0.2762, + "step": 29883 + }, + { + "epoch": 0.87, + "grad_norm": 1.512541814950968, + "learning_rate": 4.582473719072561e-07, + "loss": 0.2822, + "step": 29884 + }, + { + "epoch": 0.87, + "grad_norm": 1.6035599056789807, + "learning_rate": 4.5805095566936575e-07, + "loss": 0.2564, + "step": 29885 + }, + { + "epoch": 0.87, + "grad_norm": 1.277983697217061, + "learning_rate": 4.578545795142264e-07, + "loss": 0.2706, + "step": 29886 + }, + { + "epoch": 0.87, + "grad_norm": 1.8154913379134943, + "learning_rate": 4.576582434435711e-07, + "loss": 0.2665, + "step": 29887 + }, + { + "epoch": 0.87, + "grad_norm": 1.3381820307045498, + "learning_rate": 4.5746194745913187e-07, + "loss": 0.2655, + "step": 29888 + }, + { + "epoch": 0.87, + "grad_norm": 1.2801806702477025, + "learning_rate": 4.572656915626411e-07, + "loss": 0.2612, + "step": 29889 + }, + { + "epoch": 0.87, + "grad_norm": 1.4733319326251053, + "learning_rate": 4.5706947575583085e-07, + "loss": 0.2615, + "step": 29890 + }, + { + "epoch": 0.87, + "grad_norm": 3.05071006558212, + "learning_rate": 4.5687330004043295e-07, + "loss": 0.2832, + "step": 29891 + }, + { + "epoch": 0.87, + "grad_norm": 1.7811337349806116, + "learning_rate": 4.5667716441817833e-07, + "loss": 0.2705, + "step": 29892 + }, + { + "epoch": 0.87, + "grad_norm": 1.3127112894207498, + "learning_rate": 4.564810688907984e-07, + "loss": 0.278, + "step": 29893 + }, + { + "epoch": 0.87, + "grad_norm": 1.3262513899770054, + "learning_rate": 4.562850134600233e-07, + "loss": 0.2714, + "step": 29894 + }, + { + "epoch": 0.87, + "grad_norm": 1.2616534624158189, + "learning_rate": 4.5608899812758457e-07, + "loss": 0.2524, + "step": 29895 + }, + { + "epoch": 0.87, + "grad_norm": 0.9946773999999136, + "learning_rate": 4.5589302289520963e-07, + "loss": 0.5729, + "step": 29896 + }, + { + "epoch": 0.87, + "grad_norm": 2.4154709000898276, + "learning_rate": 4.556970877646294e-07, + "loss": 0.2707, + "step": 29897 + }, + { + "epoch": 0.87, + "grad_norm": 1.3970909687147206, + "learning_rate": 4.555011927375724e-07, + "loss": 0.2638, + "step": 29898 + }, + { + "epoch": 0.87, + "grad_norm": 1.3573032670045209, + "learning_rate": 4.553053378157685e-07, + "loss": 0.2517, + "step": 29899 + }, + { + "epoch": 0.87, + "grad_norm": 1.3401472994691295, + "learning_rate": 4.5510952300094504e-07, + "loss": 0.2804, + "step": 29900 + }, + { + "epoch": 0.87, + "grad_norm": 1.3993538767175406, + "learning_rate": 4.549137482948301e-07, + "loss": 0.2642, + "step": 29901 + }, + { + "epoch": 0.87, + "grad_norm": 1.2918511021830554, + "learning_rate": 4.5471801369915193e-07, + "loss": 0.2833, + "step": 29902 + }, + { + "epoch": 0.87, + "grad_norm": 1.3035444637895472, + "learning_rate": 4.545223192156378e-07, + "loss": 0.2559, + "step": 29903 + }, + { + "epoch": 0.87, + "grad_norm": 1.327712833538683, + "learning_rate": 4.5432666484601597e-07, + "loss": 0.2512, + "step": 29904 + }, + { + "epoch": 0.87, + "grad_norm": 2.232109546523687, + "learning_rate": 4.5413105059201044e-07, + "loss": 0.2829, + "step": 29905 + }, + { + "epoch": 0.87, + "grad_norm": 1.3288799210228965, + "learning_rate": 4.5393547645534887e-07, + "loss": 0.2805, + "step": 29906 + }, + { + "epoch": 0.87, + "grad_norm": 1.4107945942551872, + "learning_rate": 4.53739942437757e-07, + "loss": 0.2668, + "step": 29907 + }, + { + "epoch": 0.87, + "grad_norm": 6.123147666144795, + "learning_rate": 4.535444485409607e-07, + "loss": 0.2825, + "step": 29908 + }, + { + "epoch": 0.87, + "grad_norm": 1.3247500089561117, + "learning_rate": 4.533489947666847e-07, + "loss": 0.2803, + "step": 29909 + }, + { + "epoch": 0.87, + "grad_norm": 1.7506106036175049, + "learning_rate": 4.531535811166549e-07, + "loss": 0.2613, + "step": 29910 + }, + { + "epoch": 0.87, + "grad_norm": 1.3828443369080006, + "learning_rate": 4.529582075925959e-07, + "loss": 0.2702, + "step": 29911 + }, + { + "epoch": 0.87, + "grad_norm": 1.3629056561436088, + "learning_rate": 4.527628741962298e-07, + "loss": 0.3237, + "step": 29912 + }, + { + "epoch": 0.87, + "grad_norm": 1.4132606968520782, + "learning_rate": 4.525675809292823e-07, + "loss": 0.2964, + "step": 29913 + }, + { + "epoch": 0.87, + "grad_norm": 1.8176085076164088, + "learning_rate": 4.5237232779347716e-07, + "loss": 0.2683, + "step": 29914 + }, + { + "epoch": 0.87, + "grad_norm": 1.5025786427858445, + "learning_rate": 4.521771147905357e-07, + "loss": 0.2678, + "step": 29915 + }, + { + "epoch": 0.87, + "grad_norm": 1.412917234331832, + "learning_rate": 4.519819419221816e-07, + "loss": 0.2721, + "step": 29916 + }, + { + "epoch": 0.87, + "grad_norm": 1.3782380797642584, + "learning_rate": 4.517868091901373e-07, + "loss": 0.258, + "step": 29917 + }, + { + "epoch": 0.87, + "grad_norm": 1.3272430159400823, + "learning_rate": 4.5159171659612476e-07, + "loss": 0.272, + "step": 29918 + }, + { + "epoch": 0.87, + "grad_norm": 1.266917628676873, + "learning_rate": 4.51396664141866e-07, + "loss": 0.2478, + "step": 29919 + }, + { + "epoch": 0.87, + "grad_norm": 0.9212532951246011, + "learning_rate": 4.512016518290824e-07, + "loss": 0.5988, + "step": 29920 + }, + { + "epoch": 0.87, + "grad_norm": 1.3737374657945827, + "learning_rate": 4.5100667965949417e-07, + "loss": 0.2617, + "step": 29921 + }, + { + "epoch": 0.87, + "grad_norm": 1.4646746968976914, + "learning_rate": 4.508117476348228e-07, + "loss": 0.3097, + "step": 29922 + }, + { + "epoch": 0.87, + "grad_norm": 1.4213620621920522, + "learning_rate": 4.506168557567886e-07, + "loss": 0.2863, + "step": 29923 + }, + { + "epoch": 0.87, + "grad_norm": 0.9960087347743014, + "learning_rate": 4.5042200402711066e-07, + "loss": 0.5856, + "step": 29924 + }, + { + "epoch": 0.87, + "grad_norm": 1.3197913595003665, + "learning_rate": 4.502271924475093e-07, + "loss": 0.2669, + "step": 29925 + }, + { + "epoch": 0.87, + "grad_norm": 1.3639895560452093, + "learning_rate": 4.500324210197027e-07, + "loss": 0.2731, + "step": 29926 + }, + { + "epoch": 0.87, + "grad_norm": 1.5378781216267423, + "learning_rate": 4.4983768974541095e-07, + "loss": 0.2723, + "step": 29927 + }, + { + "epoch": 0.87, + "grad_norm": 1.2859822144201047, + "learning_rate": 4.4964299862635177e-07, + "loss": 0.2748, + "step": 29928 + }, + { + "epoch": 0.87, + "grad_norm": 1.2769836821122746, + "learning_rate": 4.4944834766424417e-07, + "loss": 0.2608, + "step": 29929 + }, + { + "epoch": 0.87, + "grad_norm": 1.241347332474979, + "learning_rate": 4.492537368608046e-07, + "loss": 0.2583, + "step": 29930 + }, + { + "epoch": 0.87, + "grad_norm": 1.5480215413383789, + "learning_rate": 4.490591662177529e-07, + "loss": 0.3051, + "step": 29931 + }, + { + "epoch": 0.87, + "grad_norm": 1.3456261857097855, + "learning_rate": 4.488646357368032e-07, + "loss": 0.276, + "step": 29932 + }, + { + "epoch": 0.87, + "grad_norm": 1.460848570467453, + "learning_rate": 4.48670145419674e-07, + "loss": 0.2682, + "step": 29933 + }, + { + "epoch": 0.87, + "grad_norm": 1.4436085861853771, + "learning_rate": 4.4847569526808074e-07, + "loss": 0.2819, + "step": 29934 + }, + { + "epoch": 0.87, + "grad_norm": 1.727415671483884, + "learning_rate": 4.4828128528374036e-07, + "loss": 0.2595, + "step": 29935 + }, + { + "epoch": 0.87, + "grad_norm": 1.2996466788224725, + "learning_rate": 4.4808691546836757e-07, + "loss": 0.2696, + "step": 29936 + }, + { + "epoch": 0.87, + "grad_norm": 1.582984846307488, + "learning_rate": 4.478925858236788e-07, + "loss": 0.2609, + "step": 29937 + }, + { + "epoch": 0.87, + "grad_norm": 1.4420858510179144, + "learning_rate": 4.4769829635138816e-07, + "loss": 0.2829, + "step": 29938 + }, + { + "epoch": 0.87, + "grad_norm": 1.5206413852713023, + "learning_rate": 4.475040470532116e-07, + "loss": 0.308, + "step": 29939 + }, + { + "epoch": 0.87, + "grad_norm": 1.90913720638784, + "learning_rate": 4.4730983793086157e-07, + "loss": 0.2953, + "step": 29940 + }, + { + "epoch": 0.87, + "grad_norm": 1.4983945515711243, + "learning_rate": 4.471156689860523e-07, + "loss": 0.2676, + "step": 29941 + }, + { + "epoch": 0.87, + "grad_norm": 1.3122230877425218, + "learning_rate": 4.4692154022049904e-07, + "loss": 0.2448, + "step": 29942 + }, + { + "epoch": 0.87, + "grad_norm": 1.2634281603334976, + "learning_rate": 4.467274516359127e-07, + "loss": 0.2528, + "step": 29943 + }, + { + "epoch": 0.87, + "grad_norm": 1.4472840918198722, + "learning_rate": 4.465334032340074e-07, + "loss": 0.2659, + "step": 29944 + }, + { + "epoch": 0.87, + "grad_norm": 1.4601003312328762, + "learning_rate": 4.4633939501649516e-07, + "loss": 0.283, + "step": 29945 + }, + { + "epoch": 0.87, + "grad_norm": 1.3539074534346383, + "learning_rate": 4.4614542698508787e-07, + "loss": 0.2754, + "step": 29946 + }, + { + "epoch": 0.87, + "grad_norm": 1.289460836690323, + "learning_rate": 4.4595149914149816e-07, + "loss": 0.2603, + "step": 29947 + }, + { + "epoch": 0.87, + "grad_norm": 1.4869282575899225, + "learning_rate": 4.457576114874368e-07, + "loss": 0.2574, + "step": 29948 + }, + { + "epoch": 0.87, + "grad_norm": 1.2545695201562523, + "learning_rate": 4.4556376402461464e-07, + "loss": 0.3322, + "step": 29949 + }, + { + "epoch": 0.87, + "grad_norm": 1.2766467604397378, + "learning_rate": 4.453699567547443e-07, + "loss": 0.2525, + "step": 29950 + }, + { + "epoch": 0.87, + "grad_norm": 1.5462335189953482, + "learning_rate": 4.451761896795337e-07, + "loss": 0.2722, + "step": 29951 + }, + { + "epoch": 0.87, + "grad_norm": 1.3325277940608116, + "learning_rate": 4.4498246280069325e-07, + "loss": 0.2861, + "step": 29952 + }, + { + "epoch": 0.87, + "grad_norm": 1.5381790980478929, + "learning_rate": 4.447887761199338e-07, + "loss": 0.2703, + "step": 29953 + }, + { + "epoch": 0.87, + "grad_norm": 1.408506059221811, + "learning_rate": 4.4459512963896334e-07, + "loss": 0.2752, + "step": 29954 + }, + { + "epoch": 0.87, + "grad_norm": 1.4473396392306679, + "learning_rate": 4.444015233594917e-07, + "loss": 0.2757, + "step": 29955 + }, + { + "epoch": 0.87, + "grad_norm": 0.9553443142996014, + "learning_rate": 4.4420795728322694e-07, + "loss": 0.5403, + "step": 29956 + }, + { + "epoch": 0.87, + "grad_norm": 1.273837889231968, + "learning_rate": 4.440144314118777e-07, + "loss": 0.267, + "step": 29957 + }, + { + "epoch": 0.87, + "grad_norm": 1.1975588035918363, + "learning_rate": 4.438209457471515e-07, + "loss": 0.2502, + "step": 29958 + }, + { + "epoch": 0.87, + "grad_norm": 1.2475229198621824, + "learning_rate": 4.4362750029075697e-07, + "loss": 0.2557, + "step": 29959 + }, + { + "epoch": 0.87, + "grad_norm": 1.3016296639323368, + "learning_rate": 4.434340950443994e-07, + "loss": 0.2604, + "step": 29960 + }, + { + "epoch": 0.87, + "grad_norm": 1.2841547929270263, + "learning_rate": 4.432407300097863e-07, + "loss": 0.2544, + "step": 29961 + }, + { + "epoch": 0.87, + "grad_norm": 1.344842111350335, + "learning_rate": 4.4304740518862465e-07, + "loss": 0.2666, + "step": 29962 + }, + { + "epoch": 0.87, + "grad_norm": 1.435624869770709, + "learning_rate": 4.428541205826198e-07, + "loss": 0.2883, + "step": 29963 + }, + { + "epoch": 0.87, + "grad_norm": 1.2534010171758894, + "learning_rate": 4.4266087619347807e-07, + "loss": 0.261, + "step": 29964 + }, + { + "epoch": 0.87, + "grad_norm": 1.317341102234714, + "learning_rate": 4.4246767202290487e-07, + "loss": 0.2624, + "step": 29965 + }, + { + "epoch": 0.87, + "grad_norm": 0.99155234045715, + "learning_rate": 4.4227450807260486e-07, + "loss": 0.5698, + "step": 29966 + }, + { + "epoch": 0.87, + "grad_norm": 1.395491583778845, + "learning_rate": 4.4208138434428394e-07, + "loss": 0.2642, + "step": 29967 + }, + { + "epoch": 0.87, + "grad_norm": 1.3970651059921695, + "learning_rate": 4.418883008396441e-07, + "loss": 0.2835, + "step": 29968 + }, + { + "epoch": 0.87, + "grad_norm": 1.3244776154192552, + "learning_rate": 4.4169525756039164e-07, + "loss": 0.2633, + "step": 29969 + }, + { + "epoch": 0.87, + "grad_norm": 1.9697814101407705, + "learning_rate": 4.4150225450822813e-07, + "loss": 0.2873, + "step": 29970 + }, + { + "epoch": 0.87, + "grad_norm": 4.114722764525453, + "learning_rate": 4.4130929168485827e-07, + "loss": 0.2627, + "step": 29971 + }, + { + "epoch": 0.87, + "grad_norm": 1.4294266654611778, + "learning_rate": 4.4111636909198395e-07, + "loss": 0.2739, + "step": 29972 + }, + { + "epoch": 0.87, + "grad_norm": 1.5026314331942046, + "learning_rate": 4.4092348673130834e-07, + "loss": 0.2883, + "step": 29973 + }, + { + "epoch": 0.87, + "grad_norm": 1.7892138847808445, + "learning_rate": 4.40730644604534e-07, + "loss": 0.2859, + "step": 29974 + }, + { + "epoch": 0.87, + "grad_norm": 1.3251357231676573, + "learning_rate": 4.405378427133616e-07, + "loss": 0.2721, + "step": 29975 + }, + { + "epoch": 0.87, + "grad_norm": 1.3440931438254489, + "learning_rate": 4.4034508105949336e-07, + "loss": 0.2974, + "step": 29976 + }, + { + "epoch": 0.87, + "grad_norm": 1.3817995047546443, + "learning_rate": 4.401523596446311e-07, + "loss": 0.263, + "step": 29977 + }, + { + "epoch": 0.87, + "grad_norm": 2.7755678141186997, + "learning_rate": 4.3995967847047516e-07, + "loss": 0.2584, + "step": 29978 + }, + { + "epoch": 0.87, + "grad_norm": 1.4843712555269697, + "learning_rate": 4.3976703753872473e-07, + "loss": 0.2635, + "step": 29979 + }, + { + "epoch": 0.87, + "grad_norm": 1.3807481352444235, + "learning_rate": 4.395744368510807e-07, + "loss": 0.2856, + "step": 29980 + }, + { + "epoch": 0.87, + "grad_norm": 1.2779760242972746, + "learning_rate": 4.3938187640924337e-07, + "loss": 0.2557, + "step": 29981 + }, + { + "epoch": 0.87, + "grad_norm": 1.3098516693099023, + "learning_rate": 4.391893562149113e-07, + "loss": 0.262, + "step": 29982 + }, + { + "epoch": 0.87, + "grad_norm": 1.420202653022854, + "learning_rate": 4.389968762697838e-07, + "loss": 0.2801, + "step": 29983 + }, + { + "epoch": 0.87, + "grad_norm": 1.330831599167078, + "learning_rate": 4.388044365755595e-07, + "loss": 0.2521, + "step": 29984 + }, + { + "epoch": 0.87, + "grad_norm": 1.3746179620736638, + "learning_rate": 4.386120371339364e-07, + "loss": 0.2613, + "step": 29985 + }, + { + "epoch": 0.87, + "grad_norm": 1.3654972634286404, + "learning_rate": 4.3841967794661376e-07, + "loss": 0.2583, + "step": 29986 + }, + { + "epoch": 0.87, + "grad_norm": 1.4820746902637822, + "learning_rate": 4.382273590152869e-07, + "loss": 0.2811, + "step": 29987 + }, + { + "epoch": 0.87, + "grad_norm": 2.192751400145695, + "learning_rate": 4.380350803416544e-07, + "loss": 0.2864, + "step": 29988 + }, + { + "epoch": 0.87, + "grad_norm": 1.364129820461553, + "learning_rate": 4.3784284192741276e-07, + "loss": 0.2546, + "step": 29989 + }, + { + "epoch": 0.87, + "grad_norm": 1.242513986337778, + "learning_rate": 4.376506437742589e-07, + "loss": 0.2728, + "step": 29990 + }, + { + "epoch": 0.87, + "grad_norm": 1.4108257939508813, + "learning_rate": 4.3745848588388874e-07, + "loss": 0.2686, + "step": 29991 + }, + { + "epoch": 0.87, + "grad_norm": 2.5874873009070853, + "learning_rate": 4.372663682579981e-07, + "loss": 0.3026, + "step": 29992 + }, + { + "epoch": 0.87, + "grad_norm": 1.3187512577824376, + "learning_rate": 4.3707429089828223e-07, + "loss": 0.2665, + "step": 29993 + }, + { + "epoch": 0.87, + "grad_norm": 1.7729088961920432, + "learning_rate": 4.3688225380643656e-07, + "loss": 0.2575, + "step": 29994 + }, + { + "epoch": 0.87, + "grad_norm": 1.2847674256367718, + "learning_rate": 4.3669025698415636e-07, + "loss": 0.2734, + "step": 29995 + }, + { + "epoch": 0.87, + "grad_norm": 1.4740990597779462, + "learning_rate": 4.364983004331341e-07, + "loss": 0.2869, + "step": 29996 + }, + { + "epoch": 0.87, + "grad_norm": 1.379896381550957, + "learning_rate": 4.363063841550663e-07, + "loss": 0.2624, + "step": 29997 + }, + { + "epoch": 0.87, + "grad_norm": 1.4739107090814698, + "learning_rate": 4.361145081516438e-07, + "loss": 0.2616, + "step": 29998 + }, + { + "epoch": 0.87, + "grad_norm": 1.3059631700419436, + "learning_rate": 4.359226724245619e-07, + "loss": 0.2632, + "step": 29999 + }, + { + "epoch": 0.87, + "grad_norm": 1.5186099897173275, + "learning_rate": 4.357308769755131e-07, + "loss": 0.2685, + "step": 30000 + }, + { + "epoch": 0.87, + "grad_norm": 1.602735958396658, + "learning_rate": 4.3553912180618995e-07, + "loss": 0.285, + "step": 30001 + }, + { + "epoch": 0.87, + "grad_norm": 1.4169452750940557, + "learning_rate": 4.3534740691828446e-07, + "loss": 0.2571, + "step": 30002 + }, + { + "epoch": 0.87, + "grad_norm": 1.441016482473074, + "learning_rate": 4.351557323134886e-07, + "loss": 0.3008, + "step": 30003 + }, + { + "epoch": 0.87, + "grad_norm": 1.4709747984635837, + "learning_rate": 4.349640979934944e-07, + "loss": 0.2749, + "step": 30004 + }, + { + "epoch": 0.87, + "grad_norm": 1.4357417514289885, + "learning_rate": 4.3477250395999315e-07, + "loss": 0.2879, + "step": 30005 + }, + { + "epoch": 0.87, + "grad_norm": 1.437004474046357, + "learning_rate": 4.345809502146747e-07, + "loss": 0.2744, + "step": 30006 + }, + { + "epoch": 0.87, + "grad_norm": 1.3393067502249532, + "learning_rate": 4.3438943675922997e-07, + "loss": 0.262, + "step": 30007 + }, + { + "epoch": 0.87, + "grad_norm": 1.4015381951098393, + "learning_rate": 4.3419796359534916e-07, + "loss": 0.2623, + "step": 30008 + }, + { + "epoch": 0.87, + "grad_norm": 1.2834729675096603, + "learning_rate": 4.3400653072472157e-07, + "loss": 0.2711, + "step": 30009 + }, + { + "epoch": 0.87, + "grad_norm": 1.4933786889047762, + "learning_rate": 4.338151381490374e-07, + "loss": 0.3019, + "step": 30010 + }, + { + "epoch": 0.87, + "grad_norm": 1.4267117599030872, + "learning_rate": 4.336237858699849e-07, + "loss": 0.3085, + "step": 30011 + }, + { + "epoch": 0.87, + "grad_norm": 1.8029808265193885, + "learning_rate": 4.334324738892537e-07, + "loss": 0.2632, + "step": 30012 + }, + { + "epoch": 0.87, + "grad_norm": 1.3154868184841277, + "learning_rate": 4.3324120220853083e-07, + "loss": 0.2744, + "step": 30013 + }, + { + "epoch": 0.87, + "grad_norm": 0.9911336861190547, + "learning_rate": 4.330499708295066e-07, + "loss": 0.599, + "step": 30014 + }, + { + "epoch": 0.87, + "grad_norm": 1.7477672510084143, + "learning_rate": 4.328587797538658e-07, + "loss": 0.2808, + "step": 30015 + }, + { + "epoch": 0.87, + "grad_norm": 1.589739243278707, + "learning_rate": 4.3266762898329704e-07, + "loss": 0.2767, + "step": 30016 + }, + { + "epoch": 0.87, + "grad_norm": 1.7565302015487405, + "learning_rate": 4.3247651851948736e-07, + "loss": 0.285, + "step": 30017 + }, + { + "epoch": 0.87, + "grad_norm": 3.002859710556832, + "learning_rate": 4.3228544836412254e-07, + "loss": 0.2829, + "step": 30018 + }, + { + "epoch": 0.87, + "grad_norm": 1.9488853283892735, + "learning_rate": 4.3209441851888967e-07, + "loss": 0.3264, + "step": 30019 + }, + { + "epoch": 0.87, + "grad_norm": 1.8197833521974636, + "learning_rate": 4.31903428985474e-07, + "loss": 0.2567, + "step": 30020 + }, + { + "epoch": 0.87, + "grad_norm": 1.2857538188657782, + "learning_rate": 4.317124797655614e-07, + "loss": 0.2523, + "step": 30021 + }, + { + "epoch": 0.87, + "grad_norm": 1.33121897551485, + "learning_rate": 4.315215708608378e-07, + "loss": 0.2641, + "step": 30022 + }, + { + "epoch": 0.87, + "grad_norm": 1.3802576910041167, + "learning_rate": 4.313307022729862e-07, + "loss": 0.2773, + "step": 30023 + }, + { + "epoch": 0.87, + "grad_norm": 1.3717336249015089, + "learning_rate": 4.3113987400369264e-07, + "loss": 0.2911, + "step": 30024 + }, + { + "epoch": 0.87, + "grad_norm": 1.52905649973905, + "learning_rate": 4.309490860546395e-07, + "loss": 0.2705, + "step": 30025 + }, + { + "epoch": 0.87, + "grad_norm": 1.3969948640605054, + "learning_rate": 4.30758338427511e-07, + "loss": 0.2805, + "step": 30026 + }, + { + "epoch": 0.87, + "grad_norm": 1.298586595486593, + "learning_rate": 4.305676311239915e-07, + "loss": 0.2876, + "step": 30027 + }, + { + "epoch": 0.87, + "grad_norm": 1.5286343541263072, + "learning_rate": 4.303769641457628e-07, + "loss": 0.2782, + "step": 30028 + }, + { + "epoch": 0.87, + "grad_norm": 1.4583111399762367, + "learning_rate": 4.301863374945081e-07, + "loss": 0.2704, + "step": 30029 + }, + { + "epoch": 0.87, + "grad_norm": 1.2778968666770065, + "learning_rate": 4.2999575117190995e-07, + "loss": 0.2711, + "step": 30030 + }, + { + "epoch": 0.87, + "grad_norm": 1.5546548719333402, + "learning_rate": 4.2980520517964974e-07, + "loss": 0.2713, + "step": 30031 + }, + { + "epoch": 0.87, + "grad_norm": 1.3647457197452373, + "learning_rate": 4.296146995194095e-07, + "loss": 0.2676, + "step": 30032 + }, + { + "epoch": 0.87, + "grad_norm": 1.1831592766086994, + "learning_rate": 4.2942423419287115e-07, + "loss": 0.257, + "step": 30033 + }, + { + "epoch": 0.87, + "grad_norm": 1.4636335768940303, + "learning_rate": 4.292338092017134e-07, + "loss": 0.2665, + "step": 30034 + }, + { + "epoch": 0.87, + "grad_norm": 1.4246631045501672, + "learning_rate": 4.2904342454761825e-07, + "loss": 0.2655, + "step": 30035 + }, + { + "epoch": 0.87, + "grad_norm": 1.3123011159700289, + "learning_rate": 4.28853080232266e-07, + "loss": 0.2489, + "step": 30036 + }, + { + "epoch": 0.87, + "grad_norm": 1.4644531016162736, + "learning_rate": 4.286627762573353e-07, + "loss": 0.287, + "step": 30037 + }, + { + "epoch": 0.87, + "grad_norm": 1.4019806638117986, + "learning_rate": 4.284725126245065e-07, + "loss": 0.2733, + "step": 30038 + }, + { + "epoch": 0.87, + "grad_norm": 1.4276068538859898, + "learning_rate": 4.282822893354588e-07, + "loss": 0.2754, + "step": 30039 + }, + { + "epoch": 0.87, + "grad_norm": 1.2209145132761428, + "learning_rate": 4.2809210639187024e-07, + "loss": 0.2579, + "step": 30040 + }, + { + "epoch": 0.87, + "grad_norm": 1.4101892032436605, + "learning_rate": 4.2790196379542073e-07, + "loss": 0.2729, + "step": 30041 + }, + { + "epoch": 0.87, + "grad_norm": 1.2865838573145223, + "learning_rate": 4.2771186154778657e-07, + "loss": 0.2591, + "step": 30042 + }, + { + "epoch": 0.87, + "grad_norm": 1.5234873305450705, + "learning_rate": 4.275217996506453e-07, + "loss": 0.2478, + "step": 30043 + }, + { + "epoch": 0.87, + "grad_norm": 1.984266692563303, + "learning_rate": 4.273317781056752e-07, + "loss": 0.2835, + "step": 30044 + }, + { + "epoch": 0.87, + "grad_norm": 1.4395698838437354, + "learning_rate": 4.271417969145525e-07, + "loss": 0.2776, + "step": 30045 + }, + { + "epoch": 0.87, + "grad_norm": 1.2806783610629096, + "learning_rate": 4.269518560789543e-07, + "loss": 0.275, + "step": 30046 + }, + { + "epoch": 0.87, + "grad_norm": 1.497353679759357, + "learning_rate": 4.26761955600557e-07, + "loss": 0.2524, + "step": 30047 + }, + { + "epoch": 0.87, + "grad_norm": 1.7552383233226607, + "learning_rate": 4.2657209548103596e-07, + "loss": 0.269, + "step": 30048 + }, + { + "epoch": 0.87, + "grad_norm": 1.283032206150373, + "learning_rate": 4.2638227572206703e-07, + "loss": 0.2961, + "step": 30049 + }, + { + "epoch": 0.87, + "grad_norm": 1.4853832250912513, + "learning_rate": 4.261924963253261e-07, + "loss": 0.2893, + "step": 30050 + }, + { + "epoch": 0.87, + "grad_norm": 0.9510844951144322, + "learning_rate": 4.260027572924863e-07, + "loss": 0.5553, + "step": 30051 + }, + { + "epoch": 0.87, + "grad_norm": 1.3247276975692197, + "learning_rate": 4.258130586252235e-07, + "loss": 0.2577, + "step": 30052 + }, + { + "epoch": 0.87, + "grad_norm": 1.2814774070235253, + "learning_rate": 4.2562340032521075e-07, + "loss": 0.2416, + "step": 30053 + }, + { + "epoch": 0.87, + "grad_norm": 1.3752752788575409, + "learning_rate": 4.254337823941218e-07, + "loss": 0.2991, + "step": 30054 + }, + { + "epoch": 0.87, + "grad_norm": 1.4025822837652633, + "learning_rate": 4.2524420483363084e-07, + "loss": 0.2875, + "step": 30055 + }, + { + "epoch": 0.87, + "grad_norm": 1.3268156688168085, + "learning_rate": 4.2505466764541036e-07, + "loss": 0.2802, + "step": 30056 + }, + { + "epoch": 0.87, + "grad_norm": 1.297882085959944, + "learning_rate": 4.24865170831133e-07, + "loss": 0.2624, + "step": 30057 + }, + { + "epoch": 0.87, + "grad_norm": 1.3407424655919753, + "learning_rate": 4.246757143924718e-07, + "loss": 0.2792, + "step": 30058 + }, + { + "epoch": 0.87, + "grad_norm": 0.9275960554621636, + "learning_rate": 4.244862983310977e-07, + "loss": 0.5618, + "step": 30059 + }, + { + "epoch": 0.87, + "grad_norm": 1.472096577614632, + "learning_rate": 4.2429692264868373e-07, + "loss": 0.2934, + "step": 30060 + }, + { + "epoch": 0.87, + "grad_norm": 1.3445995179064987, + "learning_rate": 4.2410758734689915e-07, + "loss": 0.26, + "step": 30061 + }, + { + "epoch": 0.87, + "grad_norm": 1.2920951959097524, + "learning_rate": 4.2391829242741653e-07, + "loss": 0.2697, + "step": 30062 + }, + { + "epoch": 0.87, + "grad_norm": 1.5093350493180377, + "learning_rate": 4.237290378919051e-07, + "loss": 0.2665, + "step": 30063 + }, + { + "epoch": 0.87, + "grad_norm": 1.5101173886458845, + "learning_rate": 4.2353982374203574e-07, + "loss": 0.275, + "step": 30064 + }, + { + "epoch": 0.87, + "grad_norm": 1.2244855870572897, + "learning_rate": 4.2335064997947874e-07, + "loss": 0.2531, + "step": 30065 + }, + { + "epoch": 0.87, + "grad_norm": 1.7660142051720042, + "learning_rate": 4.2316151660590223e-07, + "loss": 0.2772, + "step": 30066 + }, + { + "epoch": 0.87, + "grad_norm": 1.4757863423239832, + "learning_rate": 4.229724236229765e-07, + "loss": 0.2741, + "step": 30067 + }, + { + "epoch": 0.87, + "grad_norm": 1.3184010987916217, + "learning_rate": 4.2278337103236976e-07, + "loss": 0.2847, + "step": 30068 + }, + { + "epoch": 0.87, + "grad_norm": 1.4507932762533877, + "learning_rate": 4.2259435883575175e-07, + "loss": 0.2661, + "step": 30069 + }, + { + "epoch": 0.87, + "grad_norm": 1.3243669556003768, + "learning_rate": 4.224053870347883e-07, + "loss": 0.281, + "step": 30070 + }, + { + "epoch": 0.87, + "grad_norm": 1.4122638866177346, + "learning_rate": 4.222164556311481e-07, + "loss": 0.2862, + "step": 30071 + }, + { + "epoch": 0.87, + "grad_norm": 1.3664700299365946, + "learning_rate": 4.220275646264982e-07, + "loss": 0.2418, + "step": 30072 + }, + { + "epoch": 0.87, + "grad_norm": 1.6448743793370064, + "learning_rate": 4.218387140225061e-07, + "loss": 0.2555, + "step": 30073 + }, + { + "epoch": 0.87, + "grad_norm": 1.4327981416639988, + "learning_rate": 4.216499038208383e-07, + "loss": 0.2569, + "step": 30074 + }, + { + "epoch": 0.87, + "grad_norm": 1.7665412830027594, + "learning_rate": 4.214611340231606e-07, + "loss": 0.2845, + "step": 30075 + }, + { + "epoch": 0.87, + "grad_norm": 1.4502366814439323, + "learning_rate": 4.2127240463113896e-07, + "loss": 0.2777, + "step": 30076 + }, + { + "epoch": 0.87, + "grad_norm": 1.3940834264276833, + "learning_rate": 4.2108371564644034e-07, + "loss": 0.2567, + "step": 30077 + }, + { + "epoch": 0.87, + "grad_norm": 1.449213702444826, + "learning_rate": 4.208950670707279e-07, + "loss": 0.2713, + "step": 30078 + }, + { + "epoch": 0.87, + "grad_norm": 1.3250586573072296, + "learning_rate": 4.207064589056681e-07, + "loss": 0.2674, + "step": 30079 + }, + { + "epoch": 0.87, + "grad_norm": 1.3071224592034654, + "learning_rate": 4.2051789115292395e-07, + "loss": 0.2602, + "step": 30080 + }, + { + "epoch": 0.87, + "grad_norm": 1.4095585370264951, + "learning_rate": 4.203293638141598e-07, + "loss": 0.2678, + "step": 30081 + }, + { + "epoch": 0.87, + "grad_norm": 1.356985831250465, + "learning_rate": 4.2014087689103975e-07, + "loss": 0.2766, + "step": 30082 + }, + { + "epoch": 0.87, + "grad_norm": 1.4292005133561048, + "learning_rate": 4.199524303852276e-07, + "loss": 0.2717, + "step": 30083 + }, + { + "epoch": 0.87, + "grad_norm": 1.4725412666995026, + "learning_rate": 4.197640242983858e-07, + "loss": 0.2513, + "step": 30084 + }, + { + "epoch": 0.87, + "grad_norm": 1.3153021872538322, + "learning_rate": 4.195756586321775e-07, + "loss": 0.2791, + "step": 30085 + }, + { + "epoch": 0.87, + "grad_norm": 1.3569174976030194, + "learning_rate": 4.193873333882648e-07, + "loss": 0.2695, + "step": 30086 + }, + { + "epoch": 0.87, + "grad_norm": 1.432407095378266, + "learning_rate": 4.1919904856831007e-07, + "loss": 0.2739, + "step": 30087 + }, + { + "epoch": 0.87, + "grad_norm": 1.3240174285341695, + "learning_rate": 4.190108041739749e-07, + "loss": 0.2847, + "step": 30088 + }, + { + "epoch": 0.87, + "grad_norm": 3.7617360284982473, + "learning_rate": 4.1882260020691956e-07, + "loss": 0.2705, + "step": 30089 + }, + { + "epoch": 0.87, + "grad_norm": 1.326420580010174, + "learning_rate": 4.1863443666880497e-07, + "loss": 0.2864, + "step": 30090 + }, + { + "epoch": 0.87, + "grad_norm": 1.402991582081491, + "learning_rate": 4.1844631356129317e-07, + "loss": 0.287, + "step": 30091 + }, + { + "epoch": 0.87, + "grad_norm": 1.224593075094635, + "learning_rate": 4.1825823088604334e-07, + "loss": 0.2623, + "step": 30092 + }, + { + "epoch": 0.87, + "grad_norm": 1.3633181998076127, + "learning_rate": 4.180701886447147e-07, + "loss": 0.2565, + "step": 30093 + }, + { + "epoch": 0.87, + "grad_norm": 1.296648721160349, + "learning_rate": 4.178821868389682e-07, + "loss": 0.2894, + "step": 30094 + }, + { + "epoch": 0.87, + "grad_norm": 1.2120655325756262, + "learning_rate": 4.176942254704619e-07, + "loss": 0.2736, + "step": 30095 + }, + { + "epoch": 0.87, + "grad_norm": 1.7903056544153313, + "learning_rate": 4.175063045408562e-07, + "loss": 0.2571, + "step": 30096 + }, + { + "epoch": 0.87, + "grad_norm": 1.5741584766536187, + "learning_rate": 4.173184240518069e-07, + "loss": 0.2845, + "step": 30097 + }, + { + "epoch": 0.87, + "grad_norm": 3.4132852406961245, + "learning_rate": 4.171305840049733e-07, + "loss": 0.2712, + "step": 30098 + }, + { + "epoch": 0.87, + "grad_norm": 1.4213084054911145, + "learning_rate": 4.169427844020135e-07, + "loss": 0.2885, + "step": 30099 + }, + { + "epoch": 0.87, + "grad_norm": 1.3058617511139476, + "learning_rate": 4.1675502524458456e-07, + "loss": 0.2783, + "step": 30100 + }, + { + "epoch": 0.87, + "grad_norm": 1.432506288065432, + "learning_rate": 4.1656730653434285e-07, + "loss": 0.2813, + "step": 30101 + }, + { + "epoch": 0.87, + "grad_norm": 1.4307337725730553, + "learning_rate": 4.16379628272946e-07, + "loss": 0.2687, + "step": 30102 + }, + { + "epoch": 0.87, + "grad_norm": 4.376730825055701, + "learning_rate": 4.1619199046204926e-07, + "loss": 0.2804, + "step": 30103 + }, + { + "epoch": 0.87, + "grad_norm": 1.3737849221514182, + "learning_rate": 4.160043931033092e-07, + "loss": 0.2702, + "step": 30104 + }, + { + "epoch": 0.87, + "grad_norm": 1.30813342339628, + "learning_rate": 4.158168361983822e-07, + "loss": 0.256, + "step": 30105 + }, + { + "epoch": 0.87, + "grad_norm": 1.2905088938449658, + "learning_rate": 4.1562931974892196e-07, + "loss": 0.2556, + "step": 30106 + }, + { + "epoch": 0.87, + "grad_norm": 1.2211763632351484, + "learning_rate": 4.1544184375658326e-07, + "loss": 0.2704, + "step": 30107 + }, + { + "epoch": 0.87, + "grad_norm": 1.5524375074143506, + "learning_rate": 4.152544082230225e-07, + "loss": 0.2708, + "step": 30108 + }, + { + "epoch": 0.87, + "grad_norm": 1.3808333451606525, + "learning_rate": 4.150670131498913e-07, + "loss": 0.2708, + "step": 30109 + }, + { + "epoch": 0.87, + "grad_norm": 1.2875888860151994, + "learning_rate": 4.1487965853884424e-07, + "loss": 0.2698, + "step": 30110 + }, + { + "epoch": 0.87, + "grad_norm": 1.4301285897091545, + "learning_rate": 4.1469234439153573e-07, + "loss": 0.2739, + "step": 30111 + }, + { + "epoch": 0.87, + "grad_norm": 1.358975366384689, + "learning_rate": 4.1450507070961766e-07, + "loss": 0.2733, + "step": 30112 + }, + { + "epoch": 0.87, + "grad_norm": 1.2329871549006932, + "learning_rate": 4.1431783749474375e-07, + "loss": 0.2509, + "step": 30113 + }, + { + "epoch": 0.87, + "grad_norm": 1.4728803038540845, + "learning_rate": 4.1413064474856544e-07, + "loss": 0.2845, + "step": 30114 + }, + { + "epoch": 0.87, + "grad_norm": 1.433761423502354, + "learning_rate": 4.139434924727359e-07, + "loss": 0.2648, + "step": 30115 + }, + { + "epoch": 0.87, + "grad_norm": 1.3546087862202878, + "learning_rate": 4.137563806689049e-07, + "loss": 0.2825, + "step": 30116 + }, + { + "epoch": 0.87, + "grad_norm": 1.4725685345313082, + "learning_rate": 4.13569309338725e-07, + "loss": 0.2798, + "step": 30117 + }, + { + "epoch": 0.87, + "grad_norm": 1.3856354028857445, + "learning_rate": 4.1338227848384647e-07, + "loss": 0.26, + "step": 30118 + }, + { + "epoch": 0.87, + "grad_norm": 1.4953729494868202, + "learning_rate": 4.1319528810592033e-07, + "loss": 0.2881, + "step": 30119 + }, + { + "epoch": 0.87, + "grad_norm": 1.3868753436416312, + "learning_rate": 4.130083382065969e-07, + "loss": 0.2996, + "step": 30120 + }, + { + "epoch": 0.87, + "grad_norm": 0.967947255131416, + "learning_rate": 4.12821428787526e-07, + "loss": 0.6064, + "step": 30121 + }, + { + "epoch": 0.87, + "grad_norm": 1.4235107994728222, + "learning_rate": 4.1263455985035615e-07, + "loss": 0.312, + "step": 30122 + }, + { + "epoch": 0.87, + "grad_norm": 1.276713529578494, + "learning_rate": 4.124477313967379e-07, + "loss": 0.2859, + "step": 30123 + }, + { + "epoch": 0.87, + "grad_norm": 1.3776874576288196, + "learning_rate": 4.122609434283198e-07, + "loss": 0.2549, + "step": 30124 + }, + { + "epoch": 0.87, + "grad_norm": 1.2359893280753487, + "learning_rate": 4.1207419594674893e-07, + "loss": 0.24, + "step": 30125 + }, + { + "epoch": 0.87, + "grad_norm": 1.5241969900894046, + "learning_rate": 4.1188748895367393e-07, + "loss": 0.2774, + "step": 30126 + }, + { + "epoch": 0.87, + "grad_norm": 1.3467243655254928, + "learning_rate": 4.1170082245074295e-07, + "loss": 0.3086, + "step": 30127 + }, + { + "epoch": 0.87, + "grad_norm": 1.5873021295886773, + "learning_rate": 4.1151419643960357e-07, + "loss": 0.2624, + "step": 30128 + }, + { + "epoch": 0.87, + "grad_norm": 1.22759563319316, + "learning_rate": 4.1132761092190164e-07, + "loss": 0.2427, + "step": 30129 + }, + { + "epoch": 0.87, + "grad_norm": 1.3860668010754251, + "learning_rate": 4.1114106589928483e-07, + "loss": 0.3056, + "step": 30130 + }, + { + "epoch": 0.87, + "grad_norm": 1.341001650266675, + "learning_rate": 4.1095456137339895e-07, + "loss": 0.2714, + "step": 30131 + }, + { + "epoch": 0.87, + "grad_norm": 1.367510796519272, + "learning_rate": 4.107680973458905e-07, + "loss": 0.2644, + "step": 30132 + }, + { + "epoch": 0.87, + "grad_norm": 1.4051244420304994, + "learning_rate": 4.105816738184043e-07, + "loss": 0.2698, + "step": 30133 + }, + { + "epoch": 0.87, + "grad_norm": 1.3233510701962905, + "learning_rate": 4.103952907925851e-07, + "loss": 0.2583, + "step": 30134 + }, + { + "epoch": 0.87, + "grad_norm": 1.3231019202484018, + "learning_rate": 4.1020894827007885e-07, + "loss": 0.2715, + "step": 30135 + }, + { + "epoch": 0.87, + "grad_norm": 1.4459721817144473, + "learning_rate": 4.100226462525303e-07, + "loss": 0.2592, + "step": 30136 + }, + { + "epoch": 0.87, + "grad_norm": 1.2723498995883404, + "learning_rate": 4.098363847415815e-07, + "loss": 0.2707, + "step": 30137 + }, + { + "epoch": 0.87, + "grad_norm": 2.2162540428469177, + "learning_rate": 4.0965016373887836e-07, + "loss": 0.2705, + "step": 30138 + }, + { + "epoch": 0.87, + "grad_norm": 1.3634494422664878, + "learning_rate": 4.0946398324606287e-07, + "loss": 0.2685, + "step": 30139 + }, + { + "epoch": 0.87, + "grad_norm": 1.3937580665865925, + "learning_rate": 4.092778432647787e-07, + "loss": 0.2891, + "step": 30140 + }, + { + "epoch": 0.87, + "grad_norm": 1.4513246321969713, + "learning_rate": 4.090917437966685e-07, + "loss": 0.2737, + "step": 30141 + }, + { + "epoch": 0.87, + "grad_norm": 1.252433214875389, + "learning_rate": 4.0890568484337534e-07, + "loss": 0.2544, + "step": 30142 + }, + { + "epoch": 0.87, + "grad_norm": 1.4008784983531428, + "learning_rate": 4.0871966640654016e-07, + "loss": 0.286, + "step": 30143 + }, + { + "epoch": 0.87, + "grad_norm": 1.6192815505629965, + "learning_rate": 4.0853368848780437e-07, + "loss": 0.2656, + "step": 30144 + }, + { + "epoch": 0.87, + "grad_norm": 1.4980885585226882, + "learning_rate": 4.083477510888095e-07, + "loss": 0.2961, + "step": 30145 + }, + { + "epoch": 0.87, + "grad_norm": 1.3613126917200877, + "learning_rate": 4.0816185421119705e-07, + "loss": 0.2544, + "step": 30146 + }, + { + "epoch": 0.87, + "grad_norm": 1.281460137807718, + "learning_rate": 4.0797599785660724e-07, + "loss": 0.2857, + "step": 30147 + }, + { + "epoch": 0.87, + "grad_norm": 1.6423842062538003, + "learning_rate": 4.0779018202667995e-07, + "loss": 0.2758, + "step": 30148 + }, + { + "epoch": 0.87, + "grad_norm": 1.3576048126704585, + "learning_rate": 4.0760440672305503e-07, + "loss": 0.28, + "step": 30149 + }, + { + "epoch": 0.87, + "grad_norm": 1.287541036884108, + "learning_rate": 4.074186719473727e-07, + "loss": 0.2524, + "step": 30150 + }, + { + "epoch": 0.87, + "grad_norm": 1.4074324662056092, + "learning_rate": 4.0723297770127237e-07, + "loss": 0.2768, + "step": 30151 + }, + { + "epoch": 0.87, + "grad_norm": 1.4096897910953625, + "learning_rate": 4.0704732398639037e-07, + "loss": 0.277, + "step": 30152 + }, + { + "epoch": 0.87, + "grad_norm": 1.3791689836684184, + "learning_rate": 4.0686171080436767e-07, + "loss": 0.2666, + "step": 30153 + }, + { + "epoch": 0.87, + "grad_norm": 3.9242013689378736, + "learning_rate": 4.066761381568407e-07, + "loss": 0.2796, + "step": 30154 + }, + { + "epoch": 0.87, + "grad_norm": 1.3577452117594866, + "learning_rate": 4.0649060604544767e-07, + "loss": 0.2598, + "step": 30155 + }, + { + "epoch": 0.87, + "grad_norm": 1.3748828839407228, + "learning_rate": 4.0630511447182663e-07, + "loss": 0.2563, + "step": 30156 + }, + { + "epoch": 0.87, + "grad_norm": 1.505693681140028, + "learning_rate": 4.0611966343761357e-07, + "loss": 0.3216, + "step": 30157 + }, + { + "epoch": 0.87, + "grad_norm": 1.5226162403520491, + "learning_rate": 4.0593425294444544e-07, + "loss": 0.2559, + "step": 30158 + }, + { + "epoch": 0.87, + "grad_norm": 1.6082442828510133, + "learning_rate": 4.057488829939582e-07, + "loss": 0.2673, + "step": 30159 + }, + { + "epoch": 0.87, + "grad_norm": 1.3096347344334052, + "learning_rate": 4.055635535877894e-07, + "loss": 0.2967, + "step": 30160 + }, + { + "epoch": 0.87, + "grad_norm": 1.5043981463957072, + "learning_rate": 4.053782647275717e-07, + "loss": 0.2944, + "step": 30161 + }, + { + "epoch": 0.87, + "grad_norm": 1.282892022781776, + "learning_rate": 4.0519301641494257e-07, + "loss": 0.2392, + "step": 30162 + }, + { + "epoch": 0.87, + "grad_norm": 1.3244254952711334, + "learning_rate": 4.0500780865153524e-07, + "loss": 0.2561, + "step": 30163 + }, + { + "epoch": 0.87, + "grad_norm": 1.4333301800721057, + "learning_rate": 4.048226414389866e-07, + "loss": 0.2754, + "step": 30164 + }, + { + "epoch": 0.87, + "grad_norm": 1.3533277868948665, + "learning_rate": 4.046375147789278e-07, + "loss": 0.262, + "step": 30165 + }, + { + "epoch": 0.87, + "grad_norm": 1.514439182405154, + "learning_rate": 4.0445242867299395e-07, + "loss": 0.2859, + "step": 30166 + }, + { + "epoch": 0.87, + "grad_norm": 1.3884761903358644, + "learning_rate": 4.0426738312281834e-07, + "loss": 0.2626, + "step": 30167 + }, + { + "epoch": 0.88, + "grad_norm": 1.4251091931732072, + "learning_rate": 4.040823781300346e-07, + "loss": 0.266, + "step": 30168 + }, + { + "epoch": 0.88, + "grad_norm": 5.434611985718689, + "learning_rate": 4.038974136962742e-07, + "loss": 0.2677, + "step": 30169 + }, + { + "epoch": 0.88, + "grad_norm": 2.6384681096385645, + "learning_rate": 4.037124898231715e-07, + "loss": 0.28, + "step": 30170 + }, + { + "epoch": 0.88, + "grad_norm": 1.3191504833954466, + "learning_rate": 4.035276065123556e-07, + "loss": 0.2653, + "step": 30171 + }, + { + "epoch": 0.88, + "grad_norm": 1.5552890584397305, + "learning_rate": 4.033427637654602e-07, + "loss": 0.257, + "step": 30172 + }, + { + "epoch": 0.88, + "grad_norm": 1.5193463067389192, + "learning_rate": 4.031579615841152e-07, + "loss": 0.2724, + "step": 30173 + }, + { + "epoch": 0.88, + "grad_norm": 1.2065022255126958, + "learning_rate": 4.0297319996995265e-07, + "loss": 0.247, + "step": 30174 + }, + { + "epoch": 0.88, + "grad_norm": 1.3844387616266622, + "learning_rate": 4.027884789246028e-07, + "loss": 0.28, + "step": 30175 + }, + { + "epoch": 0.88, + "grad_norm": 2.4123736756648233, + "learning_rate": 4.02603798449695e-07, + "loss": 0.2705, + "step": 30176 + }, + { + "epoch": 0.88, + "grad_norm": 1.4419129938648523, + "learning_rate": 4.024191585468601e-07, + "loss": 0.2677, + "step": 30177 + }, + { + "epoch": 0.88, + "grad_norm": 1.6772155289789223, + "learning_rate": 4.022345592177279e-07, + "loss": 0.2652, + "step": 30178 + }, + { + "epoch": 0.88, + "grad_norm": 1.2660606497300884, + "learning_rate": 4.0205000046392606e-07, + "loss": 0.2584, + "step": 30179 + }, + { + "epoch": 0.88, + "grad_norm": 1.597287280715559, + "learning_rate": 4.0186548228708377e-07, + "loss": 0.2669, + "step": 30180 + }, + { + "epoch": 0.88, + "grad_norm": 1.2297223377649504, + "learning_rate": 4.0168100468882977e-07, + "loss": 0.286, + "step": 30181 + }, + { + "epoch": 0.88, + "grad_norm": 1.4097322448081315, + "learning_rate": 4.0149656767079157e-07, + "loss": 0.2723, + "step": 30182 + }, + { + "epoch": 0.88, + "grad_norm": 1.2645721765888467, + "learning_rate": 4.0131217123459745e-07, + "loss": 0.2597, + "step": 30183 + }, + { + "epoch": 0.88, + "grad_norm": 1.3249628103125413, + "learning_rate": 4.011278153818743e-07, + "loss": 0.2671, + "step": 30184 + }, + { + "epoch": 0.88, + "grad_norm": 1.9300932884137347, + "learning_rate": 4.009435001142492e-07, + "loss": 0.2775, + "step": 30185 + }, + { + "epoch": 0.88, + "grad_norm": 0.8830079780500499, + "learning_rate": 4.0075922543334866e-07, + "loss": 0.5046, + "step": 30186 + }, + { + "epoch": 0.88, + "grad_norm": 1.4975480289258296, + "learning_rate": 4.0057499134080026e-07, + "loss": 0.262, + "step": 30187 + }, + { + "epoch": 0.88, + "grad_norm": 1.2807282706639374, + "learning_rate": 4.0039079783822765e-07, + "loss": 0.256, + "step": 30188 + }, + { + "epoch": 0.88, + "grad_norm": 1.2298624041740929, + "learning_rate": 4.002066449272568e-07, + "loss": 0.2674, + "step": 30189 + }, + { + "epoch": 0.88, + "grad_norm": 1.380139669499502, + "learning_rate": 4.000225326095136e-07, + "loss": 0.261, + "step": 30190 + }, + { + "epoch": 0.88, + "grad_norm": 1.3655028262673028, + "learning_rate": 3.9983846088662235e-07, + "loss": 0.3062, + "step": 30191 + }, + { + "epoch": 0.88, + "grad_norm": 1.2575457709920828, + "learning_rate": 3.99654429760209e-07, + "loss": 0.2767, + "step": 30192 + }, + { + "epoch": 0.88, + "grad_norm": 1.843790774413122, + "learning_rate": 3.9947043923189546e-07, + "loss": 0.2516, + "step": 30193 + }, + { + "epoch": 0.88, + "grad_norm": 1.3048494957378887, + "learning_rate": 3.9928648930330603e-07, + "loss": 0.2693, + "step": 30194 + }, + { + "epoch": 0.88, + "grad_norm": 1.8776743192654897, + "learning_rate": 3.991025799760645e-07, + "loss": 0.2672, + "step": 30195 + }, + { + "epoch": 0.88, + "grad_norm": 1.3351629845263544, + "learning_rate": 3.9891871125179395e-07, + "loss": 0.2594, + "step": 30196 + }, + { + "epoch": 0.88, + "grad_norm": 1.6266239888012362, + "learning_rate": 3.987348831321175e-07, + "loss": 0.2416, + "step": 30197 + }, + { + "epoch": 0.88, + "grad_norm": 1.3125043541281385, + "learning_rate": 3.985510956186556e-07, + "loss": 0.2853, + "step": 30198 + }, + { + "epoch": 0.88, + "grad_norm": 1.2944797560818824, + "learning_rate": 3.983673487130313e-07, + "loss": 0.2659, + "step": 30199 + }, + { + "epoch": 0.88, + "grad_norm": 1.2425412523081243, + "learning_rate": 3.981836424168667e-07, + "loss": 0.2556, + "step": 30200 + }, + { + "epoch": 0.88, + "grad_norm": 1.3007051089687134, + "learning_rate": 3.9799997673178216e-07, + "loss": 0.2666, + "step": 30201 + }, + { + "epoch": 0.88, + "grad_norm": 0.9411107329628892, + "learning_rate": 3.9781635165939916e-07, + "loss": 0.5235, + "step": 30202 + }, + { + "epoch": 0.88, + "grad_norm": 1.6341839411986856, + "learning_rate": 3.976327672013375e-07, + "loss": 0.2831, + "step": 30203 + }, + { + "epoch": 0.88, + "grad_norm": 1.4332908806112923, + "learning_rate": 3.974492233592181e-07, + "loss": 0.2625, + "step": 30204 + }, + { + "epoch": 0.88, + "grad_norm": 1.3927291073055186, + "learning_rate": 3.9726572013466023e-07, + "loss": 0.2832, + "step": 30205 + }, + { + "epoch": 0.88, + "grad_norm": 1.4730883153056844, + "learning_rate": 3.970822575292843e-07, + "loss": 0.2803, + "step": 30206 + }, + { + "epoch": 0.88, + "grad_norm": 1.333328289918658, + "learning_rate": 3.9689883554470785e-07, + "loss": 0.2678, + "step": 30207 + }, + { + "epoch": 0.88, + "grad_norm": 1.4328436303814287, + "learning_rate": 3.967154541825502e-07, + "loss": 0.2776, + "step": 30208 + }, + { + "epoch": 0.88, + "grad_norm": 1.4655836921028071, + "learning_rate": 3.9653211344442946e-07, + "loss": 0.2637, + "step": 30209 + }, + { + "epoch": 0.88, + "grad_norm": 1.3765530326319833, + "learning_rate": 3.963488133319637e-07, + "loss": 0.2536, + "step": 30210 + }, + { + "epoch": 0.88, + "grad_norm": 1.3506660758201479, + "learning_rate": 3.9616555384677126e-07, + "loss": 0.274, + "step": 30211 + }, + { + "epoch": 0.88, + "grad_norm": 1.2941067331363045, + "learning_rate": 3.9598233499046847e-07, + "loss": 0.2713, + "step": 30212 + }, + { + "epoch": 0.88, + "grad_norm": 1.414449884496783, + "learning_rate": 3.9579915676467296e-07, + "loss": 0.2878, + "step": 30213 + }, + { + "epoch": 0.88, + "grad_norm": 1.224852048561316, + "learning_rate": 3.9561601917100177e-07, + "loss": 0.2784, + "step": 30214 + }, + { + "epoch": 0.88, + "grad_norm": 1.5923044366111156, + "learning_rate": 3.9543292221106923e-07, + "loss": 0.2873, + "step": 30215 + }, + { + "epoch": 0.88, + "grad_norm": 1.6006166566274345, + "learning_rate": 3.952498658864923e-07, + "loss": 0.2708, + "step": 30216 + }, + { + "epoch": 0.88, + "grad_norm": 1.4669613507670305, + "learning_rate": 3.9506685019888637e-07, + "loss": 0.2679, + "step": 30217 + }, + { + "epoch": 0.88, + "grad_norm": 0.8869747531721995, + "learning_rate": 3.9488387514986684e-07, + "loss": 0.5439, + "step": 30218 + }, + { + "epoch": 0.88, + "grad_norm": 1.2918324939244068, + "learning_rate": 3.947009407410479e-07, + "loss": 0.2734, + "step": 30219 + }, + { + "epoch": 0.88, + "grad_norm": 1.2800172157468652, + "learning_rate": 3.9451804697404503e-07, + "loss": 0.2696, + "step": 30220 + }, + { + "epoch": 0.88, + "grad_norm": 1.3696290233333719, + "learning_rate": 3.943351938504708e-07, + "loss": 0.2812, + "step": 30221 + }, + { + "epoch": 0.88, + "grad_norm": 1.9492341716842945, + "learning_rate": 3.9415238137194003e-07, + "loss": 0.2783, + "step": 30222 + }, + { + "epoch": 0.88, + "grad_norm": 1.2723278407785081, + "learning_rate": 3.939696095400647e-07, + "loss": 0.2659, + "step": 30223 + }, + { + "epoch": 0.88, + "grad_norm": 1.3885280380318896, + "learning_rate": 3.9378687835645914e-07, + "loss": 0.2553, + "step": 30224 + }, + { + "epoch": 0.88, + "grad_norm": 1.3982360952930584, + "learning_rate": 3.9360418782273647e-07, + "loss": 0.2875, + "step": 30225 + }, + { + "epoch": 0.88, + "grad_norm": 1.3031150517000853, + "learning_rate": 3.9342153794050706e-07, + "loss": 0.2623, + "step": 30226 + }, + { + "epoch": 0.88, + "grad_norm": 1.365798037962306, + "learning_rate": 3.932389287113836e-07, + "loss": 0.2563, + "step": 30227 + }, + { + "epoch": 0.88, + "grad_norm": 1.5971876000882652, + "learning_rate": 3.9305636013697743e-07, + "loss": 0.2771, + "step": 30228 + }, + { + "epoch": 0.88, + "grad_norm": 1.5658655347963986, + "learning_rate": 3.928738322189002e-07, + "loss": 0.2726, + "step": 30229 + }, + { + "epoch": 0.88, + "grad_norm": 1.4195199200947994, + "learning_rate": 3.926913449587627e-07, + "loss": 0.2797, + "step": 30230 + }, + { + "epoch": 0.88, + "grad_norm": 1.4762491312385666, + "learning_rate": 3.9250889835817485e-07, + "loss": 0.2975, + "step": 30231 + }, + { + "epoch": 0.88, + "grad_norm": 1.2676131196821059, + "learning_rate": 3.92326492418747e-07, + "loss": 0.263, + "step": 30232 + }, + { + "epoch": 0.88, + "grad_norm": 1.3637096682538155, + "learning_rate": 3.9214412714209005e-07, + "loss": 0.2668, + "step": 30233 + }, + { + "epoch": 0.88, + "grad_norm": 1.3975956492064787, + "learning_rate": 3.9196180252981164e-07, + "loss": 0.2782, + "step": 30234 + }, + { + "epoch": 0.88, + "grad_norm": 1.3098577269899445, + "learning_rate": 3.9177951858352103e-07, + "loss": 0.2752, + "step": 30235 + }, + { + "epoch": 0.88, + "grad_norm": 1.3454924048485888, + "learning_rate": 3.9159727530482805e-07, + "loss": 0.2831, + "step": 30236 + }, + { + "epoch": 0.88, + "grad_norm": 1.7149855885097176, + "learning_rate": 3.914150726953392e-07, + "loss": 0.322, + "step": 30237 + }, + { + "epoch": 0.88, + "grad_norm": 1.2842123427131211, + "learning_rate": 3.9123291075666423e-07, + "loss": 0.2563, + "step": 30238 + }, + { + "epoch": 0.88, + "grad_norm": 1.4561090516827488, + "learning_rate": 3.9105078949040974e-07, + "loss": 0.2715, + "step": 30239 + }, + { + "epoch": 0.88, + "grad_norm": 1.99536263990717, + "learning_rate": 3.9086870889818276e-07, + "loss": 0.2731, + "step": 30240 + }, + { + "epoch": 0.88, + "grad_norm": 1.2621262481339401, + "learning_rate": 3.906866689815914e-07, + "loss": 0.2686, + "step": 30241 + }, + { + "epoch": 0.88, + "grad_norm": 1.7267851496069397, + "learning_rate": 3.9050466974224153e-07, + "loss": 0.2672, + "step": 30242 + }, + { + "epoch": 0.88, + "grad_norm": 1.3011818290036867, + "learning_rate": 3.903227111817381e-07, + "loss": 0.2618, + "step": 30243 + }, + { + "epoch": 0.88, + "grad_norm": 1.3688978658091562, + "learning_rate": 3.901407933016882e-07, + "loss": 0.2584, + "step": 30244 + }, + { + "epoch": 0.88, + "grad_norm": 1.221937348617278, + "learning_rate": 3.8995891610369707e-07, + "loss": 0.2647, + "step": 30245 + }, + { + "epoch": 0.88, + "grad_norm": 1.2200273963923993, + "learning_rate": 3.8977707958936963e-07, + "loss": 0.2736, + "step": 30246 + }, + { + "epoch": 0.88, + "grad_norm": 1.3828387932690187, + "learning_rate": 3.895952837603101e-07, + "loss": 0.2694, + "step": 30247 + }, + { + "epoch": 0.88, + "grad_norm": 1.8492609733624936, + "learning_rate": 3.8941352861812444e-07, + "loss": 0.2628, + "step": 30248 + }, + { + "epoch": 0.88, + "grad_norm": 2.4319519950662833, + "learning_rate": 3.8923181416441467e-07, + "loss": 0.2762, + "step": 30249 + }, + { + "epoch": 0.88, + "grad_norm": 2.045091794149814, + "learning_rate": 3.890501404007857e-07, + "loss": 0.2798, + "step": 30250 + }, + { + "epoch": 0.88, + "grad_norm": 1.279366588812291, + "learning_rate": 3.8886850732884005e-07, + "loss": 0.2761, + "step": 30251 + }, + { + "epoch": 0.88, + "grad_norm": 1.742090103729635, + "learning_rate": 3.886869149501821e-07, + "loss": 0.2456, + "step": 30252 + }, + { + "epoch": 0.88, + "grad_norm": 1.519998195528188, + "learning_rate": 3.885053632664121e-07, + "loss": 0.2653, + "step": 30253 + }, + { + "epoch": 0.88, + "grad_norm": 1.272352112651303, + "learning_rate": 3.8832385227913327e-07, + "loss": 0.2814, + "step": 30254 + }, + { + "epoch": 0.88, + "grad_norm": 1.394389179334717, + "learning_rate": 3.8814238198994767e-07, + "loss": 0.2778, + "step": 30255 + }, + { + "epoch": 0.88, + "grad_norm": 1.372881108148481, + "learning_rate": 3.8796095240045685e-07, + "loss": 0.2552, + "step": 30256 + }, + { + "epoch": 0.88, + "grad_norm": 1.583902816554531, + "learning_rate": 3.877795635122622e-07, + "loss": 0.2746, + "step": 30257 + }, + { + "epoch": 0.88, + "grad_norm": 2.0505553270543446, + "learning_rate": 3.875982153269636e-07, + "loss": 0.2726, + "step": 30258 + }, + { + "epoch": 0.88, + "grad_norm": 1.377306162097496, + "learning_rate": 3.8741690784616205e-07, + "loss": 0.2687, + "step": 30259 + }, + { + "epoch": 0.88, + "grad_norm": 1.406570957265944, + "learning_rate": 3.8723564107145727e-07, + "loss": 0.285, + "step": 30260 + }, + { + "epoch": 0.88, + "grad_norm": 1.336622811002982, + "learning_rate": 3.870544150044503e-07, + "loss": 0.2693, + "step": 30261 + }, + { + "epoch": 0.88, + "grad_norm": 1.7007283413201821, + "learning_rate": 3.868732296467381e-07, + "loss": 0.2809, + "step": 30262 + }, + { + "epoch": 0.88, + "grad_norm": 1.4289188558722337, + "learning_rate": 3.8669208499992114e-07, + "loss": 0.255, + "step": 30263 + }, + { + "epoch": 0.88, + "grad_norm": 1.5761357386514316, + "learning_rate": 3.865109810655976e-07, + "loss": 0.273, + "step": 30264 + }, + { + "epoch": 0.88, + "grad_norm": 1.4914186275736696, + "learning_rate": 3.8632991784536557e-07, + "loss": 0.3023, + "step": 30265 + }, + { + "epoch": 0.88, + "grad_norm": 1.5235552245465627, + "learning_rate": 3.861488953408238e-07, + "loss": 0.2732, + "step": 30266 + }, + { + "epoch": 0.88, + "grad_norm": 1.3908901914635823, + "learning_rate": 3.8596791355356876e-07, + "loss": 0.311, + "step": 30267 + }, + { + "epoch": 0.88, + "grad_norm": 1.1751161459672785, + "learning_rate": 3.8578697248519814e-07, + "loss": 0.2591, + "step": 30268 + }, + { + "epoch": 0.88, + "grad_norm": 1.4627498420149434, + "learning_rate": 3.856060721373095e-07, + "loss": 0.2856, + "step": 30269 + }, + { + "epoch": 0.88, + "grad_norm": 1.5189115822586983, + "learning_rate": 3.8542521251149765e-07, + "loss": 0.2551, + "step": 30270 + }, + { + "epoch": 0.88, + "grad_norm": 1.2976520461035208, + "learning_rate": 3.852443936093592e-07, + "loss": 0.2564, + "step": 30271 + }, + { + "epoch": 0.88, + "grad_norm": 1.871551634883083, + "learning_rate": 3.850636154324905e-07, + "loss": 0.2507, + "step": 30272 + }, + { + "epoch": 0.88, + "grad_norm": 1.2710029015005866, + "learning_rate": 3.848828779824865e-07, + "loss": 0.2626, + "step": 30273 + }, + { + "epoch": 0.88, + "grad_norm": 1.3702313721162596, + "learning_rate": 3.847021812609425e-07, + "loss": 0.2806, + "step": 30274 + }, + { + "epoch": 0.88, + "grad_norm": 1.2722781730395643, + "learning_rate": 3.8452152526945286e-07, + "loss": 0.2706, + "step": 30275 + }, + { + "epoch": 0.88, + "grad_norm": 1.369580895808444, + "learning_rate": 3.8434091000961237e-07, + "loss": 0.2667, + "step": 30276 + }, + { + "epoch": 0.88, + "grad_norm": 1.5183231649400761, + "learning_rate": 3.8416033548301426e-07, + "loss": 0.2964, + "step": 30277 + }, + { + "epoch": 0.88, + "grad_norm": 1.2585677346824544, + "learning_rate": 3.8397980169125215e-07, + "loss": 0.2457, + "step": 30278 + }, + { + "epoch": 0.88, + "grad_norm": 1.3137415355537807, + "learning_rate": 3.8379930863591985e-07, + "loss": 0.2825, + "step": 30279 + }, + { + "epoch": 0.88, + "grad_norm": 1.334247826271691, + "learning_rate": 3.8361885631860993e-07, + "loss": 0.2779, + "step": 30280 + }, + { + "epoch": 0.88, + "grad_norm": 1.3329854133820256, + "learning_rate": 3.834384447409145e-07, + "loss": 0.2574, + "step": 30281 + }, + { + "epoch": 0.88, + "grad_norm": 1.4668082468483872, + "learning_rate": 3.8325807390442616e-07, + "loss": 0.2578, + "step": 30282 + }, + { + "epoch": 0.88, + "grad_norm": 1.3156834728839533, + "learning_rate": 3.8307774381073635e-07, + "loss": 0.273, + "step": 30283 + }, + { + "epoch": 0.88, + "grad_norm": 1.765153911293429, + "learning_rate": 3.8289745446143665e-07, + "loss": 0.2714, + "step": 30284 + }, + { + "epoch": 0.88, + "grad_norm": 1.2966287532974121, + "learning_rate": 3.82717205858118e-07, + "loss": 0.2816, + "step": 30285 + }, + { + "epoch": 0.88, + "grad_norm": 1.4398577890594935, + "learning_rate": 3.825369980023713e-07, + "loss": 0.2466, + "step": 30286 + }, + { + "epoch": 0.88, + "grad_norm": 1.4058058826516004, + "learning_rate": 3.8235683089578703e-07, + "loss": 0.2578, + "step": 30287 + }, + { + "epoch": 0.88, + "grad_norm": 1.2834344562915556, + "learning_rate": 3.8217670453995545e-07, + "loss": 0.2652, + "step": 30288 + }, + { + "epoch": 0.88, + "grad_norm": 2.51835539501507, + "learning_rate": 3.8199661893646546e-07, + "loss": 0.2747, + "step": 30289 + }, + { + "epoch": 0.88, + "grad_norm": 1.3325592055190827, + "learning_rate": 3.8181657408690566e-07, + "loss": 0.2993, + "step": 30290 + }, + { + "epoch": 0.88, + "grad_norm": 1.7437619599082854, + "learning_rate": 3.8163656999286647e-07, + "loss": 0.2514, + "step": 30291 + }, + { + "epoch": 0.88, + "grad_norm": 1.3894533227985406, + "learning_rate": 3.8145660665593554e-07, + "loss": 0.2594, + "step": 30292 + }, + { + "epoch": 0.88, + "grad_norm": 1.2996768463489674, + "learning_rate": 3.812766840777016e-07, + "loss": 0.271, + "step": 30293 + }, + { + "epoch": 0.88, + "grad_norm": 1.382740678896396, + "learning_rate": 3.8109680225975164e-07, + "loss": 0.2763, + "step": 30294 + }, + { + "epoch": 0.88, + "grad_norm": 1.5321096397915248, + "learning_rate": 3.8091696120367393e-07, + "loss": 0.2712, + "step": 30295 + }, + { + "epoch": 0.88, + "grad_norm": 1.7178195588107903, + "learning_rate": 3.807371609110555e-07, + "loss": 0.2696, + "step": 30296 + }, + { + "epoch": 0.88, + "grad_norm": 1.41212747208091, + "learning_rate": 3.8055740138348343e-07, + "loss": 0.276, + "step": 30297 + }, + { + "epoch": 0.88, + "grad_norm": 1.6350093237511691, + "learning_rate": 3.803776826225425e-07, + "loss": 0.2862, + "step": 30298 + }, + { + "epoch": 0.88, + "grad_norm": 1.4012374434756005, + "learning_rate": 3.801980046298198e-07, + "loss": 0.2663, + "step": 30299 + }, + { + "epoch": 0.88, + "grad_norm": 1.4125695203060624, + "learning_rate": 3.8001836740690133e-07, + "loss": 0.2594, + "step": 30300 + }, + { + "epoch": 0.88, + "grad_norm": 1.441348077078421, + "learning_rate": 3.7983877095537193e-07, + "loss": 0.2386, + "step": 30301 + }, + { + "epoch": 0.88, + "grad_norm": 1.494648247912362, + "learning_rate": 3.7965921527681636e-07, + "loss": 0.276, + "step": 30302 + }, + { + "epoch": 0.88, + "grad_norm": 1.959379784339386, + "learning_rate": 3.7947970037281957e-07, + "loss": 0.282, + "step": 30303 + }, + { + "epoch": 0.88, + "grad_norm": 1.1978800421376345, + "learning_rate": 3.793002262449669e-07, + "loss": 0.2745, + "step": 30304 + }, + { + "epoch": 0.88, + "grad_norm": 1.8213570687871392, + "learning_rate": 3.7912079289483984e-07, + "loss": 0.2848, + "step": 30305 + }, + { + "epoch": 0.88, + "grad_norm": 1.5667387268263835, + "learning_rate": 3.7894140032402327e-07, + "loss": 0.2713, + "step": 30306 + }, + { + "epoch": 0.88, + "grad_norm": 0.9985168394018782, + "learning_rate": 3.787620485341004e-07, + "loss": 0.5361, + "step": 30307 + }, + { + "epoch": 0.88, + "grad_norm": 1.7196862662015933, + "learning_rate": 3.785827375266532e-07, + "loss": 0.2688, + "step": 30308 + }, + { + "epoch": 0.88, + "grad_norm": 1.2519765930784934, + "learning_rate": 3.7840346730326493e-07, + "loss": 0.2632, + "step": 30309 + }, + { + "epoch": 0.88, + "grad_norm": 1.4362597339931444, + "learning_rate": 3.7822423786551655e-07, + "loss": 0.2823, + "step": 30310 + }, + { + "epoch": 0.88, + "grad_norm": 1.3493663622400676, + "learning_rate": 3.7804504921499116e-07, + "loss": 0.2786, + "step": 30311 + }, + { + "epoch": 0.88, + "grad_norm": 1.3803051754565046, + "learning_rate": 3.7786590135326927e-07, + "loss": 0.2723, + "step": 30312 + }, + { + "epoch": 0.88, + "grad_norm": 1.307367033783307, + "learning_rate": 3.7768679428193233e-07, + "loss": 0.2737, + "step": 30313 + }, + { + "epoch": 0.88, + "grad_norm": 1.4124664765547763, + "learning_rate": 3.7750772800256075e-07, + "loss": 0.2721, + "step": 30314 + }, + { + "epoch": 0.88, + "grad_norm": 1.4390338555922488, + "learning_rate": 3.7732870251673436e-07, + "loss": 0.2524, + "step": 30315 + }, + { + "epoch": 0.88, + "grad_norm": 1.2650244325160451, + "learning_rate": 3.7714971782603414e-07, + "loss": 0.2618, + "step": 30316 + }, + { + "epoch": 0.88, + "grad_norm": 1.226882909669773, + "learning_rate": 3.7697077393203886e-07, + "loss": 0.2653, + "step": 30317 + }, + { + "epoch": 0.88, + "grad_norm": 1.3173987068259583, + "learning_rate": 3.767918708363272e-07, + "loss": 0.2623, + "step": 30318 + }, + { + "epoch": 0.88, + "grad_norm": 1.343752032767657, + "learning_rate": 3.766130085404784e-07, + "loss": 0.2659, + "step": 30319 + }, + { + "epoch": 0.88, + "grad_norm": 1.5153984270507161, + "learning_rate": 3.764341870460714e-07, + "loss": 0.2642, + "step": 30320 + }, + { + "epoch": 0.88, + "grad_norm": 1.301573210515388, + "learning_rate": 3.7625540635468416e-07, + "loss": 0.2817, + "step": 30321 + }, + { + "epoch": 0.88, + "grad_norm": 1.4363734268236645, + "learning_rate": 3.7607666646789385e-07, + "loss": 0.2887, + "step": 30322 + }, + { + "epoch": 0.88, + "grad_norm": 1.4610301960490548, + "learning_rate": 3.7589796738727867e-07, + "loss": 0.2709, + "step": 30323 + }, + { + "epoch": 0.88, + "grad_norm": 1.3304051062728866, + "learning_rate": 3.757193091144162e-07, + "loss": 0.2657, + "step": 30324 + }, + { + "epoch": 0.88, + "grad_norm": 1.2158808217036334, + "learning_rate": 3.755406916508808e-07, + "loss": 0.2696, + "step": 30325 + }, + { + "epoch": 0.88, + "grad_norm": 1.4080579031639922, + "learning_rate": 3.7536211499825e-07, + "loss": 0.2735, + "step": 30326 + }, + { + "epoch": 0.88, + "grad_norm": 1.472175536208311, + "learning_rate": 3.7518357915810046e-07, + "loss": 0.2652, + "step": 30327 + }, + { + "epoch": 0.88, + "grad_norm": 1.4952059864383458, + "learning_rate": 3.750050841320069e-07, + "loss": 0.2805, + "step": 30328 + }, + { + "epoch": 0.88, + "grad_norm": 1.6685184851112702, + "learning_rate": 3.748266299215442e-07, + "loss": 0.2794, + "step": 30329 + }, + { + "epoch": 0.88, + "grad_norm": 1.323470593563246, + "learning_rate": 3.7464821652828833e-07, + "loss": 0.2686, + "step": 30330 + }, + { + "epoch": 0.88, + "grad_norm": 1.4191096470772595, + "learning_rate": 3.7446984395381305e-07, + "loss": 0.304, + "step": 30331 + }, + { + "epoch": 0.88, + "grad_norm": 1.3174507249437997, + "learning_rate": 3.742915121996937e-07, + "loss": 0.2796, + "step": 30332 + }, + { + "epoch": 0.88, + "grad_norm": 1.3826305412581072, + "learning_rate": 3.7411322126750247e-07, + "loss": 0.2737, + "step": 30333 + }, + { + "epoch": 0.88, + "grad_norm": 1.4959554783848417, + "learning_rate": 3.7393497115881306e-07, + "loss": 0.2631, + "step": 30334 + }, + { + "epoch": 0.88, + "grad_norm": 1.460220274689965, + "learning_rate": 3.7375676187519974e-07, + "loss": 0.2668, + "step": 30335 + }, + { + "epoch": 0.88, + "grad_norm": 1.2933917414034597, + "learning_rate": 3.7357859341823344e-07, + "loss": 0.2719, + "step": 30336 + }, + { + "epoch": 0.88, + "grad_norm": 1.41578093808326, + "learning_rate": 3.734004657894874e-07, + "loss": 0.2769, + "step": 30337 + }, + { + "epoch": 0.88, + "grad_norm": 2.021869771519652, + "learning_rate": 3.732223789905337e-07, + "loss": 0.28, + "step": 30338 + }, + { + "epoch": 0.88, + "grad_norm": 1.443543209207895, + "learning_rate": 3.7304433302294385e-07, + "loss": 0.2989, + "step": 30339 + }, + { + "epoch": 0.88, + "grad_norm": 1.3842569656547434, + "learning_rate": 3.728663278882888e-07, + "loss": 0.2976, + "step": 30340 + }, + { + "epoch": 0.88, + "grad_norm": 1.4001137301217488, + "learning_rate": 3.726883635881401e-07, + "loss": 0.2797, + "step": 30341 + }, + { + "epoch": 0.88, + "grad_norm": 1.384204391475426, + "learning_rate": 3.7251044012406756e-07, + "loss": 0.2581, + "step": 30342 + }, + { + "epoch": 0.88, + "grad_norm": 1.3294026103610634, + "learning_rate": 3.723325574976427e-07, + "loss": 0.2675, + "step": 30343 + }, + { + "epoch": 0.88, + "grad_norm": 1.4573077474634046, + "learning_rate": 3.721547157104338e-07, + "loss": 0.2669, + "step": 30344 + }, + { + "epoch": 0.88, + "grad_norm": 1.2593010142042775, + "learning_rate": 3.7197691476401064e-07, + "loss": 0.2636, + "step": 30345 + }, + { + "epoch": 0.88, + "grad_norm": 1.3410655692070446, + "learning_rate": 3.717991546599425e-07, + "loss": 0.2694, + "step": 30346 + }, + { + "epoch": 0.88, + "grad_norm": 1.2384080039743466, + "learning_rate": 3.7162143539979823e-07, + "loss": 0.2562, + "step": 30347 + }, + { + "epoch": 0.88, + "grad_norm": 1.5391181798709967, + "learning_rate": 3.714437569851459e-07, + "loss": 0.2705, + "step": 30348 + }, + { + "epoch": 0.88, + "grad_norm": 1.5163594163090661, + "learning_rate": 3.712661194175543e-07, + "loss": 0.2803, + "step": 30349 + }, + { + "epoch": 0.88, + "grad_norm": 1.6202077469509375, + "learning_rate": 3.7108852269858996e-07, + "loss": 0.262, + "step": 30350 + }, + { + "epoch": 0.88, + "grad_norm": 1.5757354382577697, + "learning_rate": 3.7091096682982107e-07, + "loss": 0.2799, + "step": 30351 + }, + { + "epoch": 0.88, + "grad_norm": 1.4335246835227509, + "learning_rate": 3.7073345181281473e-07, + "loss": 0.2733, + "step": 30352 + }, + { + "epoch": 0.88, + "grad_norm": 1.3438803890665256, + "learning_rate": 3.7055597764913685e-07, + "loss": 0.3041, + "step": 30353 + }, + { + "epoch": 0.88, + "grad_norm": 1.8833756149879428, + "learning_rate": 3.703785443403535e-07, + "loss": 0.3043, + "step": 30354 + }, + { + "epoch": 0.88, + "grad_norm": 1.3049827669124705, + "learning_rate": 3.702011518880305e-07, + "loss": 0.2579, + "step": 30355 + }, + { + "epoch": 0.88, + "grad_norm": 1.361300164932123, + "learning_rate": 3.70023800293734e-07, + "loss": 0.275, + "step": 30356 + }, + { + "epoch": 0.88, + "grad_norm": 1.3069812850576839, + "learning_rate": 3.6984648955902927e-07, + "loss": 0.286, + "step": 30357 + }, + { + "epoch": 0.88, + "grad_norm": 1.3392002757733472, + "learning_rate": 3.696692196854801e-07, + "loss": 0.2798, + "step": 30358 + }, + { + "epoch": 0.88, + "grad_norm": 1.8324151592062266, + "learning_rate": 3.6949199067465134e-07, + "loss": 0.2628, + "step": 30359 + }, + { + "epoch": 0.88, + "grad_norm": 1.4701661210286714, + "learning_rate": 3.693148025281085e-07, + "loss": 0.2849, + "step": 30360 + }, + { + "epoch": 0.88, + "grad_norm": 1.6486201102781208, + "learning_rate": 3.691376552474124e-07, + "loss": 0.2908, + "step": 30361 + }, + { + "epoch": 0.88, + "grad_norm": 1.504372130162973, + "learning_rate": 3.689605488341291e-07, + "loss": 0.266, + "step": 30362 + }, + { + "epoch": 0.88, + "grad_norm": 1.5139936698824719, + "learning_rate": 3.687834832898196e-07, + "loss": 0.2876, + "step": 30363 + }, + { + "epoch": 0.88, + "grad_norm": 1.4067463315312516, + "learning_rate": 3.6860645861604704e-07, + "loss": 0.281, + "step": 30364 + }, + { + "epoch": 0.88, + "grad_norm": 1.4252241503468634, + "learning_rate": 3.684294748143735e-07, + "loss": 0.2511, + "step": 30365 + }, + { + "epoch": 0.88, + "grad_norm": 1.7667383937899084, + "learning_rate": 3.682525318863617e-07, + "loss": 0.2769, + "step": 30366 + }, + { + "epoch": 0.88, + "grad_norm": 1.4492907583832273, + "learning_rate": 3.6807562983357245e-07, + "loss": 0.277, + "step": 30367 + }, + { + "epoch": 0.88, + "grad_norm": 1.2332277864036445, + "learning_rate": 3.6789876865756744e-07, + "loss": 0.2598, + "step": 30368 + }, + { + "epoch": 0.88, + "grad_norm": 1.3043568604213964, + "learning_rate": 3.6772194835990704e-07, + "loss": 0.2652, + "step": 30369 + }, + { + "epoch": 0.88, + "grad_norm": 1.014947599354737, + "learning_rate": 3.6754516894215165e-07, + "loss": 0.5508, + "step": 30370 + }, + { + "epoch": 0.88, + "grad_norm": 1.4161506656351555, + "learning_rate": 3.673684304058628e-07, + "loss": 0.2542, + "step": 30371 + }, + { + "epoch": 0.88, + "grad_norm": 1.3183881446373886, + "learning_rate": 3.671917327525981e-07, + "loss": 0.2679, + "step": 30372 + }, + { + "epoch": 0.88, + "grad_norm": 1.3801788183288721, + "learning_rate": 3.670150759839175e-07, + "loss": 0.3015, + "step": 30373 + }, + { + "epoch": 0.88, + "grad_norm": 0.9876371151373241, + "learning_rate": 3.6683846010138026e-07, + "loss": 0.6119, + "step": 30374 + }, + { + "epoch": 0.88, + "grad_norm": 1.6189613622941497, + "learning_rate": 3.6666188510654455e-07, + "loss": 0.2819, + "step": 30375 + }, + { + "epoch": 0.88, + "grad_norm": 1.3380770920346403, + "learning_rate": 3.664853510009697e-07, + "loss": 0.2827, + "step": 30376 + }, + { + "epoch": 0.88, + "grad_norm": 0.9499724349052562, + "learning_rate": 3.6630885778621284e-07, + "loss": 0.5873, + "step": 30377 + }, + { + "epoch": 0.88, + "grad_norm": 1.4365841255291578, + "learning_rate": 3.661324054638321e-07, + "loss": 0.2983, + "step": 30378 + }, + { + "epoch": 0.88, + "grad_norm": 2.0088145054051645, + "learning_rate": 3.6595599403538465e-07, + "loss": 0.2924, + "step": 30379 + }, + { + "epoch": 0.88, + "grad_norm": 1.4884797171777553, + "learning_rate": 3.6577962350242637e-07, + "loss": 0.2461, + "step": 30380 + }, + { + "epoch": 0.88, + "grad_norm": 1.2570428655134467, + "learning_rate": 3.6560329386651383e-07, + "loss": 0.2599, + "step": 30381 + }, + { + "epoch": 0.88, + "grad_norm": 3.137526914751395, + "learning_rate": 3.6542700512920413e-07, + "loss": 0.2838, + "step": 30382 + }, + { + "epoch": 0.88, + "grad_norm": 1.3514303465059216, + "learning_rate": 3.6525075729205274e-07, + "loss": 0.2965, + "step": 30383 + }, + { + "epoch": 0.88, + "grad_norm": 1.276960982152016, + "learning_rate": 3.6507455035661444e-07, + "loss": 0.272, + "step": 30384 + }, + { + "epoch": 0.88, + "grad_norm": 1.429195062086981, + "learning_rate": 3.648983843244447e-07, + "loss": 0.2762, + "step": 30385 + }, + { + "epoch": 0.88, + "grad_norm": 1.3924330993251133, + "learning_rate": 3.6472225919709783e-07, + "loss": 0.2535, + "step": 30386 + }, + { + "epoch": 0.88, + "grad_norm": 1.5814168726174436, + "learning_rate": 3.6454617497612867e-07, + "loss": 0.2896, + "step": 30387 + }, + { + "epoch": 0.88, + "grad_norm": 1.3150458942535357, + "learning_rate": 3.643701316630921e-07, + "loss": 0.2669, + "step": 30388 + }, + { + "epoch": 0.88, + "grad_norm": 1.4459380344149035, + "learning_rate": 3.6419412925953966e-07, + "loss": 0.2478, + "step": 30389 + }, + { + "epoch": 0.88, + "grad_norm": 1.2687267407302816, + "learning_rate": 3.640181677670257e-07, + "loss": 0.2597, + "step": 30390 + }, + { + "epoch": 0.88, + "grad_norm": 1.3375984187939907, + "learning_rate": 3.638422471871028e-07, + "loss": 0.2887, + "step": 30391 + }, + { + "epoch": 0.88, + "grad_norm": 1.435201011944133, + "learning_rate": 3.6366636752132313e-07, + "loss": 0.2957, + "step": 30392 + }, + { + "epoch": 0.88, + "grad_norm": 1.4444622070703539, + "learning_rate": 3.634905287712387e-07, + "loss": 0.2714, + "step": 30393 + }, + { + "epoch": 0.88, + "grad_norm": 1.5182869690149206, + "learning_rate": 3.633147309384022e-07, + "loss": 0.2755, + "step": 30394 + }, + { + "epoch": 0.88, + "grad_norm": 1.316593951510596, + "learning_rate": 3.6313897402436516e-07, + "loss": 0.2704, + "step": 30395 + }, + { + "epoch": 0.88, + "grad_norm": 1.6781735410625551, + "learning_rate": 3.6296325803067744e-07, + "loss": 0.2791, + "step": 30396 + }, + { + "epoch": 0.88, + "grad_norm": 1.4700104062847517, + "learning_rate": 3.627875829588906e-07, + "loss": 0.282, + "step": 30397 + }, + { + "epoch": 0.88, + "grad_norm": 1.2410659981990038, + "learning_rate": 3.626119488105556e-07, + "loss": 0.2708, + "step": 30398 + }, + { + "epoch": 0.88, + "grad_norm": 1.2968847293846184, + "learning_rate": 3.624363555872207e-07, + "loss": 0.2667, + "step": 30399 + }, + { + "epoch": 0.88, + "grad_norm": 1.2797039173784743, + "learning_rate": 3.6226080329043676e-07, + "loss": 0.2666, + "step": 30400 + }, + { + "epoch": 0.88, + "grad_norm": 1.3336565172114434, + "learning_rate": 3.620852919217521e-07, + "loss": 0.2924, + "step": 30401 + }, + { + "epoch": 0.88, + "grad_norm": 2.496314766848555, + "learning_rate": 3.619098214827166e-07, + "loss": 0.2644, + "step": 30402 + }, + { + "epoch": 0.88, + "grad_norm": 1.3305364478045807, + "learning_rate": 3.617343919748784e-07, + "loss": 0.2642, + "step": 30403 + }, + { + "epoch": 0.88, + "grad_norm": 1.3994604356324072, + "learning_rate": 3.615590033997857e-07, + "loss": 0.2879, + "step": 30404 + }, + { + "epoch": 0.88, + "grad_norm": 1.3401606046748513, + "learning_rate": 3.6138365575898614e-07, + "loss": 0.2764, + "step": 30405 + }, + { + "epoch": 0.88, + "grad_norm": 1.211963656738346, + "learning_rate": 3.612083490540269e-07, + "loss": 0.2575, + "step": 30406 + }, + { + "epoch": 0.88, + "grad_norm": 1.8216067887880298, + "learning_rate": 3.610330832864567e-07, + "loss": 0.2792, + "step": 30407 + }, + { + "epoch": 0.88, + "grad_norm": 1.2541142991482825, + "learning_rate": 3.608578584578204e-07, + "loss": 0.2559, + "step": 30408 + }, + { + "epoch": 0.88, + "grad_norm": 1.3374556669353999, + "learning_rate": 3.6068267456966453e-07, + "loss": 0.2569, + "step": 30409 + }, + { + "epoch": 0.88, + "grad_norm": 1.671274776240723, + "learning_rate": 3.605075316235357e-07, + "loss": 0.2752, + "step": 30410 + }, + { + "epoch": 0.88, + "grad_norm": 1.7677448617290175, + "learning_rate": 3.6033242962097925e-07, + "loss": 0.2761, + "step": 30411 + }, + { + "epoch": 0.88, + "grad_norm": 1.3310871684396803, + "learning_rate": 3.6015736856354065e-07, + "loss": 0.2705, + "step": 30412 + }, + { + "epoch": 0.88, + "grad_norm": 1.4050329259467997, + "learning_rate": 3.599823484527643e-07, + "loss": 0.2624, + "step": 30413 + }, + { + "epoch": 0.88, + "grad_norm": 1.2502174861487658, + "learning_rate": 3.59807369290196e-07, + "loss": 0.255, + "step": 30414 + }, + { + "epoch": 0.88, + "grad_norm": 1.421033757557846, + "learning_rate": 3.5963243107737966e-07, + "loss": 0.2711, + "step": 30415 + }, + { + "epoch": 0.88, + "grad_norm": 1.7189759021473814, + "learning_rate": 3.5945753381585733e-07, + "loss": 0.2667, + "step": 30416 + }, + { + "epoch": 0.88, + "grad_norm": 1.2386425567011434, + "learning_rate": 3.5928267750717505e-07, + "loss": 0.2608, + "step": 30417 + }, + { + "epoch": 0.88, + "grad_norm": 1.5129150221473755, + "learning_rate": 3.5910786215287374e-07, + "loss": 0.2784, + "step": 30418 + }, + { + "epoch": 0.88, + "grad_norm": 1.361706654893134, + "learning_rate": 3.5893308775449664e-07, + "loss": 0.2763, + "step": 30419 + }, + { + "epoch": 0.88, + "grad_norm": 1.5589137113440634, + "learning_rate": 3.58758354313587e-07, + "loss": 0.2954, + "step": 30420 + }, + { + "epoch": 0.88, + "grad_norm": 1.424004251290386, + "learning_rate": 3.5858366183168626e-07, + "loss": 0.305, + "step": 30421 + }, + { + "epoch": 0.88, + "grad_norm": 2.4380319009770557, + "learning_rate": 3.5840901031033604e-07, + "loss": 0.2807, + "step": 30422 + }, + { + "epoch": 0.88, + "grad_norm": 1.4752419831498458, + "learning_rate": 3.582343997510779e-07, + "loss": 0.2509, + "step": 30423 + }, + { + "epoch": 0.88, + "grad_norm": 1.4681923007202142, + "learning_rate": 3.5805983015545276e-07, + "loss": 0.2557, + "step": 30424 + }, + { + "epoch": 0.88, + "grad_norm": 1.4860045273849563, + "learning_rate": 3.578853015250011e-07, + "loss": 0.3091, + "step": 30425 + }, + { + "epoch": 0.88, + "grad_norm": 1.3277137787815216, + "learning_rate": 3.577108138612639e-07, + "loss": 0.294, + "step": 30426 + }, + { + "epoch": 0.88, + "grad_norm": 1.4841469592560703, + "learning_rate": 3.5753636716577945e-07, + "loss": 0.2692, + "step": 30427 + }, + { + "epoch": 0.88, + "grad_norm": 1.3690222885593062, + "learning_rate": 3.573619614400881e-07, + "loss": 0.2668, + "step": 30428 + }, + { + "epoch": 0.88, + "grad_norm": 1.3294601679459006, + "learning_rate": 3.5718759668572913e-07, + "loss": 0.2456, + "step": 30429 + }, + { + "epoch": 0.88, + "grad_norm": 1.7347925096902033, + "learning_rate": 3.570132729042408e-07, + "loss": 0.289, + "step": 30430 + }, + { + "epoch": 0.88, + "grad_norm": 1.3644414388198969, + "learning_rate": 3.5683899009716193e-07, + "loss": 0.2664, + "step": 30431 + }, + { + "epoch": 0.88, + "grad_norm": 1.3263520306285104, + "learning_rate": 3.566647482660307e-07, + "loss": 0.283, + "step": 30432 + }, + { + "epoch": 0.88, + "grad_norm": 2.36508245714778, + "learning_rate": 3.564905474123842e-07, + "loss": 0.2808, + "step": 30433 + }, + { + "epoch": 0.88, + "grad_norm": 1.5696390116243222, + "learning_rate": 3.5631638753776123e-07, + "loss": 0.3043, + "step": 30434 + }, + { + "epoch": 0.88, + "grad_norm": 1.5638458420952361, + "learning_rate": 3.5614226864369663e-07, + "loss": 0.282, + "step": 30435 + }, + { + "epoch": 0.88, + "grad_norm": 1.7110314691110684, + "learning_rate": 3.559681907317281e-07, + "loss": 0.2968, + "step": 30436 + }, + { + "epoch": 0.88, + "grad_norm": 1.4989321622398952, + "learning_rate": 3.5579415380339213e-07, + "loss": 0.2569, + "step": 30437 + }, + { + "epoch": 0.88, + "grad_norm": 1.2765586606307584, + "learning_rate": 3.556201578602236e-07, + "loss": 0.266, + "step": 30438 + }, + { + "epoch": 0.88, + "grad_norm": 1.3123500807558766, + "learning_rate": 3.554462029037592e-07, + "loss": 0.2963, + "step": 30439 + }, + { + "epoch": 0.88, + "grad_norm": 2.9805674979489045, + "learning_rate": 3.5527228893553367e-07, + "loss": 0.2734, + "step": 30440 + }, + { + "epoch": 0.88, + "grad_norm": 1.447523169480446, + "learning_rate": 3.550984159570814e-07, + "loss": 0.2632, + "step": 30441 + }, + { + "epoch": 0.88, + "grad_norm": 1.6397799712399197, + "learning_rate": 3.549245839699372e-07, + "loss": 0.2522, + "step": 30442 + }, + { + "epoch": 0.88, + "grad_norm": 1.3117574721201088, + "learning_rate": 3.547507929756361e-07, + "loss": 0.2506, + "step": 30443 + }, + { + "epoch": 0.88, + "grad_norm": 0.9584129210750907, + "learning_rate": 3.545770429757095e-07, + "loss": 0.5766, + "step": 30444 + }, + { + "epoch": 0.88, + "grad_norm": 1.5629719401852085, + "learning_rate": 3.5440333397169345e-07, + "loss": 0.267, + "step": 30445 + }, + { + "epoch": 0.88, + "grad_norm": 1.3132275833863798, + "learning_rate": 3.542296659651184e-07, + "loss": 0.3122, + "step": 30446 + }, + { + "epoch": 0.88, + "grad_norm": 1.4249359245873303, + "learning_rate": 3.540560389575182e-07, + "loss": 0.2589, + "step": 30447 + }, + { + "epoch": 0.88, + "grad_norm": 1.436072059713869, + "learning_rate": 3.538824529504248e-07, + "loss": 0.2618, + "step": 30448 + }, + { + "epoch": 0.88, + "grad_norm": 1.4645954030579975, + "learning_rate": 3.5370890794537036e-07, + "loss": 0.2776, + "step": 30449 + }, + { + "epoch": 0.88, + "grad_norm": 1.7002219193788213, + "learning_rate": 3.5353540394388595e-07, + "loss": 0.2767, + "step": 30450 + }, + { + "epoch": 0.88, + "grad_norm": 1.4922784547700008, + "learning_rate": 3.5336194094750363e-07, + "loss": 0.2859, + "step": 30451 + }, + { + "epoch": 0.88, + "grad_norm": 1.2972831358386303, + "learning_rate": 3.5318851895775384e-07, + "loss": 0.2561, + "step": 30452 + }, + { + "epoch": 0.88, + "grad_norm": 1.4480202964893047, + "learning_rate": 3.530151379761676e-07, + "loss": 0.2843, + "step": 30453 + }, + { + "epoch": 0.88, + "grad_norm": 1.3249331828757176, + "learning_rate": 3.5284179800427364e-07, + "loss": 0.2758, + "step": 30454 + }, + { + "epoch": 0.88, + "grad_norm": 1.6148401173892395, + "learning_rate": 3.526684990436019e-07, + "loss": 0.3087, + "step": 30455 + }, + { + "epoch": 0.88, + "grad_norm": 1.6271298675115553, + "learning_rate": 3.524952410956828e-07, + "loss": 0.2596, + "step": 30456 + }, + { + "epoch": 0.88, + "grad_norm": 1.523322909351517, + "learning_rate": 3.5232202416204397e-07, + "loss": 0.2562, + "step": 30457 + }, + { + "epoch": 0.88, + "grad_norm": 1.453778590903578, + "learning_rate": 3.5214884824421536e-07, + "loss": 0.2868, + "step": 30458 + }, + { + "epoch": 0.88, + "grad_norm": 1.390426678428349, + "learning_rate": 3.519757133437246e-07, + "loss": 0.2826, + "step": 30459 + }, + { + "epoch": 0.88, + "grad_norm": 1.78031833091322, + "learning_rate": 3.518026194620999e-07, + "loss": 0.2754, + "step": 30460 + }, + { + "epoch": 0.88, + "grad_norm": 1.3690267811258152, + "learning_rate": 3.5162956660086844e-07, + "loss": 0.2769, + "step": 30461 + }, + { + "epoch": 0.88, + "grad_norm": 1.479165636346022, + "learning_rate": 3.5145655476155895e-07, + "loss": 0.2707, + "step": 30462 + }, + { + "epoch": 0.88, + "grad_norm": 1.6201665262171676, + "learning_rate": 3.5128358394569575e-07, + "loss": 0.2756, + "step": 30463 + }, + { + "epoch": 0.88, + "grad_norm": 1.3022914630549305, + "learning_rate": 3.511106541548065e-07, + "loss": 0.2654, + "step": 30464 + }, + { + "epoch": 0.88, + "grad_norm": 1.3083967505366267, + "learning_rate": 3.509377653904172e-07, + "loss": 0.2653, + "step": 30465 + }, + { + "epoch": 0.88, + "grad_norm": 1.3039249673861377, + "learning_rate": 3.507649176540534e-07, + "loss": 0.2755, + "step": 30466 + }, + { + "epoch": 0.88, + "grad_norm": 1.6715964515718573, + "learning_rate": 3.5059211094724155e-07, + "loss": 0.2712, + "step": 30467 + }, + { + "epoch": 0.88, + "grad_norm": 1.3749865557249201, + "learning_rate": 3.5041934527150544e-07, + "loss": 0.2594, + "step": 30468 + }, + { + "epoch": 0.88, + "grad_norm": 1.4415850920394346, + "learning_rate": 3.5024662062837e-07, + "loss": 0.2918, + "step": 30469 + }, + { + "epoch": 0.88, + "grad_norm": 2.0966094922123704, + "learning_rate": 3.5007393701936063e-07, + "loss": 0.2925, + "step": 30470 + }, + { + "epoch": 0.88, + "grad_norm": 1.6065118715425208, + "learning_rate": 3.499012944460001e-07, + "loss": 0.2809, + "step": 30471 + }, + { + "epoch": 0.88, + "grad_norm": 1.8885252370494685, + "learning_rate": 3.497286929098115e-07, + "loss": 0.2848, + "step": 30472 + }, + { + "epoch": 0.88, + "grad_norm": 1.2998425564321734, + "learning_rate": 3.4955613241231977e-07, + "loss": 0.2874, + "step": 30473 + }, + { + "epoch": 0.88, + "grad_norm": 1.4747087432008934, + "learning_rate": 3.4938361295504595e-07, + "loss": 0.2798, + "step": 30474 + }, + { + "epoch": 0.88, + "grad_norm": 1.3186477961717837, + "learning_rate": 3.4921113453951385e-07, + "loss": 0.2801, + "step": 30475 + }, + { + "epoch": 0.88, + "grad_norm": 1.448206483782142, + "learning_rate": 3.4903869716724437e-07, + "loss": 0.271, + "step": 30476 + }, + { + "epoch": 0.88, + "grad_norm": 1.523046469316306, + "learning_rate": 3.4886630083976027e-07, + "loss": 0.2525, + "step": 30477 + }, + { + "epoch": 0.88, + "grad_norm": 1.4211643448857676, + "learning_rate": 3.48693945558583e-07, + "loss": 0.2548, + "step": 30478 + }, + { + "epoch": 0.88, + "grad_norm": 1.694818132919226, + "learning_rate": 3.4852163132523263e-07, + "loss": 0.2609, + "step": 30479 + }, + { + "epoch": 0.88, + "grad_norm": 1.8038422403771799, + "learning_rate": 3.483493581412317e-07, + "loss": 0.2886, + "step": 30480 + }, + { + "epoch": 0.88, + "grad_norm": 0.984617102469222, + "learning_rate": 3.481771260080985e-07, + "loss": 0.587, + "step": 30481 + }, + { + "epoch": 0.88, + "grad_norm": 1.3608260884746843, + "learning_rate": 3.480049349273534e-07, + "loss": 0.2712, + "step": 30482 + }, + { + "epoch": 0.88, + "grad_norm": 1.3064400677372296, + "learning_rate": 3.47832784900517e-07, + "loss": 0.2508, + "step": 30483 + }, + { + "epoch": 0.88, + "grad_norm": 1.3256864350787423, + "learning_rate": 3.4766067592910736e-07, + "loss": 0.2548, + "step": 30484 + }, + { + "epoch": 0.88, + "grad_norm": 1.2655215243264684, + "learning_rate": 3.474886080146439e-07, + "loss": 0.2686, + "step": 30485 + }, + { + "epoch": 0.88, + "grad_norm": 1.3874879090932712, + "learning_rate": 3.473165811586449e-07, + "loss": 0.2613, + "step": 30486 + }, + { + "epoch": 0.88, + "grad_norm": 1.5237663220022695, + "learning_rate": 3.47144595362629e-07, + "loss": 0.2711, + "step": 30487 + }, + { + "epoch": 0.88, + "grad_norm": 1.3033037774230287, + "learning_rate": 3.46972650628114e-07, + "loss": 0.2702, + "step": 30488 + }, + { + "epoch": 0.88, + "grad_norm": 1.4237159061766853, + "learning_rate": 3.4680074695661706e-07, + "loss": 0.2657, + "step": 30489 + }, + { + "epoch": 0.88, + "grad_norm": 1.47331084375125, + "learning_rate": 3.4662888434965457e-07, + "loss": 0.3121, + "step": 30490 + }, + { + "epoch": 0.88, + "grad_norm": 1.5753503519401244, + "learning_rate": 3.4645706280874433e-07, + "loss": 0.2616, + "step": 30491 + }, + { + "epoch": 0.88, + "grad_norm": 2.362593683454463, + "learning_rate": 3.462852823354013e-07, + "loss": 0.2555, + "step": 30492 + }, + { + "epoch": 0.88, + "grad_norm": 1.3499285266441807, + "learning_rate": 3.4611354293114295e-07, + "loss": 0.2581, + "step": 30493 + }, + { + "epoch": 0.88, + "grad_norm": 1.611581488694263, + "learning_rate": 3.459418445974838e-07, + "loss": 0.2768, + "step": 30494 + }, + { + "epoch": 0.88, + "grad_norm": 1.2756557713368608, + "learning_rate": 3.4577018733593983e-07, + "loss": 0.2563, + "step": 30495 + }, + { + "epoch": 0.88, + "grad_norm": 1.1932158961368866, + "learning_rate": 3.455985711480253e-07, + "loss": 0.2562, + "step": 30496 + }, + { + "epoch": 0.88, + "grad_norm": 1.3965158302003322, + "learning_rate": 3.454269960352552e-07, + "loss": 0.2619, + "step": 30497 + }, + { + "epoch": 0.88, + "grad_norm": 1.308912130037255, + "learning_rate": 3.452554619991444e-07, + "loss": 0.2606, + "step": 30498 + }, + { + "epoch": 0.88, + "grad_norm": 0.9199292091890799, + "learning_rate": 3.4508396904120446e-07, + "loss": 0.5711, + "step": 30499 + }, + { + "epoch": 0.88, + "grad_norm": 1.2589013552733763, + "learning_rate": 3.4491251716295083e-07, + "loss": 0.2546, + "step": 30500 + }, + { + "epoch": 0.88, + "grad_norm": 0.9500286900368162, + "learning_rate": 3.447411063658962e-07, + "loss": 0.4979, + "step": 30501 + }, + { + "epoch": 0.88, + "grad_norm": 1.4300388402319355, + "learning_rate": 3.4456973665155216e-07, + "loss": 0.2714, + "step": 30502 + }, + { + "epoch": 0.88, + "grad_norm": 1.5444853385193786, + "learning_rate": 3.443984080214319e-07, + "loss": 0.2999, + "step": 30503 + }, + { + "epoch": 0.88, + "grad_norm": 1.336520221858218, + "learning_rate": 3.442271204770475e-07, + "loss": 0.2615, + "step": 30504 + }, + { + "epoch": 0.88, + "grad_norm": 2.944477817825845, + "learning_rate": 3.4405587401991013e-07, + "loss": 0.254, + "step": 30505 + }, + { + "epoch": 0.88, + "grad_norm": 1.2980166871353822, + "learning_rate": 3.4388466865153126e-07, + "loss": 0.2615, + "step": 30506 + }, + { + "epoch": 0.88, + "grad_norm": 0.9852962459908093, + "learning_rate": 3.437135043734224e-07, + "loss": 0.5702, + "step": 30507 + }, + { + "epoch": 0.88, + "grad_norm": 1.4053962230685402, + "learning_rate": 3.4354238118709415e-07, + "loss": 0.2853, + "step": 30508 + }, + { + "epoch": 0.88, + "grad_norm": 1.8160735908409442, + "learning_rate": 3.4337129909405463e-07, + "loss": 0.2804, + "step": 30509 + }, + { + "epoch": 0.88, + "grad_norm": 1.3328765286880069, + "learning_rate": 3.4320025809581545e-07, + "loss": 0.2611, + "step": 30510 + }, + { + "epoch": 0.88, + "grad_norm": 1.8781626392654436, + "learning_rate": 3.43029258193886e-07, + "loss": 0.2702, + "step": 30511 + }, + { + "epoch": 0.88, + "grad_norm": 1.568546346320799, + "learning_rate": 3.4285829938977445e-07, + "loss": 0.2879, + "step": 30512 + }, + { + "epoch": 0.89, + "grad_norm": 1.3621720992138613, + "learning_rate": 3.426873816849907e-07, + "loss": 0.29, + "step": 30513 + }, + { + "epoch": 0.89, + "grad_norm": 1.2958659581120888, + "learning_rate": 3.4251650508104194e-07, + "loss": 0.2691, + "step": 30514 + }, + { + "epoch": 0.89, + "grad_norm": 1.5824482593341018, + "learning_rate": 3.423456695794369e-07, + "loss": 0.2758, + "step": 30515 + }, + { + "epoch": 0.89, + "grad_norm": 1.7361448641013648, + "learning_rate": 3.421748751816839e-07, + "loss": 0.2604, + "step": 30516 + }, + { + "epoch": 0.89, + "grad_norm": 1.4489024969761017, + "learning_rate": 3.4200412188928834e-07, + "loss": 0.2686, + "step": 30517 + }, + { + "epoch": 0.89, + "grad_norm": 1.627205637330381, + "learning_rate": 3.4183340970375787e-07, + "loss": 0.2785, + "step": 30518 + }, + { + "epoch": 0.89, + "grad_norm": 1.3707106388629828, + "learning_rate": 3.416627386265997e-07, + "loss": 0.2752, + "step": 30519 + }, + { + "epoch": 0.89, + "grad_norm": 1.3673455166275164, + "learning_rate": 3.4149210865931927e-07, + "loss": 0.2844, + "step": 30520 + }, + { + "epoch": 0.89, + "grad_norm": 1.3792073404761689, + "learning_rate": 3.4132151980342255e-07, + "loss": 0.2689, + "step": 30521 + }, + { + "epoch": 0.89, + "grad_norm": 1.758759015997631, + "learning_rate": 3.4115097206041503e-07, + "loss": 0.2615, + "step": 30522 + }, + { + "epoch": 0.89, + "grad_norm": 1.2063054969218057, + "learning_rate": 3.4098046543180217e-07, + "loss": 0.2723, + "step": 30523 + }, + { + "epoch": 0.89, + "grad_norm": 1.7067367667073423, + "learning_rate": 3.408099999190884e-07, + "loss": 0.2757, + "step": 30524 + }, + { + "epoch": 0.89, + "grad_norm": 1.2812962790262075, + "learning_rate": 3.4063957552377845e-07, + "loss": 0.274, + "step": 30525 + }, + { + "epoch": 0.89, + "grad_norm": 1.2898859295818927, + "learning_rate": 3.404691922473757e-07, + "loss": 0.2728, + "step": 30526 + }, + { + "epoch": 0.89, + "grad_norm": 1.5843648567690354, + "learning_rate": 3.4029885009138343e-07, + "loss": 0.2807, + "step": 30527 + }, + { + "epoch": 0.89, + "grad_norm": 1.3551107986819362, + "learning_rate": 3.4012854905730585e-07, + "loss": 0.2538, + "step": 30528 + }, + { + "epoch": 0.89, + "grad_norm": 1.2933538096686057, + "learning_rate": 3.399582891466463e-07, + "loss": 0.2587, + "step": 30529 + }, + { + "epoch": 0.89, + "grad_norm": 1.2363543048029453, + "learning_rate": 3.397880703609058e-07, + "loss": 0.2594, + "step": 30530 + }, + { + "epoch": 0.89, + "grad_norm": 1.4440688941119932, + "learning_rate": 3.3961789270158754e-07, + "loss": 0.273, + "step": 30531 + }, + { + "epoch": 0.89, + "grad_norm": 1.3002699131581552, + "learning_rate": 3.3944775617019253e-07, + "loss": 0.296, + "step": 30532 + }, + { + "epoch": 0.89, + "grad_norm": 1.3101423974370725, + "learning_rate": 3.39277660768223e-07, + "loss": 0.3016, + "step": 30533 + }, + { + "epoch": 0.89, + "grad_norm": 1.3366749548234433, + "learning_rate": 3.391076064971799e-07, + "loss": 0.3217, + "step": 30534 + }, + { + "epoch": 0.89, + "grad_norm": 1.2228176640186104, + "learning_rate": 3.3893759335856423e-07, + "loss": 0.2739, + "step": 30535 + }, + { + "epoch": 0.89, + "grad_norm": 1.4271173647678501, + "learning_rate": 3.38767621353876e-07, + "loss": 0.2782, + "step": 30536 + }, + { + "epoch": 0.89, + "grad_norm": 1.6683594722333859, + "learning_rate": 3.3859769048461446e-07, + "loss": 0.2782, + "step": 30537 + }, + { + "epoch": 0.89, + "grad_norm": 1.2285024685099253, + "learning_rate": 3.3842780075228067e-07, + "loss": 0.2679, + "step": 30538 + }, + { + "epoch": 0.89, + "grad_norm": 1.2394668729960532, + "learning_rate": 3.3825795215837287e-07, + "loss": 0.2745, + "step": 30539 + }, + { + "epoch": 0.89, + "grad_norm": 1.9524038596640223, + "learning_rate": 3.3808814470439045e-07, + "loss": 0.2872, + "step": 30540 + }, + { + "epoch": 0.89, + "grad_norm": 1.4157229240158908, + "learning_rate": 3.379183783918316e-07, + "loss": 0.2621, + "step": 30541 + }, + { + "epoch": 0.89, + "grad_norm": 1.3877019697871815, + "learning_rate": 3.3774865322219517e-07, + "loss": 0.2741, + "step": 30542 + }, + { + "epoch": 0.89, + "grad_norm": 1.7758541445338765, + "learning_rate": 3.375789691969783e-07, + "loss": 0.2854, + "step": 30543 + }, + { + "epoch": 0.89, + "grad_norm": 1.4408329218135196, + "learning_rate": 3.3740932631767975e-07, + "loss": 0.2948, + "step": 30544 + }, + { + "epoch": 0.89, + "grad_norm": 1.8422035373876045, + "learning_rate": 3.3723972458579446e-07, + "loss": 0.2593, + "step": 30545 + }, + { + "epoch": 0.89, + "grad_norm": 2.2044781082507576, + "learning_rate": 3.3707016400282067e-07, + "loss": 0.2859, + "step": 30546 + }, + { + "epoch": 0.89, + "grad_norm": 1.4969321644518034, + "learning_rate": 3.369006445702544e-07, + "loss": 0.2801, + "step": 30547 + }, + { + "epoch": 0.89, + "grad_norm": 1.4068637478992918, + "learning_rate": 3.367311662895917e-07, + "loss": 0.2633, + "step": 30548 + }, + { + "epoch": 0.89, + "grad_norm": 1.4742511978436044, + "learning_rate": 3.36561729162328e-07, + "loss": 0.2764, + "step": 30549 + }, + { + "epoch": 0.89, + "grad_norm": 1.4320773826952715, + "learning_rate": 3.363923331899588e-07, + "loss": 0.2656, + "step": 30550 + }, + { + "epoch": 0.89, + "grad_norm": 1.505109737900277, + "learning_rate": 3.36222978373979e-07, + "loss": 0.2642, + "step": 30551 + }, + { + "epoch": 0.89, + "grad_norm": 3.44574115423927, + "learning_rate": 3.360536647158841e-07, + "loss": 0.2642, + "step": 30552 + }, + { + "epoch": 0.89, + "grad_norm": 1.3083192077174508, + "learning_rate": 3.358843922171667e-07, + "loss": 0.2617, + "step": 30553 + }, + { + "epoch": 0.89, + "grad_norm": 1.4667265233898856, + "learning_rate": 3.3571516087932074e-07, + "loss": 0.2654, + "step": 30554 + }, + { + "epoch": 0.89, + "grad_norm": 1.4636092981048454, + "learning_rate": 3.3554597070384055e-07, + "loss": 0.2662, + "step": 30555 + }, + { + "epoch": 0.89, + "grad_norm": 1.49980726001308, + "learning_rate": 3.353768216922193e-07, + "loss": 0.3531, + "step": 30556 + }, + { + "epoch": 0.89, + "grad_norm": 1.453051868526187, + "learning_rate": 3.3520771384594976e-07, + "loss": 0.2633, + "step": 30557 + }, + { + "epoch": 0.89, + "grad_norm": 1.50010266343853, + "learning_rate": 3.350386471665229e-07, + "loss": 0.2701, + "step": 30558 + }, + { + "epoch": 0.89, + "grad_norm": 1.3974696432157372, + "learning_rate": 3.34869621655432e-07, + "loss": 0.2859, + "step": 30559 + }, + { + "epoch": 0.89, + "grad_norm": 1.3364543314805726, + "learning_rate": 3.3470063731416815e-07, + "loss": 0.2582, + "step": 30560 + }, + { + "epoch": 0.89, + "grad_norm": 1.340364164400574, + "learning_rate": 3.3453169414422337e-07, + "loss": 0.2849, + "step": 30561 + }, + { + "epoch": 0.89, + "grad_norm": 1.3462105642004194, + "learning_rate": 3.3436279214708764e-07, + "loss": 0.2668, + "step": 30562 + }, + { + "epoch": 0.89, + "grad_norm": 1.264218663876598, + "learning_rate": 3.3419393132425313e-07, + "loss": 0.2896, + "step": 30563 + }, + { + "epoch": 0.89, + "grad_norm": 1.391885011755554, + "learning_rate": 3.3402511167720806e-07, + "loss": 0.2783, + "step": 30564 + }, + { + "epoch": 0.89, + "grad_norm": 1.397586974883859, + "learning_rate": 3.338563332074435e-07, + "loss": 0.2809, + "step": 30565 + }, + { + "epoch": 0.89, + "grad_norm": 1.4028998725347963, + "learning_rate": 3.336875959164482e-07, + "loss": 0.2668, + "step": 30566 + }, + { + "epoch": 0.89, + "grad_norm": 1.4016703342941388, + "learning_rate": 3.335188998057115e-07, + "loss": 0.2696, + "step": 30567 + }, + { + "epoch": 0.89, + "grad_norm": 1.914676026240787, + "learning_rate": 3.3335024487672287e-07, + "loss": 0.2536, + "step": 30568 + }, + { + "epoch": 0.89, + "grad_norm": 1.394598524772184, + "learning_rate": 3.3318163113096936e-07, + "loss": 0.2763, + "step": 30569 + }, + { + "epoch": 0.89, + "grad_norm": 1.7136784591463103, + "learning_rate": 3.330130585699404e-07, + "loss": 0.2636, + "step": 30570 + }, + { + "epoch": 0.89, + "grad_norm": 1.4516836200559815, + "learning_rate": 3.3284452719512306e-07, + "loss": 0.2656, + "step": 30571 + }, + { + "epoch": 0.89, + "grad_norm": 1.5906860515793817, + "learning_rate": 3.32676037008004e-07, + "loss": 0.283, + "step": 30572 + }, + { + "epoch": 0.89, + "grad_norm": 0.9283562364473005, + "learning_rate": 3.325075880100709e-07, + "loss": 0.5477, + "step": 30573 + }, + { + "epoch": 0.89, + "grad_norm": 1.5002619776334436, + "learning_rate": 3.323391802028103e-07, + "loss": 0.2721, + "step": 30574 + }, + { + "epoch": 0.89, + "grad_norm": 1.3501936578542844, + "learning_rate": 3.3217081358770775e-07, + "loss": 0.2474, + "step": 30575 + }, + { + "epoch": 0.89, + "grad_norm": 1.4298877291917262, + "learning_rate": 3.320024881662498e-07, + "loss": 0.2751, + "step": 30576 + }, + { + "epoch": 0.89, + "grad_norm": 1.4445030213057302, + "learning_rate": 3.3183420393992136e-07, + "loss": 0.2861, + "step": 30577 + }, + { + "epoch": 0.89, + "grad_norm": 1.5135215507691036, + "learning_rate": 3.3166596091020854e-07, + "loss": 0.264, + "step": 30578 + }, + { + "epoch": 0.89, + "grad_norm": 1.3675479857119832, + "learning_rate": 3.314977590785945e-07, + "loss": 0.2808, + "step": 30579 + }, + { + "epoch": 0.89, + "grad_norm": 1.3408335214969163, + "learning_rate": 3.313295984465659e-07, + "loss": 0.276, + "step": 30580 + }, + { + "epoch": 0.89, + "grad_norm": 1.2215850230825822, + "learning_rate": 3.311614790156048e-07, + "loss": 0.2639, + "step": 30581 + }, + { + "epoch": 0.89, + "grad_norm": 1.3015979313982193, + "learning_rate": 3.309934007871951e-07, + "loss": 0.2717, + "step": 30582 + }, + { + "epoch": 0.89, + "grad_norm": 1.4246313497583594, + "learning_rate": 3.3082536376282005e-07, + "loss": 0.2648, + "step": 30583 + }, + { + "epoch": 0.89, + "grad_norm": 1.690774932688752, + "learning_rate": 3.3065736794396343e-07, + "loss": 0.2764, + "step": 30584 + }, + { + "epoch": 0.89, + "grad_norm": 0.9241140089928133, + "learning_rate": 3.3048941333210794e-07, + "loss": 0.5998, + "step": 30585 + }, + { + "epoch": 0.89, + "grad_norm": 0.9729445994518696, + "learning_rate": 3.303214999287346e-07, + "loss": 0.6338, + "step": 30586 + }, + { + "epoch": 0.89, + "grad_norm": 0.9185214152366684, + "learning_rate": 3.3015362773532566e-07, + "loss": 0.5331, + "step": 30587 + }, + { + "epoch": 0.89, + "grad_norm": 1.280505433576832, + "learning_rate": 3.299857967533632e-07, + "loss": 0.2554, + "step": 30588 + }, + { + "epoch": 0.89, + "grad_norm": 1.363978766666406, + "learning_rate": 3.2981800698432766e-07, + "loss": 0.2873, + "step": 30589 + }, + { + "epoch": 0.89, + "grad_norm": 1.2540675045148695, + "learning_rate": 3.296502584297007e-07, + "loss": 0.2757, + "step": 30590 + }, + { + "epoch": 0.89, + "grad_norm": 1.3442715406901198, + "learning_rate": 3.2948255109096107e-07, + "loss": 0.2802, + "step": 30591 + }, + { + "epoch": 0.89, + "grad_norm": 1.557528681229248, + "learning_rate": 3.2931488496958984e-07, + "loss": 0.261, + "step": 30592 + }, + { + "epoch": 0.89, + "grad_norm": 0.9590467241610555, + "learning_rate": 3.2914726006706645e-07, + "loss": 0.6414, + "step": 30593 + }, + { + "epoch": 0.89, + "grad_norm": 1.2598634265765496, + "learning_rate": 3.289796763848707e-07, + "loss": 0.2726, + "step": 30594 + }, + { + "epoch": 0.89, + "grad_norm": 1.2928479717668473, + "learning_rate": 3.288121339244804e-07, + "loss": 0.274, + "step": 30595 + }, + { + "epoch": 0.89, + "grad_norm": 2.0062686452698784, + "learning_rate": 3.2864463268737547e-07, + "loss": 0.2672, + "step": 30596 + }, + { + "epoch": 0.89, + "grad_norm": 1.4564497196246577, + "learning_rate": 3.2847717267503354e-07, + "loss": 0.2644, + "step": 30597 + }, + { + "epoch": 0.89, + "grad_norm": 1.7709055997325038, + "learning_rate": 3.283097538889318e-07, + "loss": 0.2919, + "step": 30598 + }, + { + "epoch": 0.89, + "grad_norm": 1.3268185410945845, + "learning_rate": 3.281423763305497e-07, + "loss": 0.2608, + "step": 30599 + }, + { + "epoch": 0.89, + "grad_norm": 1.451597009594567, + "learning_rate": 3.2797504000136204e-07, + "loss": 0.2524, + "step": 30600 + }, + { + "epoch": 0.89, + "grad_norm": 1.4599581548010974, + "learning_rate": 3.2780774490284606e-07, + "loss": 0.2692, + "step": 30601 + }, + { + "epoch": 0.89, + "grad_norm": 1.4239850159935261, + "learning_rate": 3.2764049103647887e-07, + "loss": 0.2717, + "step": 30602 + }, + { + "epoch": 0.89, + "grad_norm": 1.4441803673060662, + "learning_rate": 3.2747327840373655e-07, + "loss": 0.2743, + "step": 30603 + }, + { + "epoch": 0.89, + "grad_norm": 1.468484732004886, + "learning_rate": 3.27306107006094e-07, + "loss": 0.2533, + "step": 30604 + }, + { + "epoch": 0.89, + "grad_norm": 1.2805196982044587, + "learning_rate": 3.271389768450267e-07, + "loss": 0.2665, + "step": 30605 + }, + { + "epoch": 0.89, + "grad_norm": 1.4820453773033708, + "learning_rate": 3.2697188792201017e-07, + "loss": 0.2615, + "step": 30606 + }, + { + "epoch": 0.89, + "grad_norm": 1.3125011534443538, + "learning_rate": 3.268048402385188e-07, + "loss": 0.2705, + "step": 30607 + }, + { + "epoch": 0.89, + "grad_norm": 1.4275017548544864, + "learning_rate": 3.266378337960263e-07, + "loss": 0.2807, + "step": 30608 + }, + { + "epoch": 0.89, + "grad_norm": 1.6556127828912397, + "learning_rate": 3.2647086859600664e-07, + "loss": 0.291, + "step": 30609 + }, + { + "epoch": 0.89, + "grad_norm": 1.4468308660882703, + "learning_rate": 3.263039446399335e-07, + "loss": 0.2672, + "step": 30610 + }, + { + "epoch": 0.89, + "grad_norm": 1.441317872088318, + "learning_rate": 3.2613706192927975e-07, + "loss": 0.2708, + "step": 30611 + }, + { + "epoch": 0.89, + "grad_norm": 1.2531845190112496, + "learning_rate": 3.25970220465518e-07, + "loss": 0.2581, + "step": 30612 + }, + { + "epoch": 0.89, + "grad_norm": 1.411409541154715, + "learning_rate": 3.2580342025012204e-07, + "loss": 0.2718, + "step": 30613 + }, + { + "epoch": 0.89, + "grad_norm": 1.7419303535160804, + "learning_rate": 3.2563666128456184e-07, + "loss": 0.2625, + "step": 30614 + }, + { + "epoch": 0.89, + "grad_norm": 1.4035872611608922, + "learning_rate": 3.2546994357031016e-07, + "loss": 0.2633, + "step": 30615 + }, + { + "epoch": 0.89, + "grad_norm": 1.3796005980554222, + "learning_rate": 3.25303267108838e-07, + "loss": 0.2692, + "step": 30616 + }, + { + "epoch": 0.89, + "grad_norm": 16.044672640403782, + "learning_rate": 3.2513663190161635e-07, + "loss": 0.2537, + "step": 30617 + }, + { + "epoch": 0.89, + "grad_norm": 1.396729389676245, + "learning_rate": 3.249700379501164e-07, + "loss": 0.2696, + "step": 30618 + }, + { + "epoch": 0.89, + "grad_norm": 1.267539109851686, + "learning_rate": 3.2480348525580685e-07, + "loss": 0.2637, + "step": 30619 + }, + { + "epoch": 0.89, + "grad_norm": 1.4044321209090467, + "learning_rate": 3.246369738201588e-07, + "loss": 0.2719, + "step": 30620 + }, + { + "epoch": 0.89, + "grad_norm": 1.3944584999282188, + "learning_rate": 3.2447050364464106e-07, + "loss": 0.2599, + "step": 30621 + }, + { + "epoch": 0.89, + "grad_norm": 1.5849938629847835, + "learning_rate": 3.24304074730723e-07, + "loss": 0.2743, + "step": 30622 + }, + { + "epoch": 0.89, + "grad_norm": 1.250092034453467, + "learning_rate": 3.241376870798735e-07, + "loss": 0.2715, + "step": 30623 + }, + { + "epoch": 0.89, + "grad_norm": 1.7723284430816106, + "learning_rate": 3.2397134069356074e-07, + "loss": 0.2791, + "step": 30624 + }, + { + "epoch": 0.89, + "grad_norm": 1.433800862017873, + "learning_rate": 3.238050355732525e-07, + "loss": 0.2916, + "step": 30625 + }, + { + "epoch": 0.89, + "grad_norm": 1.3443836075537796, + "learning_rate": 3.2363877172041814e-07, + "loss": 0.2752, + "step": 30626 + }, + { + "epoch": 0.89, + "grad_norm": 1.2504066284964714, + "learning_rate": 3.23472549136522e-07, + "loss": 0.2731, + "step": 30627 + }, + { + "epoch": 0.89, + "grad_norm": 1.3566754960861471, + "learning_rate": 3.23306367823033e-07, + "loss": 0.2602, + "step": 30628 + }, + { + "epoch": 0.89, + "grad_norm": 1.3574356345289793, + "learning_rate": 3.2314022778141664e-07, + "loss": 0.2606, + "step": 30629 + }, + { + "epoch": 0.89, + "grad_norm": 1.340796223153973, + "learning_rate": 3.2297412901313995e-07, + "loss": 0.2637, + "step": 30630 + }, + { + "epoch": 0.89, + "grad_norm": 1.462119586409474, + "learning_rate": 3.2280807151966854e-07, + "loss": 0.2472, + "step": 30631 + }, + { + "epoch": 0.89, + "grad_norm": 2.1648205366296236, + "learning_rate": 3.226420553024678e-07, + "loss": 0.274, + "step": 30632 + }, + { + "epoch": 0.89, + "grad_norm": 1.427131449776441, + "learning_rate": 3.2247608036300283e-07, + "loss": 0.2606, + "step": 30633 + }, + { + "epoch": 0.89, + "grad_norm": 1.2747192231174431, + "learning_rate": 3.2231014670273843e-07, + "loss": 0.2599, + "step": 30634 + }, + { + "epoch": 0.89, + "grad_norm": 1.8152965579819422, + "learning_rate": 3.2214425432313955e-07, + "loss": 0.2844, + "step": 30635 + }, + { + "epoch": 0.89, + "grad_norm": 3.407161350843975, + "learning_rate": 3.2197840322566843e-07, + "loss": 0.3026, + "step": 30636 + }, + { + "epoch": 0.89, + "grad_norm": 1.28685104093736, + "learning_rate": 3.2181259341178997e-07, + "loss": 0.2706, + "step": 30637 + }, + { + "epoch": 0.89, + "grad_norm": 1.4106180903891312, + "learning_rate": 3.216468248829674e-07, + "loss": 0.2794, + "step": 30638 + }, + { + "epoch": 0.89, + "grad_norm": 1.2784491032763268, + "learning_rate": 3.214810976406635e-07, + "loss": 0.2843, + "step": 30639 + }, + { + "epoch": 0.89, + "grad_norm": 7.402688349515644, + "learning_rate": 3.2131541168634095e-07, + "loss": 0.2738, + "step": 30640 + }, + { + "epoch": 0.89, + "grad_norm": 1.4167449633466647, + "learning_rate": 3.2114976702146137e-07, + "loss": 0.2957, + "step": 30641 + }, + { + "epoch": 0.89, + "grad_norm": 1.2788692422676473, + "learning_rate": 3.209841636474881e-07, + "loss": 0.2631, + "step": 30642 + }, + { + "epoch": 0.89, + "grad_norm": 1.4531787393870637, + "learning_rate": 3.208186015658804e-07, + "loss": 0.2814, + "step": 30643 + }, + { + "epoch": 0.89, + "grad_norm": 1.3528267557074987, + "learning_rate": 3.206530807781005e-07, + "loss": 0.277, + "step": 30644 + }, + { + "epoch": 0.89, + "grad_norm": 1.3593184083527692, + "learning_rate": 3.2048760128561005e-07, + "loss": 0.2847, + "step": 30645 + }, + { + "epoch": 0.89, + "grad_norm": 1.904082595338017, + "learning_rate": 3.203221630898679e-07, + "loss": 0.3216, + "step": 30646 + }, + { + "epoch": 0.89, + "grad_norm": 1.2851046302234652, + "learning_rate": 3.201567661923344e-07, + "loss": 0.2639, + "step": 30647 + }, + { + "epoch": 0.89, + "grad_norm": 1.324432495484659, + "learning_rate": 3.1999141059446903e-07, + "loss": 0.2861, + "step": 30648 + }, + { + "epoch": 0.89, + "grad_norm": 1.278758793693771, + "learning_rate": 3.1982609629773176e-07, + "loss": 0.2676, + "step": 30649 + }, + { + "epoch": 0.89, + "grad_norm": 1.5789170851855512, + "learning_rate": 3.1966082330358086e-07, + "loss": 0.2898, + "step": 30650 + }, + { + "epoch": 0.89, + "grad_norm": 1.681573279493702, + "learning_rate": 3.194955916134751e-07, + "loss": 0.2874, + "step": 30651 + }, + { + "epoch": 0.89, + "grad_norm": 1.4748902931922807, + "learning_rate": 3.193304012288728e-07, + "loss": 0.2834, + "step": 30652 + }, + { + "epoch": 0.89, + "grad_norm": 1.6235643367368529, + "learning_rate": 3.1916525215123175e-07, + "loss": 0.3178, + "step": 30653 + }, + { + "epoch": 0.89, + "grad_norm": 1.4933780264094279, + "learning_rate": 3.190001443820101e-07, + "loss": 0.2614, + "step": 30654 + }, + { + "epoch": 0.89, + "grad_norm": 1.2426556614040434, + "learning_rate": 3.188350779226629e-07, + "loss": 0.2735, + "step": 30655 + }, + { + "epoch": 0.89, + "grad_norm": 1.454895179951632, + "learning_rate": 3.1867005277464833e-07, + "loss": 0.2774, + "step": 30656 + }, + { + "epoch": 0.89, + "grad_norm": 1.4343510009142646, + "learning_rate": 3.1850506893942255e-07, + "loss": 0.277, + "step": 30657 + }, + { + "epoch": 0.89, + "grad_norm": 1.9916404606626013, + "learning_rate": 3.18340126418441e-07, + "loss": 0.2745, + "step": 30658 + }, + { + "epoch": 0.89, + "grad_norm": 1.5307906930355162, + "learning_rate": 3.1817522521316034e-07, + "loss": 0.2835, + "step": 30659 + }, + { + "epoch": 0.89, + "grad_norm": 1.4996477371848187, + "learning_rate": 3.1801036532503494e-07, + "loss": 0.3193, + "step": 30660 + }, + { + "epoch": 0.89, + "grad_norm": 1.383667413559081, + "learning_rate": 3.1784554675552024e-07, + "loss": 0.2639, + "step": 30661 + }, + { + "epoch": 0.89, + "grad_norm": 1.3592253780882433, + "learning_rate": 3.176807695060713e-07, + "loss": 0.2726, + "step": 30662 + }, + { + "epoch": 0.89, + "grad_norm": 1.3517915081600953, + "learning_rate": 3.1751603357814076e-07, + "loss": 0.2761, + "step": 30663 + }, + { + "epoch": 0.89, + "grad_norm": 1.334186463371296, + "learning_rate": 3.1735133897318305e-07, + "loss": 0.2574, + "step": 30664 + }, + { + "epoch": 0.89, + "grad_norm": 2.1582709824002553, + "learning_rate": 3.1718668569265144e-07, + "loss": 0.2866, + "step": 30665 + }, + { + "epoch": 0.89, + "grad_norm": 1.3127692136741065, + "learning_rate": 3.1702207373799976e-07, + "loss": 0.2915, + "step": 30666 + }, + { + "epoch": 0.89, + "grad_norm": 1.2460803557953113, + "learning_rate": 3.1685750311067967e-07, + "loss": 0.2423, + "step": 30667 + }, + { + "epoch": 0.89, + "grad_norm": 1.8458777651059122, + "learning_rate": 3.1669297381214435e-07, + "loss": 0.3135, + "step": 30668 + }, + { + "epoch": 0.89, + "grad_norm": 1.4104684190297194, + "learning_rate": 3.1652848584384556e-07, + "loss": 0.2524, + "step": 30669 + }, + { + "epoch": 0.89, + "grad_norm": 1.010050246694468, + "learning_rate": 3.163640392072359e-07, + "loss": 0.6198, + "step": 30670 + }, + { + "epoch": 0.89, + "grad_norm": 1.4616680756954794, + "learning_rate": 3.161996339037643e-07, + "loss": 0.2641, + "step": 30671 + }, + { + "epoch": 0.89, + "grad_norm": 1.3534373453895125, + "learning_rate": 3.1603526993488287e-07, + "loss": 0.2609, + "step": 30672 + }, + { + "epoch": 0.89, + "grad_norm": 1.2992616996929107, + "learning_rate": 3.1587094730204324e-07, + "loss": 0.2834, + "step": 30673 + }, + { + "epoch": 0.89, + "grad_norm": 2.168954268158181, + "learning_rate": 3.1570666600669377e-07, + "loss": 0.2891, + "step": 30674 + }, + { + "epoch": 0.89, + "grad_norm": 1.5692314446823439, + "learning_rate": 3.1554242605028484e-07, + "loss": 0.2625, + "step": 30675 + }, + { + "epoch": 0.89, + "grad_norm": 1.4517040960830698, + "learning_rate": 3.1537822743426536e-07, + "loss": 0.2532, + "step": 30676 + }, + { + "epoch": 0.89, + "grad_norm": 1.2509059205152735, + "learning_rate": 3.1521407016008586e-07, + "loss": 0.2749, + "step": 30677 + }, + { + "epoch": 0.89, + "grad_norm": 2.4653582780595142, + "learning_rate": 3.150499542291935e-07, + "loss": 0.3076, + "step": 30678 + }, + { + "epoch": 0.89, + "grad_norm": 1.3874012535843265, + "learning_rate": 3.148858796430376e-07, + "loss": 0.3196, + "step": 30679 + }, + { + "epoch": 0.89, + "grad_norm": 0.9184241041035388, + "learning_rate": 3.14721846403066e-07, + "loss": 0.5494, + "step": 30680 + }, + { + "epoch": 0.89, + "grad_norm": 1.3239604945184957, + "learning_rate": 3.145578545107264e-07, + "loss": 0.2712, + "step": 30681 + }, + { + "epoch": 0.89, + "grad_norm": 1.3650084332158667, + "learning_rate": 3.143939039674654e-07, + "loss": 0.271, + "step": 30682 + }, + { + "epoch": 0.89, + "grad_norm": 1.4401203230336628, + "learning_rate": 3.142299947747296e-07, + "loss": 0.2826, + "step": 30683 + }, + { + "epoch": 0.89, + "grad_norm": 1.5123865896914583, + "learning_rate": 3.140661269339668e-07, + "loss": 0.2906, + "step": 30684 + }, + { + "epoch": 0.89, + "grad_norm": 0.8815356013804783, + "learning_rate": 3.1390230044662184e-07, + "loss": 0.547, + "step": 30685 + }, + { + "epoch": 0.89, + "grad_norm": 1.7183608126941852, + "learning_rate": 3.137385153141409e-07, + "loss": 0.2649, + "step": 30686 + }, + { + "epoch": 0.89, + "grad_norm": 1.2868340496548596, + "learning_rate": 3.1357477153797e-07, + "loss": 0.2704, + "step": 30687 + }, + { + "epoch": 0.89, + "grad_norm": 1.4775244339844251, + "learning_rate": 3.134110691195535e-07, + "loss": 0.2712, + "step": 30688 + }, + { + "epoch": 0.89, + "grad_norm": 1.3507546131912573, + "learning_rate": 3.132474080603365e-07, + "loss": 0.2757, + "step": 30689 + }, + { + "epoch": 0.89, + "grad_norm": 1.3206650443064474, + "learning_rate": 3.1308378836176377e-07, + "loss": 0.2605, + "step": 30690 + }, + { + "epoch": 0.89, + "grad_norm": 1.3845191531026524, + "learning_rate": 3.1292021002527816e-07, + "loss": 0.2743, + "step": 30691 + }, + { + "epoch": 0.89, + "grad_norm": 1.460530499476944, + "learning_rate": 3.127566730523235e-07, + "loss": 0.2957, + "step": 30692 + }, + { + "epoch": 0.89, + "grad_norm": 1.4034270737341612, + "learning_rate": 3.12593177444343e-07, + "loss": 0.2911, + "step": 30693 + }, + { + "epoch": 0.89, + "grad_norm": 1.3565090230403976, + "learning_rate": 3.1242972320277953e-07, + "loss": 0.3061, + "step": 30694 + }, + { + "epoch": 0.89, + "grad_norm": 1.3873126623799554, + "learning_rate": 3.122663103290763e-07, + "loss": 0.2673, + "step": 30695 + }, + { + "epoch": 0.89, + "grad_norm": 1.4715404989336074, + "learning_rate": 3.1210293882467436e-07, + "loss": 0.2689, + "step": 30696 + }, + { + "epoch": 0.89, + "grad_norm": 1.7238626238844046, + "learning_rate": 3.1193960869101645e-07, + "loss": 0.2669, + "step": 30697 + }, + { + "epoch": 0.89, + "grad_norm": 1.4921237692654044, + "learning_rate": 3.117763199295437e-07, + "loss": 0.2708, + "step": 30698 + }, + { + "epoch": 0.89, + "grad_norm": 1.3408323653637877, + "learning_rate": 3.1161307254169657e-07, + "loss": 0.2767, + "step": 30699 + }, + { + "epoch": 0.89, + "grad_norm": 1.5099033233040322, + "learning_rate": 3.1144986652891674e-07, + "loss": 0.2973, + "step": 30700 + }, + { + "epoch": 0.89, + "grad_norm": 1.3417793495732198, + "learning_rate": 3.1128670189264353e-07, + "loss": 0.2675, + "step": 30701 + }, + { + "epoch": 0.89, + "grad_norm": 1.3419185807204996, + "learning_rate": 3.111235786343164e-07, + "loss": 0.2725, + "step": 30702 + }, + { + "epoch": 0.89, + "grad_norm": 1.2648277538976467, + "learning_rate": 3.1096049675537644e-07, + "loss": 0.271, + "step": 30703 + }, + { + "epoch": 0.89, + "grad_norm": 1.4371731088018709, + "learning_rate": 3.107974562572619e-07, + "loss": 0.2997, + "step": 30704 + }, + { + "epoch": 0.89, + "grad_norm": 1.7652527546460353, + "learning_rate": 3.106344571414116e-07, + "loss": 0.2464, + "step": 30705 + }, + { + "epoch": 0.89, + "grad_norm": 1.9191272668967205, + "learning_rate": 3.1047149940926444e-07, + "loss": 0.2602, + "step": 30706 + }, + { + "epoch": 0.89, + "grad_norm": 1.454646331150121, + "learning_rate": 3.103085830622582e-07, + "loss": 0.312, + "step": 30707 + }, + { + "epoch": 0.89, + "grad_norm": 1.327562176436962, + "learning_rate": 3.101457081018305e-07, + "loss": 0.2833, + "step": 30708 + }, + { + "epoch": 0.89, + "grad_norm": 1.6142063054514533, + "learning_rate": 3.0998287452941976e-07, + "loss": 0.2772, + "step": 30709 + }, + { + "epoch": 0.89, + "grad_norm": 1.4696675168499138, + "learning_rate": 3.09820082346462e-07, + "loss": 0.2861, + "step": 30710 + }, + { + "epoch": 0.89, + "grad_norm": 1.3813631837069913, + "learning_rate": 3.096573315543938e-07, + "loss": 0.2735, + "step": 30711 + }, + { + "epoch": 0.89, + "grad_norm": 1.7882972553887646, + "learning_rate": 3.094946221546513e-07, + "loss": 0.2711, + "step": 30712 + }, + { + "epoch": 0.89, + "grad_norm": 1.4124068609750526, + "learning_rate": 3.0933195414867114e-07, + "loss": 0.2749, + "step": 30713 + }, + { + "epoch": 0.89, + "grad_norm": 1.9364412927259849, + "learning_rate": 3.091693275378882e-07, + "loss": 0.2611, + "step": 30714 + }, + { + "epoch": 0.89, + "grad_norm": 1.427870741170675, + "learning_rate": 3.090067423237386e-07, + "loss": 0.2669, + "step": 30715 + }, + { + "epoch": 0.89, + "grad_norm": 1.5708950804248523, + "learning_rate": 3.088441985076562e-07, + "loss": 0.2787, + "step": 30716 + }, + { + "epoch": 0.89, + "grad_norm": 1.2652282775486547, + "learning_rate": 3.086816960910766e-07, + "loss": 0.271, + "step": 30717 + }, + { + "epoch": 0.89, + "grad_norm": 1.3902560238134896, + "learning_rate": 3.085192350754318e-07, + "loss": 0.2846, + "step": 30718 + }, + { + "epoch": 0.89, + "grad_norm": 1.4856185216458024, + "learning_rate": 3.0835681546215746e-07, + "loss": 0.2869, + "step": 30719 + }, + { + "epoch": 0.89, + "grad_norm": 1.2784580239378425, + "learning_rate": 3.081944372526857e-07, + "loss": 0.2606, + "step": 30720 + }, + { + "epoch": 0.89, + "grad_norm": 1.347890466543571, + "learning_rate": 3.0803210044845046e-07, + "loss": 0.2511, + "step": 30721 + }, + { + "epoch": 0.89, + "grad_norm": 1.3857716302471614, + "learning_rate": 3.078698050508844e-07, + "loss": 0.2604, + "step": 30722 + }, + { + "epoch": 0.89, + "grad_norm": 1.412688036195972, + "learning_rate": 3.077075510614186e-07, + "loss": 0.3066, + "step": 30723 + }, + { + "epoch": 0.89, + "grad_norm": 1.3028527458023975, + "learning_rate": 3.075453384814858e-07, + "loss": 0.275, + "step": 30724 + }, + { + "epoch": 0.89, + "grad_norm": 1.4531398612106898, + "learning_rate": 3.073831673125177e-07, + "loss": 0.2568, + "step": 30725 + }, + { + "epoch": 0.89, + "grad_norm": 1.3499149240965214, + "learning_rate": 3.072210375559459e-07, + "loss": 0.2552, + "step": 30726 + }, + { + "epoch": 0.89, + "grad_norm": 1.4886982571267742, + "learning_rate": 3.070589492131998e-07, + "loss": 0.3137, + "step": 30727 + }, + { + "epoch": 0.89, + "grad_norm": 1.2546139263436382, + "learning_rate": 3.068969022857116e-07, + "loss": 0.2588, + "step": 30728 + }, + { + "epoch": 0.89, + "grad_norm": 2.5840570144630086, + "learning_rate": 3.0673489677490896e-07, + "loss": 0.2671, + "step": 30729 + }, + { + "epoch": 0.89, + "grad_norm": 1.4199003986977445, + "learning_rate": 3.065729326822231e-07, + "loss": 0.298, + "step": 30730 + }, + { + "epoch": 0.89, + "grad_norm": 1.563032853499554, + "learning_rate": 3.0641101000908333e-07, + "loss": 0.2812, + "step": 30731 + }, + { + "epoch": 0.89, + "grad_norm": 1.516878606807032, + "learning_rate": 3.062491287569186e-07, + "loss": 0.2943, + "step": 30732 + }, + { + "epoch": 0.89, + "grad_norm": 1.2812842346515674, + "learning_rate": 3.060872889271571e-07, + "loss": 0.2671, + "step": 30733 + }, + { + "epoch": 0.89, + "grad_norm": 1.2549096017493513, + "learning_rate": 3.059254905212272e-07, + "loss": 0.2594, + "step": 30734 + }, + { + "epoch": 0.89, + "grad_norm": 1.2692547592811096, + "learning_rate": 3.057637335405572e-07, + "loss": 0.2478, + "step": 30735 + }, + { + "epoch": 0.89, + "grad_norm": 3.32328411593771, + "learning_rate": 3.0560201798657484e-07, + "loss": 0.2651, + "step": 30736 + }, + { + "epoch": 0.89, + "grad_norm": 1.2442345379862325, + "learning_rate": 3.054403438607062e-07, + "loss": 0.2505, + "step": 30737 + }, + { + "epoch": 0.89, + "grad_norm": 1.3575889764367728, + "learning_rate": 3.0527871116437847e-07, + "loss": 0.2666, + "step": 30738 + }, + { + "epoch": 0.89, + "grad_norm": 2.552579288789101, + "learning_rate": 3.0511711989901827e-07, + "loss": 0.2625, + "step": 30739 + }, + { + "epoch": 0.89, + "grad_norm": 1.2045194177959675, + "learning_rate": 3.049555700660517e-07, + "loss": 0.2538, + "step": 30740 + }, + { + "epoch": 0.89, + "grad_norm": 1.9905315232723915, + "learning_rate": 3.047940616669043e-07, + "loss": 0.2798, + "step": 30741 + }, + { + "epoch": 0.89, + "grad_norm": 1.2228964132069076, + "learning_rate": 3.04632594703001e-07, + "loss": 0.2504, + "step": 30742 + }, + { + "epoch": 0.89, + "grad_norm": 1.3672211295194663, + "learning_rate": 3.0447116917576736e-07, + "loss": 0.2771, + "step": 30743 + }, + { + "epoch": 0.89, + "grad_norm": 1.2739211701285975, + "learning_rate": 3.0430978508662777e-07, + "loss": 0.2559, + "step": 30744 + }, + { + "epoch": 0.89, + "grad_norm": 1.3456614821554307, + "learning_rate": 3.0414844243700727e-07, + "loss": 0.2727, + "step": 30745 + }, + { + "epoch": 0.89, + "grad_norm": 1.2697004657699607, + "learning_rate": 3.0398714122832796e-07, + "loss": 0.241, + "step": 30746 + }, + { + "epoch": 0.89, + "grad_norm": 1.3286160102267097, + "learning_rate": 3.038258814620143e-07, + "loss": 0.2652, + "step": 30747 + }, + { + "epoch": 0.89, + "grad_norm": 1.3905804540340185, + "learning_rate": 3.0366466313948963e-07, + "loss": 0.2795, + "step": 30748 + }, + { + "epoch": 0.89, + "grad_norm": 1.589977595599772, + "learning_rate": 3.0350348626217605e-07, + "loss": 0.3032, + "step": 30749 + }, + { + "epoch": 0.89, + "grad_norm": 1.593905731523357, + "learning_rate": 3.033423508314964e-07, + "loss": 0.2703, + "step": 30750 + }, + { + "epoch": 0.89, + "grad_norm": 1.4837327994235887, + "learning_rate": 3.0318125684887233e-07, + "loss": 0.2911, + "step": 30751 + }, + { + "epoch": 0.89, + "grad_norm": 1.6215837215904694, + "learning_rate": 3.03020204315726e-07, + "loss": 0.2714, + "step": 30752 + }, + { + "epoch": 0.89, + "grad_norm": 1.3322302302854963, + "learning_rate": 3.028591932334796e-07, + "loss": 0.2597, + "step": 30753 + }, + { + "epoch": 0.89, + "grad_norm": 1.2407452460791917, + "learning_rate": 3.026982236035514e-07, + "loss": 0.2721, + "step": 30754 + }, + { + "epoch": 0.89, + "grad_norm": 1.3707977776693177, + "learning_rate": 3.0253729542736475e-07, + "loss": 0.2732, + "step": 30755 + }, + { + "epoch": 0.89, + "grad_norm": 1.364116684007964, + "learning_rate": 3.02376408706338e-07, + "loss": 0.2674, + "step": 30756 + }, + { + "epoch": 0.89, + "grad_norm": 1.3029662787174008, + "learning_rate": 3.0221556344189107e-07, + "loss": 0.277, + "step": 30757 + }, + { + "epoch": 0.89, + "grad_norm": 1.4041348156980513, + "learning_rate": 3.0205475963544394e-07, + "loss": 0.2655, + "step": 30758 + }, + { + "epoch": 0.89, + "grad_norm": 1.2874009480248623, + "learning_rate": 3.0189399728841605e-07, + "loss": 0.2901, + "step": 30759 + }, + { + "epoch": 0.89, + "grad_norm": 1.5602455846069199, + "learning_rate": 3.0173327640222573e-07, + "loss": 0.2638, + "step": 30760 + }, + { + "epoch": 0.89, + "grad_norm": 1.4183170988045615, + "learning_rate": 3.0157259697829067e-07, + "loss": 0.2694, + "step": 30761 + }, + { + "epoch": 0.89, + "grad_norm": 1.5999452549663948, + "learning_rate": 3.014119590180303e-07, + "loss": 0.2618, + "step": 30762 + }, + { + "epoch": 0.89, + "grad_norm": 1.3680668065853314, + "learning_rate": 3.0125136252286136e-07, + "loss": 0.2562, + "step": 30763 + }, + { + "epoch": 0.89, + "grad_norm": 1.3097736378527698, + "learning_rate": 3.0109080749420203e-07, + "loss": 0.2775, + "step": 30764 + }, + { + "epoch": 0.89, + "grad_norm": 1.680534836557246, + "learning_rate": 3.009302939334674e-07, + "loss": 0.2619, + "step": 30765 + }, + { + "epoch": 0.89, + "grad_norm": 1.5393930816915267, + "learning_rate": 3.0076982184207516e-07, + "loss": 0.2682, + "step": 30766 + }, + { + "epoch": 0.89, + "grad_norm": 1.8935144779834872, + "learning_rate": 3.0060939122144195e-07, + "loss": 0.2639, + "step": 30767 + }, + { + "epoch": 0.89, + "grad_norm": 1.3641523075150368, + "learning_rate": 3.004490020729822e-07, + "loss": 0.2723, + "step": 30768 + }, + { + "epoch": 0.89, + "grad_norm": 1.3828368060932017, + "learning_rate": 3.002886543981126e-07, + "loss": 0.2984, + "step": 30769 + }, + { + "epoch": 0.89, + "grad_norm": 1.344606450227521, + "learning_rate": 3.0012834819824755e-07, + "loss": 0.2633, + "step": 30770 + }, + { + "epoch": 0.89, + "grad_norm": 0.9031809383806234, + "learning_rate": 2.99968083474802e-07, + "loss": 0.5391, + "step": 30771 + }, + { + "epoch": 0.89, + "grad_norm": 1.314193631472315, + "learning_rate": 2.9980786022919097e-07, + "loss": 0.2731, + "step": 30772 + }, + { + "epoch": 0.89, + "grad_norm": 1.4131928341089675, + "learning_rate": 2.996476784628272e-07, + "loss": 0.3156, + "step": 30773 + }, + { + "epoch": 0.89, + "grad_norm": 1.8115466577271948, + "learning_rate": 2.994875381771245e-07, + "loss": 0.2558, + "step": 30774 + }, + { + "epoch": 0.89, + "grad_norm": 1.2533294426829216, + "learning_rate": 2.9932743937349684e-07, + "loss": 0.2646, + "step": 30775 + }, + { + "epoch": 0.89, + "grad_norm": 1.3581672477815974, + "learning_rate": 2.991673820533564e-07, + "loss": 0.2628, + "step": 30776 + }, + { + "epoch": 0.89, + "grad_norm": 1.3530294178604099, + "learning_rate": 2.9900736621811586e-07, + "loss": 0.2606, + "step": 30777 + }, + { + "epoch": 0.89, + "grad_norm": 1.4996541110216222, + "learning_rate": 2.9884739186918754e-07, + "loss": 0.2714, + "step": 30778 + }, + { + "epoch": 0.89, + "grad_norm": 1.5057628092065585, + "learning_rate": 2.986874590079836e-07, + "loss": 0.2763, + "step": 30779 + }, + { + "epoch": 0.89, + "grad_norm": 1.4976658966473468, + "learning_rate": 2.985275676359145e-07, + "loss": 0.2766, + "step": 30780 + }, + { + "epoch": 0.89, + "grad_norm": 1.3638400079870587, + "learning_rate": 2.9836771775439255e-07, + "loss": 0.2768, + "step": 30781 + }, + { + "epoch": 0.89, + "grad_norm": 1.86445631411074, + "learning_rate": 2.982079093648277e-07, + "loss": 0.2866, + "step": 30782 + }, + { + "epoch": 0.89, + "grad_norm": 2.049415506380046, + "learning_rate": 2.980481424686293e-07, + "loss": 0.2874, + "step": 30783 + }, + { + "epoch": 0.89, + "grad_norm": 1.2463591031180217, + "learning_rate": 2.9788841706720805e-07, + "loss": 0.2572, + "step": 30784 + }, + { + "epoch": 0.89, + "grad_norm": 0.9778023804138922, + "learning_rate": 2.9772873316197437e-07, + "loss": 0.5973, + "step": 30785 + }, + { + "epoch": 0.89, + "grad_norm": 1.2936505938336038, + "learning_rate": 2.9756909075433606e-07, + "loss": 0.2637, + "step": 30786 + }, + { + "epoch": 0.89, + "grad_norm": 1.278072926393397, + "learning_rate": 2.9740948984570307e-07, + "loss": 0.2798, + "step": 30787 + }, + { + "epoch": 0.89, + "grad_norm": 1.3152643288285684, + "learning_rate": 2.9724993043748373e-07, + "loss": 0.2777, + "step": 30788 + }, + { + "epoch": 0.89, + "grad_norm": 1.9080708688656942, + "learning_rate": 2.970904125310853e-07, + "loss": 0.2533, + "step": 30789 + }, + { + "epoch": 0.89, + "grad_norm": 1.3803143685570962, + "learning_rate": 2.969309361279166e-07, + "loss": 0.2869, + "step": 30790 + }, + { + "epoch": 0.89, + "grad_norm": 1.3992683035817721, + "learning_rate": 2.967715012293854e-07, + "loss": 0.2783, + "step": 30791 + }, + { + "epoch": 0.89, + "grad_norm": 1.6694616107975215, + "learning_rate": 2.9661210783689664e-07, + "loss": 0.2718, + "step": 30792 + }, + { + "epoch": 0.89, + "grad_norm": 1.6816950038594012, + "learning_rate": 2.9645275595185875e-07, + "loss": 0.2631, + "step": 30793 + }, + { + "epoch": 0.89, + "grad_norm": 1.5765475411802958, + "learning_rate": 2.9629344557567717e-07, + "loss": 0.2579, + "step": 30794 + }, + { + "epoch": 0.89, + "grad_norm": 2.127756817331954, + "learning_rate": 2.961341767097581e-07, + "loss": 0.2887, + "step": 30795 + }, + { + "epoch": 0.89, + "grad_norm": 1.6731248584879905, + "learning_rate": 2.9597494935550754e-07, + "loss": 0.2646, + "step": 30796 + }, + { + "epoch": 0.89, + "grad_norm": 5.547831694769534, + "learning_rate": 2.958157635143294e-07, + "loss": 0.2678, + "step": 30797 + }, + { + "epoch": 0.89, + "grad_norm": 1.3872758575069113, + "learning_rate": 2.9565661918762977e-07, + "loss": 0.2607, + "step": 30798 + }, + { + "epoch": 0.89, + "grad_norm": 1.3167298577333524, + "learning_rate": 2.954975163768131e-07, + "loss": 0.2935, + "step": 30799 + }, + { + "epoch": 0.89, + "grad_norm": 1.5417324798016485, + "learning_rate": 2.953384550832833e-07, + "loss": 0.2976, + "step": 30800 + }, + { + "epoch": 0.89, + "grad_norm": 1.3579660000434455, + "learning_rate": 2.9517943530844307e-07, + "loss": 0.2561, + "step": 30801 + }, + { + "epoch": 0.89, + "grad_norm": 6.290624602471621, + "learning_rate": 2.950204570536963e-07, + "loss": 0.2673, + "step": 30802 + }, + { + "epoch": 0.89, + "grad_norm": 1.2789721159719574, + "learning_rate": 2.948615203204469e-07, + "loss": 0.2821, + "step": 30803 + }, + { + "epoch": 0.89, + "grad_norm": 1.3170701262645361, + "learning_rate": 2.947026251100965e-07, + "loss": 0.2464, + "step": 30804 + }, + { + "epoch": 0.89, + "grad_norm": 1.3275154075844668, + "learning_rate": 2.945437714240473e-07, + "loss": 0.2738, + "step": 30805 + }, + { + "epoch": 0.89, + "grad_norm": 1.2974269025474034, + "learning_rate": 2.943849592637016e-07, + "loss": 0.2782, + "step": 30806 + }, + { + "epoch": 0.89, + "grad_norm": 1.460181171464323, + "learning_rate": 2.942261886304609e-07, + "loss": 0.2936, + "step": 30807 + }, + { + "epoch": 0.89, + "grad_norm": 1.3830163236475992, + "learning_rate": 2.94067459525727e-07, + "loss": 0.2718, + "step": 30808 + }, + { + "epoch": 0.89, + "grad_norm": 1.00989059229119, + "learning_rate": 2.939087719508993e-07, + "loss": 0.6089, + "step": 30809 + }, + { + "epoch": 0.89, + "grad_norm": 1.2530604058949368, + "learning_rate": 2.9375012590737994e-07, + "loss": 0.2669, + "step": 30810 + }, + { + "epoch": 0.89, + "grad_norm": 1.0062007129062795, + "learning_rate": 2.935915213965668e-07, + "loss": 0.6038, + "step": 30811 + }, + { + "epoch": 0.89, + "grad_norm": 1.365556416965551, + "learning_rate": 2.934329584198609e-07, + "loss": 0.2817, + "step": 30812 + }, + { + "epoch": 0.89, + "grad_norm": 1.4617995955713479, + "learning_rate": 2.932744369786611e-07, + "loss": 0.2561, + "step": 30813 + }, + { + "epoch": 0.89, + "grad_norm": 1.4218680875648346, + "learning_rate": 2.931159570743669e-07, + "loss": 0.2561, + "step": 30814 + }, + { + "epoch": 0.89, + "grad_norm": 1.3938567115863474, + "learning_rate": 2.929575187083761e-07, + "loss": 0.2649, + "step": 30815 + }, + { + "epoch": 0.89, + "grad_norm": 1.2559651585813025, + "learning_rate": 2.9279912188208806e-07, + "loss": 0.2609, + "step": 30816 + }, + { + "epoch": 0.89, + "grad_norm": 1.3756592425893743, + "learning_rate": 2.9264076659689944e-07, + "loss": 0.27, + "step": 30817 + }, + { + "epoch": 0.89, + "grad_norm": 1.6577088941757374, + "learning_rate": 2.9248245285420915e-07, + "loss": 0.2863, + "step": 30818 + }, + { + "epoch": 0.89, + "grad_norm": 1.54390780428285, + "learning_rate": 2.923241806554133e-07, + "loss": 0.2442, + "step": 30819 + }, + { + "epoch": 0.89, + "grad_norm": 1.4505869526515034, + "learning_rate": 2.921659500019081e-07, + "loss": 0.2792, + "step": 30820 + }, + { + "epoch": 0.89, + "grad_norm": 1.3272301186872904, + "learning_rate": 2.9200776089509053e-07, + "loss": 0.2532, + "step": 30821 + }, + { + "epoch": 0.89, + "grad_norm": 1.499047642882403, + "learning_rate": 2.918496133363574e-07, + "loss": 0.266, + "step": 30822 + }, + { + "epoch": 0.89, + "grad_norm": 1.4446378110074087, + "learning_rate": 2.9169150732710316e-07, + "loss": 0.2586, + "step": 30823 + }, + { + "epoch": 0.89, + "grad_norm": 2.3299344555385284, + "learning_rate": 2.915334428687233e-07, + "loss": 0.2845, + "step": 30824 + }, + { + "epoch": 0.89, + "grad_norm": 1.3758894532289716, + "learning_rate": 2.913754199626134e-07, + "loss": 0.3025, + "step": 30825 + }, + { + "epoch": 0.89, + "grad_norm": 1.6360669283753657, + "learning_rate": 2.912174386101679e-07, + "loss": 0.2612, + "step": 30826 + }, + { + "epoch": 0.89, + "grad_norm": 1.3320328048095247, + "learning_rate": 2.9105949881278126e-07, + "loss": 0.26, + "step": 30827 + }, + { + "epoch": 0.89, + "grad_norm": 0.9722723322659528, + "learning_rate": 2.9090160057184626e-07, + "loss": 0.5627, + "step": 30828 + }, + { + "epoch": 0.89, + "grad_norm": 1.284858289141724, + "learning_rate": 2.907437438887567e-07, + "loss": 0.2524, + "step": 30829 + }, + { + "epoch": 0.89, + "grad_norm": 1.2985979008443378, + "learning_rate": 2.9058592876490544e-07, + "loss": 0.2647, + "step": 30830 + }, + { + "epoch": 0.89, + "grad_norm": 1.6494024548572916, + "learning_rate": 2.9042815520168634e-07, + "loss": 0.2801, + "step": 30831 + }, + { + "epoch": 0.89, + "grad_norm": 1.2893989660691052, + "learning_rate": 2.902704232004905e-07, + "loss": 0.2976, + "step": 30832 + }, + { + "epoch": 0.89, + "grad_norm": 1.619007092784779, + "learning_rate": 2.9011273276271066e-07, + "loss": 0.2805, + "step": 30833 + }, + { + "epoch": 0.89, + "grad_norm": 2.835751049779358, + "learning_rate": 2.89955083889738e-07, + "loss": 0.2982, + "step": 30834 + }, + { + "epoch": 0.89, + "grad_norm": 1.4551976605988823, + "learning_rate": 2.8979747658296367e-07, + "loss": 0.2828, + "step": 30835 + }, + { + "epoch": 0.89, + "grad_norm": 1.3476820906172526, + "learning_rate": 2.8963991084378027e-07, + "loss": 0.2638, + "step": 30836 + }, + { + "epoch": 0.89, + "grad_norm": 1.2876867606543028, + "learning_rate": 2.8948238667357574e-07, + "loss": 0.2649, + "step": 30837 + }, + { + "epoch": 0.89, + "grad_norm": 1.3231682766450854, + "learning_rate": 2.8932490407374227e-07, + "loss": 0.2586, + "step": 30838 + }, + { + "epoch": 0.89, + "grad_norm": 1.775707990449756, + "learning_rate": 2.891674630456681e-07, + "loss": 0.2835, + "step": 30839 + }, + { + "epoch": 0.89, + "grad_norm": 1.2754760586269294, + "learning_rate": 2.8901006359074336e-07, + "loss": 0.2436, + "step": 30840 + }, + { + "epoch": 0.89, + "grad_norm": 1.4600689594439547, + "learning_rate": 2.8885270571035684e-07, + "loss": 0.2654, + "step": 30841 + }, + { + "epoch": 0.89, + "grad_norm": 0.918055050671701, + "learning_rate": 2.88695389405898e-07, + "loss": 0.5134, + "step": 30842 + }, + { + "epoch": 0.89, + "grad_norm": 1.3143521233125939, + "learning_rate": 2.8853811467875413e-07, + "loss": 0.2633, + "step": 30843 + }, + { + "epoch": 0.89, + "grad_norm": 1.6126950787847683, + "learning_rate": 2.883808815303135e-07, + "loss": 0.2778, + "step": 30844 + }, + { + "epoch": 0.89, + "grad_norm": 0.922634998764464, + "learning_rate": 2.882236899619645e-07, + "loss": 0.5438, + "step": 30845 + }, + { + "epoch": 0.89, + "grad_norm": 1.6211125014404582, + "learning_rate": 2.880665399750937e-07, + "loss": 0.2696, + "step": 30846 + }, + { + "epoch": 0.89, + "grad_norm": 1.7112167622095935, + "learning_rate": 2.8790943157108787e-07, + "loss": 0.2483, + "step": 30847 + }, + { + "epoch": 0.89, + "grad_norm": 1.2595218543247193, + "learning_rate": 2.8775236475133307e-07, + "loss": 0.2747, + "step": 30848 + }, + { + "epoch": 0.89, + "grad_norm": 1.39006744039808, + "learning_rate": 2.875953395172165e-07, + "loss": 0.2489, + "step": 30849 + }, + { + "epoch": 0.89, + "grad_norm": 2.087158022034908, + "learning_rate": 2.874383558701227e-07, + "loss": 0.2913, + "step": 30850 + }, + { + "epoch": 0.89, + "grad_norm": 1.226975876857697, + "learning_rate": 2.8728141381143825e-07, + "loss": 0.3148, + "step": 30851 + }, + { + "epoch": 0.89, + "grad_norm": 1.3810471434918261, + "learning_rate": 2.871245133425471e-07, + "loss": 0.2855, + "step": 30852 + }, + { + "epoch": 0.89, + "grad_norm": 1.3937294719290345, + "learning_rate": 2.869676544648348e-07, + "loss": 0.2815, + "step": 30853 + }, + { + "epoch": 0.89, + "grad_norm": 1.2663147772155487, + "learning_rate": 2.8681083717968574e-07, + "loss": 0.2633, + "step": 30854 + }, + { + "epoch": 0.89, + "grad_norm": 1.5170522031658011, + "learning_rate": 2.8665406148848274e-07, + "loss": 0.2768, + "step": 30855 + }, + { + "epoch": 0.89, + "grad_norm": 1.3634051085815262, + "learning_rate": 2.864973273926097e-07, + "loss": 0.2643, + "step": 30856 + }, + { + "epoch": 0.9, + "grad_norm": 1.5784836829205247, + "learning_rate": 2.8634063489344996e-07, + "loss": 0.2777, + "step": 30857 + }, + { + "epoch": 0.9, + "grad_norm": 1.3587417695852193, + "learning_rate": 2.861839839923869e-07, + "loss": 0.2782, + "step": 30858 + }, + { + "epoch": 0.9, + "grad_norm": 1.530781715309854, + "learning_rate": 2.860273746908021e-07, + "loss": 0.2707, + "step": 30859 + }, + { + "epoch": 0.9, + "grad_norm": 1.4613199922203541, + "learning_rate": 2.858708069900784e-07, + "loss": 0.2565, + "step": 30860 + }, + { + "epoch": 0.9, + "grad_norm": 1.2802345268083855, + "learning_rate": 2.8571428089159694e-07, + "loss": 0.2682, + "step": 30861 + }, + { + "epoch": 0.9, + "grad_norm": 1.3300782676240526, + "learning_rate": 2.8555779639673877e-07, + "loss": 0.2741, + "step": 30862 + }, + { + "epoch": 0.9, + "grad_norm": 1.3418761673852986, + "learning_rate": 2.854013535068867e-07, + "loss": 0.2604, + "step": 30863 + }, + { + "epoch": 0.9, + "grad_norm": 1.8285570405016935, + "learning_rate": 2.8524495222341907e-07, + "loss": 0.2763, + "step": 30864 + }, + { + "epoch": 0.9, + "grad_norm": 1.5644721593126003, + "learning_rate": 2.85088592547717e-07, + "loss": 0.261, + "step": 30865 + }, + { + "epoch": 0.9, + "grad_norm": 1.5313217193743993, + "learning_rate": 2.8493227448116113e-07, + "loss": 0.2507, + "step": 30866 + }, + { + "epoch": 0.9, + "grad_norm": 1.2573777137946576, + "learning_rate": 2.847759980251291e-07, + "loss": 0.2883, + "step": 30867 + }, + { + "epoch": 0.9, + "grad_norm": 1.6729343815548512, + "learning_rate": 2.846197631810016e-07, + "loss": 0.2851, + "step": 30868 + }, + { + "epoch": 0.9, + "grad_norm": 1.257971455898369, + "learning_rate": 2.8446356995015743e-07, + "loss": 0.2752, + "step": 30869 + }, + { + "epoch": 0.9, + "grad_norm": 1.2784278903477748, + "learning_rate": 2.843074183339739e-07, + "loss": 0.257, + "step": 30870 + }, + { + "epoch": 0.9, + "grad_norm": 1.2802696792452626, + "learning_rate": 2.8415130833382987e-07, + "loss": 0.2526, + "step": 30871 + }, + { + "epoch": 0.9, + "grad_norm": 1.5163003938566944, + "learning_rate": 2.839952399511031e-07, + "loss": 0.2521, + "step": 30872 + }, + { + "epoch": 0.9, + "grad_norm": 1.2494577399265405, + "learning_rate": 2.8383921318717145e-07, + "loss": 0.273, + "step": 30873 + }, + { + "epoch": 0.9, + "grad_norm": 1.4305980263866764, + "learning_rate": 2.836832280434099e-07, + "loss": 0.2475, + "step": 30874 + }, + { + "epoch": 0.9, + "grad_norm": 1.3128907401987082, + "learning_rate": 2.8352728452119615e-07, + "loss": 0.2571, + "step": 30875 + }, + { + "epoch": 0.9, + "grad_norm": 1.561254122110984, + "learning_rate": 2.83371382621907e-07, + "loss": 0.2531, + "step": 30876 + }, + { + "epoch": 0.9, + "grad_norm": 1.92055943805732, + "learning_rate": 2.832155223469174e-07, + "loss": 0.2659, + "step": 30877 + }, + { + "epoch": 0.9, + "grad_norm": 1.502276254348197, + "learning_rate": 2.830597036976029e-07, + "loss": 0.2878, + "step": 30878 + }, + { + "epoch": 0.9, + "grad_norm": 1.623325499993585, + "learning_rate": 2.829039266753392e-07, + "loss": 0.2577, + "step": 30879 + }, + { + "epoch": 0.9, + "grad_norm": 1.5712487418427799, + "learning_rate": 2.8274819128150056e-07, + "loss": 0.2878, + "step": 30880 + }, + { + "epoch": 0.9, + "grad_norm": 1.3457443734839118, + "learning_rate": 2.8259249751746153e-07, + "loss": 0.27, + "step": 30881 + }, + { + "epoch": 0.9, + "grad_norm": 1.3362121969505765, + "learning_rate": 2.824368453845966e-07, + "loss": 0.276, + "step": 30882 + }, + { + "epoch": 0.9, + "grad_norm": 1.3570185652217792, + "learning_rate": 2.822812348842779e-07, + "loss": 0.2658, + "step": 30883 + }, + { + "epoch": 0.9, + "grad_norm": 1.32239159059501, + "learning_rate": 2.8212566601788007e-07, + "loss": 0.2644, + "step": 30884 + }, + { + "epoch": 0.9, + "grad_norm": 1.4772113721681086, + "learning_rate": 2.819701387867757e-07, + "loss": 0.268, + "step": 30885 + }, + { + "epoch": 0.9, + "grad_norm": 1.5021987633975828, + "learning_rate": 2.8181465319233715e-07, + "loss": 0.2662, + "step": 30886 + }, + { + "epoch": 0.9, + "grad_norm": 1.2419061430460367, + "learning_rate": 2.8165920923593604e-07, + "loss": 0.2648, + "step": 30887 + }, + { + "epoch": 0.9, + "grad_norm": 1.44434223323535, + "learning_rate": 2.815038069189452e-07, + "loss": 0.252, + "step": 30888 + }, + { + "epoch": 0.9, + "grad_norm": 1.406586706621402, + "learning_rate": 2.813484462427357e-07, + "loss": 0.2707, + "step": 30889 + }, + { + "epoch": 0.9, + "grad_norm": 1.6142520592917045, + "learning_rate": 2.8119312720867875e-07, + "loss": 0.2836, + "step": 30890 + }, + { + "epoch": 0.9, + "grad_norm": 1.6039489847534627, + "learning_rate": 2.8103784981814485e-07, + "loss": 0.2704, + "step": 30891 + }, + { + "epoch": 0.9, + "grad_norm": 1.4473640840453372, + "learning_rate": 2.808826140725035e-07, + "loss": 0.2696, + "step": 30892 + }, + { + "epoch": 0.9, + "grad_norm": 1.5116837388768165, + "learning_rate": 2.807274199731258e-07, + "loss": 0.2879, + "step": 30893 + }, + { + "epoch": 0.9, + "grad_norm": 1.3612331922579575, + "learning_rate": 2.8057226752138176e-07, + "loss": 0.2928, + "step": 30894 + }, + { + "epoch": 0.9, + "grad_norm": 3.021053642876099, + "learning_rate": 2.8041715671863924e-07, + "loss": 0.271, + "step": 30895 + }, + { + "epoch": 0.9, + "grad_norm": 1.3045218612957505, + "learning_rate": 2.8026208756626704e-07, + "loss": 0.2969, + "step": 30896 + }, + { + "epoch": 0.9, + "grad_norm": 1.5006914493435188, + "learning_rate": 2.801070600656347e-07, + "loss": 0.2639, + "step": 30897 + }, + { + "epoch": 0.9, + "grad_norm": 1.5084287913679517, + "learning_rate": 2.7995207421811e-07, + "loss": 0.2467, + "step": 30898 + }, + { + "epoch": 0.9, + "grad_norm": 1.2041319859172919, + "learning_rate": 2.7979713002506013e-07, + "loss": 0.2629, + "step": 30899 + }, + { + "epoch": 0.9, + "grad_norm": 1.5386358528943578, + "learning_rate": 2.796422274878535e-07, + "loss": 0.2706, + "step": 30900 + }, + { + "epoch": 0.9, + "grad_norm": 1.4705920403856017, + "learning_rate": 2.7948736660785736e-07, + "loss": 0.277, + "step": 30901 + }, + { + "epoch": 0.9, + "grad_norm": 0.9878058108220148, + "learning_rate": 2.793325473864367e-07, + "loss": 0.5776, + "step": 30902 + }, + { + "epoch": 0.9, + "grad_norm": 1.4806467027243881, + "learning_rate": 2.7917776982495815e-07, + "loss": 0.265, + "step": 30903 + }, + { + "epoch": 0.9, + "grad_norm": 1.3297989196430513, + "learning_rate": 2.7902303392478846e-07, + "loss": 0.2699, + "step": 30904 + }, + { + "epoch": 0.9, + "grad_norm": 1.6046165057790895, + "learning_rate": 2.788683396872932e-07, + "loss": 0.2916, + "step": 30905 + }, + { + "epoch": 0.9, + "grad_norm": 1.9208897144534196, + "learning_rate": 2.7871368711383685e-07, + "loss": 0.2664, + "step": 30906 + }, + { + "epoch": 0.9, + "grad_norm": 1.8572067248526842, + "learning_rate": 2.785590762057849e-07, + "loss": 0.2998, + "step": 30907 + }, + { + "epoch": 0.9, + "grad_norm": 1.4496278415312904, + "learning_rate": 2.784045069645014e-07, + "loss": 0.2765, + "step": 30908 + }, + { + "epoch": 0.9, + "grad_norm": 1.2703965510883366, + "learning_rate": 2.782499793913512e-07, + "loss": 0.2654, + "step": 30909 + }, + { + "epoch": 0.9, + "grad_norm": 1.3765043138218658, + "learning_rate": 2.7809549348769615e-07, + "loss": 0.2628, + "step": 30910 + }, + { + "epoch": 0.9, + "grad_norm": 1.3229855826378367, + "learning_rate": 2.779410492549012e-07, + "loss": 0.2697, + "step": 30911 + }, + { + "epoch": 0.9, + "grad_norm": 1.6444276502098472, + "learning_rate": 2.7778664669432906e-07, + "loss": 0.2763, + "step": 30912 + }, + { + "epoch": 0.9, + "grad_norm": 1.4921188626551207, + "learning_rate": 2.776322858073416e-07, + "loss": 0.2655, + "step": 30913 + }, + { + "epoch": 0.9, + "grad_norm": 1.8308333747897065, + "learning_rate": 2.77477966595302e-07, + "loss": 0.2686, + "step": 30914 + }, + { + "epoch": 0.9, + "grad_norm": 1.25692038443497, + "learning_rate": 2.773236890595715e-07, + "loss": 0.2528, + "step": 30915 + }, + { + "epoch": 0.9, + "grad_norm": 1.3380284502034725, + "learning_rate": 2.7716945320151233e-07, + "loss": 0.2685, + "step": 30916 + }, + { + "epoch": 0.9, + "grad_norm": 1.3957852163789064, + "learning_rate": 2.7701525902248447e-07, + "loss": 0.2814, + "step": 30917 + }, + { + "epoch": 0.9, + "grad_norm": 1.261672270867301, + "learning_rate": 2.768611065238502e-07, + "loss": 0.2787, + "step": 30918 + }, + { + "epoch": 0.9, + "grad_norm": 1.5470941374536002, + "learning_rate": 2.767069957069685e-07, + "loss": 0.2665, + "step": 30919 + }, + { + "epoch": 0.9, + "grad_norm": 1.7424431716048698, + "learning_rate": 2.765529265732003e-07, + "loss": 0.2549, + "step": 30920 + }, + { + "epoch": 0.9, + "grad_norm": 1.517342803938698, + "learning_rate": 2.763988991239047e-07, + "loss": 0.2596, + "step": 30921 + }, + { + "epoch": 0.9, + "grad_norm": 1.341427492413919, + "learning_rate": 2.7624491336044055e-07, + "loss": 0.2881, + "step": 30922 + }, + { + "epoch": 0.9, + "grad_norm": 1.294949648269074, + "learning_rate": 2.76090969284169e-07, + "loss": 0.2696, + "step": 30923 + }, + { + "epoch": 0.9, + "grad_norm": 1.7674363932779054, + "learning_rate": 2.7593706689644617e-07, + "loss": 0.2665, + "step": 30924 + }, + { + "epoch": 0.9, + "grad_norm": 1.4015440734457114, + "learning_rate": 2.7578320619863095e-07, + "loss": 0.2704, + "step": 30925 + }, + { + "epoch": 0.9, + "grad_norm": 1.4137714067210116, + "learning_rate": 2.756293871920812e-07, + "loss": 0.2528, + "step": 30926 + }, + { + "epoch": 0.9, + "grad_norm": 1.3063208985634047, + "learning_rate": 2.7547560987815467e-07, + "loss": 0.2748, + "step": 30927 + }, + { + "epoch": 0.9, + "grad_norm": 1.2371013934273674, + "learning_rate": 2.7532187425820923e-07, + "loss": 0.2913, + "step": 30928 + }, + { + "epoch": 0.9, + "grad_norm": 1.6556256479617495, + "learning_rate": 2.751681803335998e-07, + "loss": 0.2793, + "step": 30929 + }, + { + "epoch": 0.9, + "grad_norm": 1.7620314539113322, + "learning_rate": 2.7501452810568376e-07, + "loss": 0.276, + "step": 30930 + }, + { + "epoch": 0.9, + "grad_norm": 1.2244851428835222, + "learning_rate": 2.7486091757581655e-07, + "loss": 0.2703, + "step": 30931 + }, + { + "epoch": 0.9, + "grad_norm": 1.4521287115688313, + "learning_rate": 2.7470734874535444e-07, + "loss": 0.2649, + "step": 30932 + }, + { + "epoch": 0.9, + "grad_norm": 1.6006541602785487, + "learning_rate": 2.745538216156518e-07, + "loss": 0.2825, + "step": 30933 + }, + { + "epoch": 0.9, + "grad_norm": 1.9164542134013423, + "learning_rate": 2.7440033618806484e-07, + "loss": 0.2746, + "step": 30934 + }, + { + "epoch": 0.9, + "grad_norm": 1.27428666874765, + "learning_rate": 2.7424689246394685e-07, + "loss": 0.2751, + "step": 30935 + }, + { + "epoch": 0.9, + "grad_norm": 2.24909027386893, + "learning_rate": 2.7409349044465296e-07, + "loss": 0.2563, + "step": 30936 + }, + { + "epoch": 0.9, + "grad_norm": 1.505479745229746, + "learning_rate": 2.73940130131537e-07, + "loss": 0.2879, + "step": 30937 + }, + { + "epoch": 0.9, + "grad_norm": 1.2386518935758233, + "learning_rate": 2.7378681152595065e-07, + "loss": 0.2646, + "step": 30938 + }, + { + "epoch": 0.9, + "grad_norm": 1.5420289725081089, + "learning_rate": 2.7363353462924845e-07, + "loss": 0.2706, + "step": 30939 + }, + { + "epoch": 0.9, + "grad_norm": 1.338844797986054, + "learning_rate": 2.734802994427832e-07, + "loss": 0.2655, + "step": 30940 + }, + { + "epoch": 0.9, + "grad_norm": 1.3890408141589115, + "learning_rate": 2.7332710596790604e-07, + "loss": 0.2632, + "step": 30941 + }, + { + "epoch": 0.9, + "grad_norm": 1.2695650816101907, + "learning_rate": 2.731739542059697e-07, + "loss": 0.2792, + "step": 30942 + }, + { + "epoch": 0.9, + "grad_norm": 1.7562945240824053, + "learning_rate": 2.7302084415832595e-07, + "loss": 0.2859, + "step": 30943 + }, + { + "epoch": 0.9, + "grad_norm": 1.4498847435905549, + "learning_rate": 2.7286777582632597e-07, + "loss": 0.2434, + "step": 30944 + }, + { + "epoch": 0.9, + "grad_norm": 1.6485598509975696, + "learning_rate": 2.7271474921132025e-07, + "loss": 0.2653, + "step": 30945 + }, + { + "epoch": 0.9, + "grad_norm": 1.481795956907532, + "learning_rate": 2.7256176431465886e-07, + "loss": 0.2921, + "step": 30946 + }, + { + "epoch": 0.9, + "grad_norm": 1.255932925187593, + "learning_rate": 2.724088211376924e-07, + "loss": 0.2677, + "step": 30947 + }, + { + "epoch": 0.9, + "grad_norm": 1.3344612663798592, + "learning_rate": 2.722559196817709e-07, + "loss": 0.2677, + "step": 30948 + }, + { + "epoch": 0.9, + "grad_norm": 1.3849318825788015, + "learning_rate": 2.7210305994824273e-07, + "loss": 0.2571, + "step": 30949 + }, + { + "epoch": 0.9, + "grad_norm": 0.9840024952127551, + "learning_rate": 2.719502419384579e-07, + "loss": 0.5451, + "step": 30950 + }, + { + "epoch": 0.9, + "grad_norm": 1.462683107804251, + "learning_rate": 2.717974656537653e-07, + "loss": 0.2712, + "step": 30951 + }, + { + "epoch": 0.9, + "grad_norm": 1.301153933574943, + "learning_rate": 2.716447310955117e-07, + "loss": 0.2784, + "step": 30952 + }, + { + "epoch": 0.9, + "grad_norm": 1.2985389725941945, + "learning_rate": 2.71492038265046e-07, + "loss": 0.2805, + "step": 30953 + }, + { + "epoch": 0.9, + "grad_norm": 1.2464791726493363, + "learning_rate": 2.713393871637154e-07, + "loss": 0.2704, + "step": 30954 + }, + { + "epoch": 0.9, + "grad_norm": 1.3079150715189454, + "learning_rate": 2.711867777928673e-07, + "loss": 0.2533, + "step": 30955 + }, + { + "epoch": 0.9, + "grad_norm": 1.33002104758925, + "learning_rate": 2.710342101538488e-07, + "loss": 0.2834, + "step": 30956 + }, + { + "epoch": 0.9, + "grad_norm": 1.4226121418171125, + "learning_rate": 2.7088168424800497e-07, + "loss": 0.2632, + "step": 30957 + }, + { + "epoch": 0.9, + "grad_norm": 1.3990295937868757, + "learning_rate": 2.707292000766831e-07, + "loss": 0.2453, + "step": 30958 + }, + { + "epoch": 0.9, + "grad_norm": 1.2639900251230705, + "learning_rate": 2.7057675764122826e-07, + "loss": 0.3094, + "step": 30959 + }, + { + "epoch": 0.9, + "grad_norm": 1.436310808215828, + "learning_rate": 2.704243569429865e-07, + "loss": 0.2861, + "step": 30960 + }, + { + "epoch": 0.9, + "grad_norm": 1.3960789427225837, + "learning_rate": 2.7027199798330185e-07, + "loss": 0.2579, + "step": 30961 + }, + { + "epoch": 0.9, + "grad_norm": 1.392864520958891, + "learning_rate": 2.7011968076351925e-07, + "loss": 0.2659, + "step": 30962 + }, + { + "epoch": 0.9, + "grad_norm": 1.358450893958692, + "learning_rate": 2.699674052849832e-07, + "loss": 0.2696, + "step": 30963 + }, + { + "epoch": 0.9, + "grad_norm": 1.414930654622894, + "learning_rate": 2.6981517154903767e-07, + "loss": 0.2558, + "step": 30964 + }, + { + "epoch": 0.9, + "grad_norm": 1.3643560310381508, + "learning_rate": 2.696629795570255e-07, + "loss": 0.2742, + "step": 30965 + }, + { + "epoch": 0.9, + "grad_norm": 1.5116994320249488, + "learning_rate": 2.6951082931028937e-07, + "loss": 0.2541, + "step": 30966 + }, + { + "epoch": 0.9, + "grad_norm": 1.4248479806398096, + "learning_rate": 2.6935872081017276e-07, + "loss": 0.2723, + "step": 30967 + }, + { + "epoch": 0.9, + "grad_norm": 1.2671927480474128, + "learning_rate": 2.6920665405801847e-07, + "loss": 0.2796, + "step": 30968 + }, + { + "epoch": 0.9, + "grad_norm": 1.5210123628664451, + "learning_rate": 2.690546290551677e-07, + "loss": 0.2666, + "step": 30969 + }, + { + "epoch": 0.9, + "grad_norm": 3.194736485899018, + "learning_rate": 2.68902645802962e-07, + "loss": 0.2966, + "step": 30970 + }, + { + "epoch": 0.9, + "grad_norm": 1.3716036505659885, + "learning_rate": 2.687507043027432e-07, + "loss": 0.28, + "step": 30971 + }, + { + "epoch": 0.9, + "grad_norm": 1.3008129088003113, + "learning_rate": 2.6859880455585183e-07, + "loss": 0.2971, + "step": 30972 + }, + { + "epoch": 0.9, + "grad_norm": 1.356350044470618, + "learning_rate": 2.6844694656362915e-07, + "loss": 0.2742, + "step": 30973 + }, + { + "epoch": 0.9, + "grad_norm": 1.2954095661770044, + "learning_rate": 2.682951303274139e-07, + "loss": 0.2757, + "step": 30974 + }, + { + "epoch": 0.9, + "grad_norm": 1.444576844489184, + "learning_rate": 2.6814335584854687e-07, + "loss": 0.2726, + "step": 30975 + }, + { + "epoch": 0.9, + "grad_norm": 1.267637251593074, + "learning_rate": 2.679916231283669e-07, + "loss": 0.264, + "step": 30976 + }, + { + "epoch": 0.9, + "grad_norm": 1.2690487653147353, + "learning_rate": 2.6783993216821345e-07, + "loss": 0.2759, + "step": 30977 + }, + { + "epoch": 0.9, + "grad_norm": 1.5518733419330388, + "learning_rate": 2.676882829694255e-07, + "loss": 0.284, + "step": 30978 + }, + { + "epoch": 0.9, + "grad_norm": 2.5166689289813022, + "learning_rate": 2.6753667553334137e-07, + "loss": 0.2794, + "step": 30979 + }, + { + "epoch": 0.9, + "grad_norm": 0.9814554678625398, + "learning_rate": 2.6738510986129784e-07, + "loss": 0.5771, + "step": 30980 + }, + { + "epoch": 0.9, + "grad_norm": 1.3757793758379475, + "learning_rate": 2.672335859546332e-07, + "loss": 0.294, + "step": 30981 + }, + { + "epoch": 0.9, + "grad_norm": 1.2791528101599343, + "learning_rate": 2.670821038146848e-07, + "loss": 0.2828, + "step": 30982 + }, + { + "epoch": 0.9, + "grad_norm": 1.8784886473048494, + "learning_rate": 2.6693066344278985e-07, + "loss": 0.2717, + "step": 30983 + }, + { + "epoch": 0.9, + "grad_norm": 1.465557090578141, + "learning_rate": 2.667792648402834e-07, + "loss": 0.2674, + "step": 30984 + }, + { + "epoch": 0.9, + "grad_norm": 1.3363918263148684, + "learning_rate": 2.666279080085027e-07, + "loss": 0.2812, + "step": 30985 + }, + { + "epoch": 0.9, + "grad_norm": 1.6597251830675201, + "learning_rate": 2.664765929487834e-07, + "loss": 0.2667, + "step": 30986 + }, + { + "epoch": 0.9, + "grad_norm": 1.3860853669565074, + "learning_rate": 2.6632531966246043e-07, + "loss": 0.2886, + "step": 30987 + }, + { + "epoch": 0.9, + "grad_norm": 1.3657839804263987, + "learning_rate": 2.6617408815086897e-07, + "loss": 0.2549, + "step": 30988 + }, + { + "epoch": 0.9, + "grad_norm": 1.3433550337868527, + "learning_rate": 2.66022898415344e-07, + "loss": 0.2623, + "step": 30989 + }, + { + "epoch": 0.9, + "grad_norm": 1.2797349188461702, + "learning_rate": 2.6587175045721947e-07, + "loss": 0.2826, + "step": 30990 + }, + { + "epoch": 0.9, + "grad_norm": 1.3719835936324563, + "learning_rate": 2.6572064427782875e-07, + "loss": 0.2687, + "step": 30991 + }, + { + "epoch": 0.9, + "grad_norm": 3.1936896726835533, + "learning_rate": 2.6556957987850686e-07, + "loss": 0.3056, + "step": 30992 + }, + { + "epoch": 0.9, + "grad_norm": 1.4515774732740263, + "learning_rate": 2.654185572605855e-07, + "loss": 0.2608, + "step": 30993 + }, + { + "epoch": 0.9, + "grad_norm": 1.2847672622651312, + "learning_rate": 2.6526757642539756e-07, + "loss": 0.2716, + "step": 30994 + }, + { + "epoch": 0.9, + "grad_norm": 1.363483540523586, + "learning_rate": 2.6511663737427587e-07, + "loss": 0.2785, + "step": 30995 + }, + { + "epoch": 0.9, + "grad_norm": 1.2427215948894208, + "learning_rate": 2.649657401085526e-07, + "loss": 0.2581, + "step": 30996 + }, + { + "epoch": 0.9, + "grad_norm": 1.29131615492936, + "learning_rate": 2.6481488462955897e-07, + "loss": 0.2823, + "step": 30997 + }, + { + "epoch": 0.9, + "grad_norm": 1.3854196384587103, + "learning_rate": 2.646640709386267e-07, + "loss": 0.279, + "step": 30998 + }, + { + "epoch": 0.9, + "grad_norm": 1.2994743773709725, + "learning_rate": 2.645132990370863e-07, + "loss": 0.2705, + "step": 30999 + }, + { + "epoch": 0.9, + "grad_norm": 2.098805465272102, + "learning_rate": 2.6436256892626967e-07, + "loss": 0.2773, + "step": 31000 + }, + { + "epoch": 0.9, + "grad_norm": 1.416133194800855, + "learning_rate": 2.6421188060750447e-07, + "loss": 0.2715, + "step": 31001 + }, + { + "epoch": 0.9, + "grad_norm": 1.881125492254555, + "learning_rate": 2.6406123408212246e-07, + "loss": 0.2608, + "step": 31002 + }, + { + "epoch": 0.9, + "grad_norm": 1.5900893999569143, + "learning_rate": 2.639106293514526e-07, + "loss": 0.264, + "step": 31003 + }, + { + "epoch": 0.9, + "grad_norm": 1.2619849537186727, + "learning_rate": 2.6376006641682386e-07, + "loss": 0.2579, + "step": 31004 + }, + { + "epoch": 0.9, + "grad_norm": 1.5233680881793377, + "learning_rate": 2.636095452795651e-07, + "loss": 0.2701, + "step": 31005 + }, + { + "epoch": 0.9, + "grad_norm": 1.6177435649974932, + "learning_rate": 2.634590659410047e-07, + "loss": 0.2985, + "step": 31006 + }, + { + "epoch": 0.9, + "grad_norm": 1.2026871540338337, + "learning_rate": 2.633086284024711e-07, + "loss": 0.2627, + "step": 31007 + }, + { + "epoch": 0.9, + "grad_norm": 1.2799442598835504, + "learning_rate": 2.6315823266529094e-07, + "loss": 0.2575, + "step": 31008 + }, + { + "epoch": 0.9, + "grad_norm": 1.383356946204468, + "learning_rate": 2.6300787873079213e-07, + "loss": 0.2494, + "step": 31009 + }, + { + "epoch": 0.9, + "grad_norm": 1.486941571853985, + "learning_rate": 2.628575666003008e-07, + "loss": 0.2856, + "step": 31010 + }, + { + "epoch": 0.9, + "grad_norm": 1.3411880146177357, + "learning_rate": 2.627072962751448e-07, + "loss": 0.2808, + "step": 31011 + }, + { + "epoch": 0.9, + "grad_norm": 1.3327006247373616, + "learning_rate": 2.625570677566491e-07, + "loss": 0.2711, + "step": 31012 + }, + { + "epoch": 0.9, + "grad_norm": 2.2166358396721497, + "learning_rate": 2.6240688104614e-07, + "loss": 0.2742, + "step": 31013 + }, + { + "epoch": 0.9, + "grad_norm": 1.3474940339951282, + "learning_rate": 2.6225673614494183e-07, + "loss": 0.2589, + "step": 31014 + }, + { + "epoch": 0.9, + "grad_norm": 1.4309472204607712, + "learning_rate": 2.6210663305438145e-07, + "loss": 0.2512, + "step": 31015 + }, + { + "epoch": 0.9, + "grad_norm": 1.660436611630114, + "learning_rate": 2.619565717757822e-07, + "loss": 0.2575, + "step": 31016 + }, + { + "epoch": 0.9, + "grad_norm": 1.702664693122909, + "learning_rate": 2.6180655231046846e-07, + "loss": 0.2694, + "step": 31017 + }, + { + "epoch": 0.9, + "grad_norm": 1.511949571291314, + "learning_rate": 2.616565746597649e-07, + "loss": 0.2507, + "step": 31018 + }, + { + "epoch": 0.9, + "grad_norm": 1.2978676417229047, + "learning_rate": 2.6150663882499537e-07, + "loss": 0.2451, + "step": 31019 + }, + { + "epoch": 0.9, + "grad_norm": 0.9898049031495192, + "learning_rate": 2.6135674480748164e-07, + "loss": 0.5713, + "step": 31020 + }, + { + "epoch": 0.9, + "grad_norm": 1.531542787750657, + "learning_rate": 2.612068926085465e-07, + "loss": 0.2791, + "step": 31021 + }, + { + "epoch": 0.9, + "grad_norm": 1.6841341443181976, + "learning_rate": 2.610570822295139e-07, + "loss": 0.2679, + "step": 31022 + }, + { + "epoch": 0.9, + "grad_norm": 1.4701411890060951, + "learning_rate": 2.60907313671705e-07, + "loss": 0.2662, + "step": 31023 + }, + { + "epoch": 0.9, + "grad_norm": 1.2959679255370602, + "learning_rate": 2.607575869364415e-07, + "loss": 0.2544, + "step": 31024 + }, + { + "epoch": 0.9, + "grad_norm": 1.3973877038562919, + "learning_rate": 2.606079020250446e-07, + "loss": 0.2633, + "step": 31025 + }, + { + "epoch": 0.9, + "grad_norm": 1.4664399301000548, + "learning_rate": 2.6045825893883604e-07, + "loss": 0.2669, + "step": 31026 + }, + { + "epoch": 0.9, + "grad_norm": 1.3091086301570356, + "learning_rate": 2.6030865767913527e-07, + "loss": 0.2543, + "step": 31027 + }, + { + "epoch": 0.9, + "grad_norm": 1.3666482592425364, + "learning_rate": 2.601590982472646e-07, + "loss": 0.2516, + "step": 31028 + }, + { + "epoch": 0.9, + "grad_norm": 1.4461607945788892, + "learning_rate": 2.6000958064454127e-07, + "loss": 0.2602, + "step": 31029 + }, + { + "epoch": 0.9, + "grad_norm": 1.7519780299269563, + "learning_rate": 2.5986010487228597e-07, + "loss": 0.2671, + "step": 31030 + }, + { + "epoch": 0.9, + "grad_norm": 1.2991459244951513, + "learning_rate": 2.597106709318181e-07, + "loss": 0.2691, + "step": 31031 + }, + { + "epoch": 0.9, + "grad_norm": 1.5981848249026989, + "learning_rate": 2.5956127882445614e-07, + "loss": 0.2601, + "step": 31032 + }, + { + "epoch": 0.9, + "grad_norm": 1.6212552621674639, + "learning_rate": 2.594119285515184e-07, + "loss": 0.2643, + "step": 31033 + }, + { + "epoch": 0.9, + "grad_norm": 1.3902234242368283, + "learning_rate": 2.592626201143228e-07, + "loss": 0.2662, + "step": 31034 + }, + { + "epoch": 0.9, + "grad_norm": 1.2898270870145756, + "learning_rate": 2.591133535141882e-07, + "loss": 0.2599, + "step": 31035 + }, + { + "epoch": 0.9, + "grad_norm": 1.413862040491768, + "learning_rate": 2.589641287524297e-07, + "loss": 0.2723, + "step": 31036 + }, + { + "epoch": 0.9, + "grad_norm": 1.379348838655522, + "learning_rate": 2.588149458303657e-07, + "loss": 0.2634, + "step": 31037 + }, + { + "epoch": 0.9, + "grad_norm": 1.3429710328139142, + "learning_rate": 2.5866580474931345e-07, + "loss": 0.2709, + "step": 31038 + }, + { + "epoch": 0.9, + "grad_norm": 1.4560663296823928, + "learning_rate": 2.5851670551058693e-07, + "loss": 0.2764, + "step": 31039 + }, + { + "epoch": 0.9, + "grad_norm": 1.281423869906025, + "learning_rate": 2.5836764811550284e-07, + "loss": 0.2659, + "step": 31040 + }, + { + "epoch": 0.9, + "grad_norm": 1.352248559055408, + "learning_rate": 2.582186325653774e-07, + "loss": 0.2733, + "step": 31041 + }, + { + "epoch": 0.9, + "grad_norm": 1.2728382094357944, + "learning_rate": 2.580696588615245e-07, + "loss": 0.2746, + "step": 31042 + }, + { + "epoch": 0.9, + "grad_norm": 1.266892149282181, + "learning_rate": 2.579207270052597e-07, + "loss": 0.2719, + "step": 31043 + }, + { + "epoch": 0.9, + "grad_norm": 1.3682580644376274, + "learning_rate": 2.5777183699789767e-07, + "loss": 0.2658, + "step": 31044 + }, + { + "epoch": 0.9, + "grad_norm": 1.2689698553010142, + "learning_rate": 2.5762298884075107e-07, + "loss": 0.2755, + "step": 31045 + }, + { + "epoch": 0.9, + "grad_norm": 1.4926306774552123, + "learning_rate": 2.574741825351346e-07, + "loss": 0.2592, + "step": 31046 + }, + { + "epoch": 0.9, + "grad_norm": 1.4967699239439796, + "learning_rate": 2.5732541808236145e-07, + "loss": 0.2716, + "step": 31047 + }, + { + "epoch": 0.9, + "grad_norm": 1.298502018064586, + "learning_rate": 2.57176695483744e-07, + "loss": 0.2624, + "step": 31048 + }, + { + "epoch": 0.9, + "grad_norm": 1.3840908495471003, + "learning_rate": 2.57028014740594e-07, + "loss": 0.2949, + "step": 31049 + }, + { + "epoch": 0.9, + "grad_norm": 1.761688953210471, + "learning_rate": 2.5687937585422486e-07, + "loss": 0.2694, + "step": 31050 + }, + { + "epoch": 0.9, + "grad_norm": 1.3728796081539643, + "learning_rate": 2.5673077882594766e-07, + "loss": 0.2488, + "step": 31051 + }, + { + "epoch": 0.9, + "grad_norm": 1.3250003003498916, + "learning_rate": 2.565822236570742e-07, + "loss": 0.2656, + "step": 31052 + }, + { + "epoch": 0.9, + "grad_norm": 1.3044822187819312, + "learning_rate": 2.564337103489151e-07, + "loss": 0.2643, + "step": 31053 + }, + { + "epoch": 0.9, + "grad_norm": 1.310878973291367, + "learning_rate": 2.5628523890278144e-07, + "loss": 0.2665, + "step": 31054 + }, + { + "epoch": 0.9, + "grad_norm": 1.3102743412350477, + "learning_rate": 2.561368093199834e-07, + "loss": 0.272, + "step": 31055 + }, + { + "epoch": 0.9, + "grad_norm": 1.474664405444928, + "learning_rate": 2.5598842160183046e-07, + "loss": 0.2672, + "step": 31056 + }, + { + "epoch": 0.9, + "grad_norm": 1.425866184006814, + "learning_rate": 2.5584007574963157e-07, + "loss": 0.2491, + "step": 31057 + }, + { + "epoch": 0.9, + "grad_norm": 0.9896877552558281, + "learning_rate": 2.5569177176469727e-07, + "loss": 0.5538, + "step": 31058 + }, + { + "epoch": 0.9, + "grad_norm": 1.2611753314737126, + "learning_rate": 2.5554350964833553e-07, + "loss": 0.2834, + "step": 31059 + }, + { + "epoch": 0.9, + "grad_norm": 1.4263154271292777, + "learning_rate": 2.5539528940185465e-07, + "loss": 0.2644, + "step": 31060 + }, + { + "epoch": 0.9, + "grad_norm": 1.3624624144200637, + "learning_rate": 2.5524711102656366e-07, + "loss": 0.2637, + "step": 31061 + }, + { + "epoch": 0.9, + "grad_norm": 1.4441850201214335, + "learning_rate": 2.550989745237692e-07, + "loss": 0.2764, + "step": 31062 + }, + { + "epoch": 0.9, + "grad_norm": 1.499929259107265, + "learning_rate": 2.549508798947792e-07, + "loss": 0.2745, + "step": 31063 + }, + { + "epoch": 0.9, + "grad_norm": 1.234423653168389, + "learning_rate": 2.5480282714090035e-07, + "loss": 0.2778, + "step": 31064 + }, + { + "epoch": 0.9, + "grad_norm": 1.381972387818581, + "learning_rate": 2.546548162634388e-07, + "loss": 0.2891, + "step": 31065 + }, + { + "epoch": 0.9, + "grad_norm": 4.022139195574847, + "learning_rate": 2.545068472637019e-07, + "loss": 0.2842, + "step": 31066 + }, + { + "epoch": 0.9, + "grad_norm": 1.3749463472820056, + "learning_rate": 2.5435892014299357e-07, + "loss": 0.2648, + "step": 31067 + }, + { + "epoch": 0.9, + "grad_norm": 1.4167953877967274, + "learning_rate": 2.542110349026211e-07, + "loss": 0.244, + "step": 31068 + }, + { + "epoch": 0.9, + "grad_norm": 1.7037032694077436, + "learning_rate": 2.5406319154388845e-07, + "loss": 0.2735, + "step": 31069 + }, + { + "epoch": 0.9, + "grad_norm": 1.2443107619310063, + "learning_rate": 2.539153900681013e-07, + "loss": 0.2469, + "step": 31070 + }, + { + "epoch": 0.9, + "grad_norm": 1.221381206954952, + "learning_rate": 2.537676304765629e-07, + "loss": 0.261, + "step": 31071 + }, + { + "epoch": 0.9, + "grad_norm": 1.3024627895992527, + "learning_rate": 2.5361991277057797e-07, + "loss": 0.2547, + "step": 31072 + }, + { + "epoch": 0.9, + "grad_norm": 1.4105512441978545, + "learning_rate": 2.534722369514503e-07, + "loss": 0.2649, + "step": 31073 + }, + { + "epoch": 0.9, + "grad_norm": 1.6473329000593104, + "learning_rate": 2.5332460302048334e-07, + "loss": 0.2916, + "step": 31074 + }, + { + "epoch": 0.9, + "grad_norm": 1.5403926248407607, + "learning_rate": 2.531770109789783e-07, + "loss": 0.2672, + "step": 31075 + }, + { + "epoch": 0.9, + "grad_norm": 1.784920723846946, + "learning_rate": 2.5302946082823907e-07, + "loss": 0.256, + "step": 31076 + }, + { + "epoch": 0.9, + "grad_norm": 1.4943498791709253, + "learning_rate": 2.5288195256956747e-07, + "loss": 0.2719, + "step": 31077 + }, + { + "epoch": 0.9, + "grad_norm": 1.3391312168381668, + "learning_rate": 2.527344862042658e-07, + "loss": 0.2665, + "step": 31078 + }, + { + "epoch": 0.9, + "grad_norm": 1.2896621074993944, + "learning_rate": 2.5258706173363455e-07, + "loss": 0.2593, + "step": 31079 + }, + { + "epoch": 0.9, + "grad_norm": 1.3162036505631944, + "learning_rate": 2.5243967915897506e-07, + "loss": 0.2575, + "step": 31080 + }, + { + "epoch": 0.9, + "grad_norm": 1.3117920571438526, + "learning_rate": 2.5229233848158784e-07, + "loss": 0.2663, + "step": 31081 + }, + { + "epoch": 0.9, + "grad_norm": 2.082443705046526, + "learning_rate": 2.521450397027742e-07, + "loss": 0.2764, + "step": 31082 + }, + { + "epoch": 0.9, + "grad_norm": 1.3306910413607467, + "learning_rate": 2.519977828238329e-07, + "loss": 0.2413, + "step": 31083 + }, + { + "epoch": 0.9, + "grad_norm": 1.273899981207371, + "learning_rate": 2.518505678460642e-07, + "loss": 0.2611, + "step": 31084 + }, + { + "epoch": 0.9, + "grad_norm": 1.368468225878293, + "learning_rate": 2.5170339477076646e-07, + "loss": 0.2608, + "step": 31085 + }, + { + "epoch": 0.9, + "grad_norm": 1.6017136366437823, + "learning_rate": 2.5155626359923855e-07, + "loss": 0.2673, + "step": 31086 + }, + { + "epoch": 0.9, + "grad_norm": 1.8022634508409585, + "learning_rate": 2.5140917433277957e-07, + "loss": 0.2849, + "step": 31087 + }, + { + "epoch": 0.9, + "grad_norm": 3.656851652169875, + "learning_rate": 2.5126212697268724e-07, + "loss": 0.3133, + "step": 31088 + }, + { + "epoch": 0.9, + "grad_norm": 1.3368946985871546, + "learning_rate": 2.511151215202595e-07, + "loss": 0.2428, + "step": 31089 + }, + { + "epoch": 0.9, + "grad_norm": 1.3118597930322375, + "learning_rate": 2.5096815797679364e-07, + "loss": 0.2764, + "step": 31090 + }, + { + "epoch": 0.9, + "grad_norm": 1.29718380440787, + "learning_rate": 2.508212363435869e-07, + "loss": 0.2774, + "step": 31091 + }, + { + "epoch": 0.9, + "grad_norm": 1.290626732459249, + "learning_rate": 2.50674356621935e-07, + "loss": 0.2651, + "step": 31092 + }, + { + "epoch": 0.9, + "grad_norm": 1.3103375774344035, + "learning_rate": 2.5052751881313506e-07, + "loss": 0.2753, + "step": 31093 + }, + { + "epoch": 0.9, + "grad_norm": 1.4010670479430456, + "learning_rate": 2.503807229184818e-07, + "loss": 0.252, + "step": 31094 + }, + { + "epoch": 0.9, + "grad_norm": 1.655958742167057, + "learning_rate": 2.502339689392713e-07, + "loss": 0.2651, + "step": 31095 + }, + { + "epoch": 0.9, + "grad_norm": 1.311531892310813, + "learning_rate": 2.500872568767987e-07, + "loss": 0.2664, + "step": 31096 + }, + { + "epoch": 0.9, + "grad_norm": 1.5739783194535892, + "learning_rate": 2.49940586732359e-07, + "loss": 0.2955, + "step": 31097 + }, + { + "epoch": 0.9, + "grad_norm": 1.2904617871744721, + "learning_rate": 2.4979395850724565e-07, + "loss": 0.2757, + "step": 31098 + }, + { + "epoch": 0.9, + "grad_norm": 1.321516665664081, + "learning_rate": 2.496473722027537e-07, + "loss": 0.2825, + "step": 31099 + }, + { + "epoch": 0.9, + "grad_norm": 1.4329347086526851, + "learning_rate": 2.495008278201766e-07, + "loss": 0.2636, + "step": 31100 + }, + { + "epoch": 0.9, + "grad_norm": 1.3049821510842825, + "learning_rate": 2.493543253608072e-07, + "loss": 0.2834, + "step": 31101 + }, + { + "epoch": 0.9, + "grad_norm": 1.3145926853460426, + "learning_rate": 2.492078648259394e-07, + "loss": 0.2695, + "step": 31102 + }, + { + "epoch": 0.9, + "grad_norm": 1.4036096945221828, + "learning_rate": 2.4906144621686445e-07, + "loss": 0.2627, + "step": 31103 + }, + { + "epoch": 0.9, + "grad_norm": 1.519344574446084, + "learning_rate": 2.4891506953487467e-07, + "loss": 0.2923, + "step": 31104 + }, + { + "epoch": 0.9, + "grad_norm": 1.397269998260128, + "learning_rate": 2.487687347812617e-07, + "loss": 0.2674, + "step": 31105 + }, + { + "epoch": 0.9, + "grad_norm": 1.4851660650432588, + "learning_rate": 2.48622441957318e-07, + "loss": 0.2853, + "step": 31106 + }, + { + "epoch": 0.9, + "grad_norm": 1.289476250314671, + "learning_rate": 2.4847619106433354e-07, + "loss": 0.2642, + "step": 31107 + }, + { + "epoch": 0.9, + "grad_norm": 1.267995805496045, + "learning_rate": 2.483299821035995e-07, + "loss": 0.2645, + "step": 31108 + }, + { + "epoch": 0.9, + "grad_norm": 1.4244266250774413, + "learning_rate": 2.48183815076406e-07, + "loss": 0.2645, + "step": 31109 + }, + { + "epoch": 0.9, + "grad_norm": 1.350540075977169, + "learning_rate": 2.480376899840442e-07, + "loss": 0.2731, + "step": 31110 + }, + { + "epoch": 0.9, + "grad_norm": 1.5461737290461186, + "learning_rate": 2.4789160682780147e-07, + "loss": 0.2834, + "step": 31111 + }, + { + "epoch": 0.9, + "grad_norm": 1.2645029724186332, + "learning_rate": 2.477455656089678e-07, + "loss": 0.2523, + "step": 31112 + }, + { + "epoch": 0.9, + "grad_norm": 1.707723579664751, + "learning_rate": 2.475995663288322e-07, + "loss": 0.2589, + "step": 31113 + }, + { + "epoch": 0.9, + "grad_norm": 1.2617876120181766, + "learning_rate": 2.4745360898868366e-07, + "loss": 0.2804, + "step": 31114 + }, + { + "epoch": 0.9, + "grad_norm": 1.7857394154261579, + "learning_rate": 2.4730769358980943e-07, + "loss": 0.2589, + "step": 31115 + }, + { + "epoch": 0.9, + "grad_norm": 3.7863190493396335, + "learning_rate": 2.4716182013349746e-07, + "loss": 0.3038, + "step": 31116 + }, + { + "epoch": 0.9, + "grad_norm": 1.3325607014327137, + "learning_rate": 2.470159886210355e-07, + "loss": 0.2798, + "step": 31117 + }, + { + "epoch": 0.9, + "grad_norm": 1.3900486021696472, + "learning_rate": 2.468701990537098e-07, + "loss": 0.2549, + "step": 31118 + }, + { + "epoch": 0.9, + "grad_norm": 1.3426575339637339, + "learning_rate": 2.467244514328082e-07, + "loss": 0.2607, + "step": 31119 + }, + { + "epoch": 0.9, + "grad_norm": 1.3718671512762657, + "learning_rate": 2.4657874575961584e-07, + "loss": 0.2711, + "step": 31120 + }, + { + "epoch": 0.9, + "grad_norm": 1.385442144217295, + "learning_rate": 2.464330820354183e-07, + "loss": 0.2619, + "step": 31121 + }, + { + "epoch": 0.9, + "grad_norm": 1.302423611428542, + "learning_rate": 2.4628746026150174e-07, + "loss": 0.2703, + "step": 31122 + }, + { + "epoch": 0.9, + "grad_norm": 1.2978411000598153, + "learning_rate": 2.461418804391508e-07, + "loss": 0.2624, + "step": 31123 + }, + { + "epoch": 0.9, + "grad_norm": 1.566907379771024, + "learning_rate": 2.4599634256965043e-07, + "loss": 0.2823, + "step": 31124 + }, + { + "epoch": 0.9, + "grad_norm": 1.267235066323612, + "learning_rate": 2.4585084665428525e-07, + "loss": 0.2454, + "step": 31125 + }, + { + "epoch": 0.9, + "grad_norm": 1.5760216661670532, + "learning_rate": 2.457053926943392e-07, + "loss": 0.276, + "step": 31126 + }, + { + "epoch": 0.9, + "grad_norm": 1.4601357887336022, + "learning_rate": 2.455599806910952e-07, + "loss": 0.2719, + "step": 31127 + }, + { + "epoch": 0.9, + "grad_norm": 1.2285616703949296, + "learning_rate": 2.454146106458377e-07, + "loss": 0.2514, + "step": 31128 + }, + { + "epoch": 0.9, + "grad_norm": 1.3643964432178326, + "learning_rate": 2.452692825598496e-07, + "loss": 0.246, + "step": 31129 + }, + { + "epoch": 0.9, + "grad_norm": 1.3892694951762006, + "learning_rate": 2.451239964344121e-07, + "loss": 0.2635, + "step": 31130 + }, + { + "epoch": 0.9, + "grad_norm": 1.2721148951940042, + "learning_rate": 2.44978752270808e-07, + "loss": 0.265, + "step": 31131 + }, + { + "epoch": 0.9, + "grad_norm": 1.3112162347265677, + "learning_rate": 2.4483355007031916e-07, + "loss": 0.2631, + "step": 31132 + }, + { + "epoch": 0.9, + "grad_norm": 1.4452241882087022, + "learning_rate": 2.446883898342273e-07, + "loss": 0.2716, + "step": 31133 + }, + { + "epoch": 0.9, + "grad_norm": 1.2573831301919407, + "learning_rate": 2.44543271563813e-07, + "loss": 0.2603, + "step": 31134 + }, + { + "epoch": 0.9, + "grad_norm": 1.6252237969111152, + "learning_rate": 2.443981952603569e-07, + "loss": 0.2727, + "step": 31135 + }, + { + "epoch": 0.9, + "grad_norm": 1.3503221803580157, + "learning_rate": 2.442531609251392e-07, + "loss": 0.2653, + "step": 31136 + }, + { + "epoch": 0.9, + "grad_norm": 1.3368956259627678, + "learning_rate": 2.4410816855944044e-07, + "loss": 0.2749, + "step": 31137 + }, + { + "epoch": 0.9, + "grad_norm": 1.2813059453910098, + "learning_rate": 2.439632181645407e-07, + "loss": 0.2642, + "step": 31138 + }, + { + "epoch": 0.9, + "grad_norm": 1.8234366188583082, + "learning_rate": 2.438183097417174e-07, + "loss": 0.2598, + "step": 31139 + }, + { + "epoch": 0.9, + "grad_norm": 1.4180865473932145, + "learning_rate": 2.436734432922505e-07, + "loss": 0.2696, + "step": 31140 + }, + { + "epoch": 0.9, + "grad_norm": 1.3598996598929614, + "learning_rate": 2.4352861881741795e-07, + "loss": 0.2751, + "step": 31141 + }, + { + "epoch": 0.9, + "grad_norm": 1.2749241630225654, + "learning_rate": 2.433838363184987e-07, + "loss": 0.2659, + "step": 31142 + }, + { + "epoch": 0.9, + "grad_norm": 1.3667835264735408, + "learning_rate": 2.4323909579676895e-07, + "loss": 0.2674, + "step": 31143 + }, + { + "epoch": 0.9, + "grad_norm": 1.7575090862256681, + "learning_rate": 2.430943972535077e-07, + "loss": 0.2908, + "step": 31144 + }, + { + "epoch": 0.9, + "grad_norm": 1.8672117626215217, + "learning_rate": 2.4294974068999113e-07, + "loss": 0.3146, + "step": 31145 + }, + { + "epoch": 0.9, + "grad_norm": 1.3291801234533118, + "learning_rate": 2.4280512610749594e-07, + "loss": 0.2693, + "step": 31146 + }, + { + "epoch": 0.9, + "grad_norm": 1.3208597512810978, + "learning_rate": 2.426605535072979e-07, + "loss": 0.2494, + "step": 31147 + }, + { + "epoch": 0.9, + "grad_norm": 1.337758237728671, + "learning_rate": 2.425160228906742e-07, + "loss": 0.2714, + "step": 31148 + }, + { + "epoch": 0.9, + "grad_norm": 1.3037771922924482, + "learning_rate": 2.4237153425889835e-07, + "loss": 0.2744, + "step": 31149 + }, + { + "epoch": 0.9, + "grad_norm": 1.515262171552356, + "learning_rate": 2.4222708761324655e-07, + "loss": 0.2662, + "step": 31150 + }, + { + "epoch": 0.9, + "grad_norm": 1.160092208604163, + "learning_rate": 2.420826829549938e-07, + "loss": 0.2706, + "step": 31151 + }, + { + "epoch": 0.9, + "grad_norm": 1.2601870180974877, + "learning_rate": 2.419383202854142e-07, + "loss": 0.3043, + "step": 31152 + }, + { + "epoch": 0.9, + "grad_norm": 1.3148460562852848, + "learning_rate": 2.417939996057811e-07, + "loss": 0.2811, + "step": 31153 + }, + { + "epoch": 0.9, + "grad_norm": 1.6424026231269515, + "learning_rate": 2.4164972091736907e-07, + "loss": 0.2723, + "step": 31154 + }, + { + "epoch": 0.9, + "grad_norm": 1.3708062108718388, + "learning_rate": 2.415054842214509e-07, + "loss": 0.2828, + "step": 31155 + }, + { + "epoch": 0.9, + "grad_norm": 1.4607803204310557, + "learning_rate": 2.413612895193007e-07, + "loss": 0.2658, + "step": 31156 + }, + { + "epoch": 0.9, + "grad_norm": 1.3705475397158282, + "learning_rate": 2.4121713681218905e-07, + "loss": 0.2634, + "step": 31157 + }, + { + "epoch": 0.9, + "grad_norm": 1.3675236154034245, + "learning_rate": 2.4107302610138883e-07, + "loss": 0.2731, + "step": 31158 + }, + { + "epoch": 0.9, + "grad_norm": 1.2702097641127974, + "learning_rate": 2.409289573881718e-07, + "loss": 0.2576, + "step": 31159 + }, + { + "epoch": 0.9, + "grad_norm": 1.4316528728640485, + "learning_rate": 2.407849306738097e-07, + "loss": 0.2806, + "step": 31160 + }, + { + "epoch": 0.9, + "grad_norm": 1.3037654835140948, + "learning_rate": 2.4064094595957377e-07, + "loss": 0.2618, + "step": 31161 + }, + { + "epoch": 0.9, + "grad_norm": 1.2748644065488395, + "learning_rate": 2.4049700324673355e-07, + "loss": 0.2694, + "step": 31162 + }, + { + "epoch": 0.9, + "grad_norm": 1.347191190475986, + "learning_rate": 2.4035310253656073e-07, + "loss": 0.2667, + "step": 31163 + }, + { + "epoch": 0.9, + "grad_norm": 1.2869929965500122, + "learning_rate": 2.4020924383032386e-07, + "loss": 0.2813, + "step": 31164 + }, + { + "epoch": 0.9, + "grad_norm": 3.186955605525416, + "learning_rate": 2.400654271292946e-07, + "loss": 0.2508, + "step": 31165 + }, + { + "epoch": 0.9, + "grad_norm": 1.2790169465897783, + "learning_rate": 2.3992165243473977e-07, + "loss": 0.2684, + "step": 31166 + }, + { + "epoch": 0.9, + "grad_norm": 1.2959584422045065, + "learning_rate": 2.397779197479294e-07, + "loss": 0.2729, + "step": 31167 + }, + { + "epoch": 0.9, + "grad_norm": 1.3650127856623142, + "learning_rate": 2.3963422907013144e-07, + "loss": 0.2707, + "step": 31168 + }, + { + "epoch": 0.9, + "grad_norm": 1.2017522854133336, + "learning_rate": 2.394905804026143e-07, + "loss": 0.2619, + "step": 31169 + }, + { + "epoch": 0.9, + "grad_norm": 1.4049138491415794, + "learning_rate": 2.3934697374664526e-07, + "loss": 0.2662, + "step": 31170 + }, + { + "epoch": 0.9, + "grad_norm": 1.5509529208020456, + "learning_rate": 2.392034091034923e-07, + "loss": 0.2973, + "step": 31171 + }, + { + "epoch": 0.9, + "grad_norm": 1.7136476313639524, + "learning_rate": 2.39059886474422e-07, + "loss": 0.2857, + "step": 31172 + }, + { + "epoch": 0.9, + "grad_norm": 1.6697963317725635, + "learning_rate": 2.389164058607013e-07, + "loss": 0.3351, + "step": 31173 + }, + { + "epoch": 0.9, + "grad_norm": 2.665318188222787, + "learning_rate": 2.387729672635963e-07, + "loss": 0.2655, + "step": 31174 + }, + { + "epoch": 0.9, + "grad_norm": 1.2685679514140566, + "learning_rate": 2.3862957068437333e-07, + "loss": 0.2544, + "step": 31175 + }, + { + "epoch": 0.9, + "grad_norm": 1.3969592139661597, + "learning_rate": 2.384862161242962e-07, + "loss": 0.2633, + "step": 31176 + }, + { + "epoch": 0.9, + "grad_norm": 1.3255511664153807, + "learning_rate": 2.3834290358463074e-07, + "loss": 0.2689, + "step": 31177 + }, + { + "epoch": 0.9, + "grad_norm": 1.5512610854472006, + "learning_rate": 2.381996330666425e-07, + "loss": 0.2818, + "step": 31178 + }, + { + "epoch": 0.9, + "grad_norm": 1.3716127314872764, + "learning_rate": 2.3805640457159497e-07, + "loss": 0.2777, + "step": 31179 + }, + { + "epoch": 0.9, + "grad_norm": 1.7324717805274839, + "learning_rate": 2.3791321810075263e-07, + "loss": 0.263, + "step": 31180 + }, + { + "epoch": 0.9, + "grad_norm": 1.3592468155459367, + "learning_rate": 2.3777007365537898e-07, + "loss": 0.2632, + "step": 31181 + }, + { + "epoch": 0.9, + "grad_norm": 1.7914434288862544, + "learning_rate": 2.3762697123673683e-07, + "loss": 0.2742, + "step": 31182 + }, + { + "epoch": 0.9, + "grad_norm": 1.6005830930833547, + "learning_rate": 2.3748391084609023e-07, + "loss": 0.2802, + "step": 31183 + }, + { + "epoch": 0.9, + "grad_norm": 1.452112341073134, + "learning_rate": 2.3734089248470093e-07, + "loss": 0.2943, + "step": 31184 + }, + { + "epoch": 0.9, + "grad_norm": 1.2495269343004252, + "learning_rate": 2.3719791615383126e-07, + "loss": 0.2636, + "step": 31185 + }, + { + "epoch": 0.9, + "grad_norm": 1.2856135594267797, + "learning_rate": 2.3705498185474186e-07, + "loss": 0.2655, + "step": 31186 + }, + { + "epoch": 0.9, + "grad_norm": 1.2807572585506763, + "learning_rate": 2.369120895886956e-07, + "loss": 0.2639, + "step": 31187 + }, + { + "epoch": 0.9, + "grad_norm": 1.5149750692444834, + "learning_rate": 2.3676923935695317e-07, + "loss": 0.2474, + "step": 31188 + }, + { + "epoch": 0.9, + "grad_norm": 1.2996771728572696, + "learning_rate": 2.3662643116077465e-07, + "loss": 0.2702, + "step": 31189 + }, + { + "epoch": 0.9, + "grad_norm": 1.8857530763709742, + "learning_rate": 2.3648366500142073e-07, + "loss": 0.2673, + "step": 31190 + }, + { + "epoch": 0.9, + "grad_norm": 1.5549733575437317, + "learning_rate": 2.3634094088015146e-07, + "loss": 0.2561, + "step": 31191 + }, + { + "epoch": 0.9, + "grad_norm": 1.3578976922888089, + "learning_rate": 2.36198258798227e-07, + "loss": 0.2796, + "step": 31192 + }, + { + "epoch": 0.9, + "grad_norm": 0.9331373248884223, + "learning_rate": 2.360556187569052e-07, + "loss": 0.4995, + "step": 31193 + }, + { + "epoch": 0.9, + "grad_norm": 1.7056057193337246, + "learning_rate": 2.3591302075744505e-07, + "loss": 0.2544, + "step": 31194 + }, + { + "epoch": 0.9, + "grad_norm": 1.0247180545689916, + "learning_rate": 2.3577046480110554e-07, + "loss": 0.6329, + "step": 31195 + }, + { + "epoch": 0.9, + "grad_norm": 1.4171900249141414, + "learning_rate": 2.356279508891446e-07, + "loss": 0.2701, + "step": 31196 + }, + { + "epoch": 0.9, + "grad_norm": 1.5752008935301558, + "learning_rate": 2.3548547902281948e-07, + "loss": 0.2577, + "step": 31197 + }, + { + "epoch": 0.9, + "grad_norm": 1.534151935417531, + "learning_rate": 2.3534304920338813e-07, + "loss": 0.2601, + "step": 31198 + }, + { + "epoch": 0.9, + "grad_norm": 2.3995523343270397, + "learning_rate": 2.352006614321073e-07, + "loss": 0.2744, + "step": 31199 + }, + { + "epoch": 0.9, + "grad_norm": 1.4558897943304336, + "learning_rate": 2.350583157102332e-07, + "loss": 0.2645, + "step": 31200 + }, + { + "epoch": 0.9, + "grad_norm": 1.4019531982230267, + "learning_rate": 2.3491601203902315e-07, + "loss": 0.265, + "step": 31201 + }, + { + "epoch": 0.91, + "grad_norm": 1.2780058325836074, + "learning_rate": 2.3477375041973172e-07, + "loss": 0.2749, + "step": 31202 + }, + { + "epoch": 0.91, + "grad_norm": 1.3162559241051972, + "learning_rate": 2.3463153085361457e-07, + "loss": 0.2674, + "step": 31203 + }, + { + "epoch": 0.91, + "grad_norm": 1.5717343972631035, + "learning_rate": 2.3448935334192736e-07, + "loss": 0.2825, + "step": 31204 + }, + { + "epoch": 0.91, + "grad_norm": 1.4118000908586712, + "learning_rate": 2.3434721788592462e-07, + "loss": 0.2487, + "step": 31205 + }, + { + "epoch": 0.91, + "grad_norm": 1.2654069305929136, + "learning_rate": 2.3420512448685985e-07, + "loss": 0.2677, + "step": 31206 + }, + { + "epoch": 0.91, + "grad_norm": 1.5089789671030271, + "learning_rate": 2.3406307314598808e-07, + "loss": 0.313, + "step": 31207 + }, + { + "epoch": 0.91, + "grad_norm": 1.323799697909493, + "learning_rate": 2.3392106386456225e-07, + "loss": 0.2604, + "step": 31208 + }, + { + "epoch": 0.91, + "grad_norm": 0.972893318006182, + "learning_rate": 2.3377909664383636e-07, + "loss": 0.5701, + "step": 31209 + }, + { + "epoch": 0.91, + "grad_norm": 1.3340860568843074, + "learning_rate": 2.3363717148506215e-07, + "loss": 0.2966, + "step": 31210 + }, + { + "epoch": 0.91, + "grad_norm": 1.6132428590130286, + "learning_rate": 2.334952883894942e-07, + "loss": 0.2755, + "step": 31211 + }, + { + "epoch": 0.91, + "grad_norm": 1.4933151473083837, + "learning_rate": 2.3335344735838207e-07, + "loss": 0.2674, + "step": 31212 + }, + { + "epoch": 0.91, + "grad_norm": 1.467694845302438, + "learning_rate": 2.332116483929786e-07, + "loss": 0.2791, + "step": 31213 + }, + { + "epoch": 0.91, + "grad_norm": 1.613121252104351, + "learning_rate": 2.3306989149453562e-07, + "loss": 0.2999, + "step": 31214 + }, + { + "epoch": 0.91, + "grad_norm": 1.4002188539842009, + "learning_rate": 2.3292817666430323e-07, + "loss": 0.2551, + "step": 31215 + }, + { + "epoch": 0.91, + "grad_norm": 1.4114831933687837, + "learning_rate": 2.327865039035332e-07, + "loss": 0.2663, + "step": 31216 + }, + { + "epoch": 0.91, + "grad_norm": 0.9408459426350405, + "learning_rate": 2.326448732134745e-07, + "loss": 0.5747, + "step": 31217 + }, + { + "epoch": 0.91, + "grad_norm": 1.4397457703776138, + "learning_rate": 2.3250328459537784e-07, + "loss": 0.2743, + "step": 31218 + }, + { + "epoch": 0.91, + "grad_norm": 1.3135508467415407, + "learning_rate": 2.3236173805049277e-07, + "loss": 0.2639, + "step": 31219 + }, + { + "epoch": 0.91, + "grad_norm": 1.2770912024467278, + "learning_rate": 2.3222023358006883e-07, + "loss": 0.2862, + "step": 31220 + }, + { + "epoch": 0.91, + "grad_norm": 6.782542945235951, + "learning_rate": 2.3207877118535393e-07, + "loss": 0.2877, + "step": 31221 + }, + { + "epoch": 0.91, + "grad_norm": 1.3020718251158603, + "learning_rate": 2.3193735086759594e-07, + "loss": 0.2593, + "step": 31222 + }, + { + "epoch": 0.91, + "grad_norm": 1.5143469917890722, + "learning_rate": 2.3179597262804443e-07, + "loss": 0.2791, + "step": 31223 + }, + { + "epoch": 0.91, + "grad_norm": 1.3199049473112827, + "learning_rate": 2.3165463646794618e-07, + "loss": 0.2452, + "step": 31224 + }, + { + "epoch": 0.91, + "grad_norm": 1.5533247353393562, + "learning_rate": 2.315133423885485e-07, + "loss": 0.2899, + "step": 31225 + }, + { + "epoch": 0.91, + "grad_norm": 1.6374660277538102, + "learning_rate": 2.3137209039109875e-07, + "loss": 0.2652, + "step": 31226 + }, + { + "epoch": 0.91, + "grad_norm": 1.3045908980254899, + "learning_rate": 2.3123088047684316e-07, + "loss": 0.274, + "step": 31227 + }, + { + "epoch": 0.91, + "grad_norm": 1.6217815024033313, + "learning_rate": 2.310897126470285e-07, + "loss": 0.3067, + "step": 31228 + }, + { + "epoch": 0.91, + "grad_norm": 1.6223479915894827, + "learning_rate": 2.3094858690289934e-07, + "loss": 0.2997, + "step": 31229 + }, + { + "epoch": 0.91, + "grad_norm": 1.4889176929038646, + "learning_rate": 2.3080750324570134e-07, + "loss": 0.2632, + "step": 31230 + }, + { + "epoch": 0.91, + "grad_norm": 1.319192292476129, + "learning_rate": 2.306664616766807e-07, + "loss": 0.2724, + "step": 31231 + }, + { + "epoch": 0.91, + "grad_norm": 1.406185750500783, + "learning_rate": 2.3052546219708206e-07, + "loss": 0.2506, + "step": 31232 + }, + { + "epoch": 0.91, + "grad_norm": 1.3955950368528072, + "learning_rate": 2.3038450480814821e-07, + "loss": 0.2593, + "step": 31233 + }, + { + "epoch": 0.91, + "grad_norm": 1.330448745760416, + "learning_rate": 2.3024358951112435e-07, + "loss": 0.267, + "step": 31234 + }, + { + "epoch": 0.91, + "grad_norm": 1.4522001534375901, + "learning_rate": 2.3010271630725335e-07, + "loss": 0.2643, + "step": 31235 + }, + { + "epoch": 0.91, + "grad_norm": 1.54108017707463, + "learning_rate": 2.2996188519777863e-07, + "loss": 0.2629, + "step": 31236 + }, + { + "epoch": 0.91, + "grad_norm": 1.3077428067778691, + "learning_rate": 2.2982109618394367e-07, + "loss": 0.2544, + "step": 31237 + }, + { + "epoch": 0.91, + "grad_norm": 1.3723156533693033, + "learning_rate": 2.2968034926699024e-07, + "loss": 0.279, + "step": 31238 + }, + { + "epoch": 0.91, + "grad_norm": 1.3269501894170945, + "learning_rate": 2.2953964444816122e-07, + "loss": 0.2578, + "step": 31239 + }, + { + "epoch": 0.91, + "grad_norm": 1.3220309680675448, + "learning_rate": 2.2939898172869735e-07, + "loss": 0.2811, + "step": 31240 + }, + { + "epoch": 0.91, + "grad_norm": 1.4306416863948959, + "learning_rate": 2.2925836110984033e-07, + "loss": 0.2612, + "step": 31241 + }, + { + "epoch": 0.91, + "grad_norm": 1.2939292230263764, + "learning_rate": 2.291177825928309e-07, + "loss": 0.2693, + "step": 31242 + }, + { + "epoch": 0.91, + "grad_norm": 1.4065264304676468, + "learning_rate": 2.2897724617891026e-07, + "loss": 0.2713, + "step": 31243 + }, + { + "epoch": 0.91, + "grad_norm": 1.7066968433925016, + "learning_rate": 2.2883675186931854e-07, + "loss": 0.2689, + "step": 31244 + }, + { + "epoch": 0.91, + "grad_norm": 10.773303056381845, + "learning_rate": 2.2869629966529528e-07, + "loss": 0.2759, + "step": 31245 + }, + { + "epoch": 0.91, + "grad_norm": 1.5480931460495813, + "learning_rate": 2.2855588956808006e-07, + "loss": 0.3075, + "step": 31246 + }, + { + "epoch": 0.91, + "grad_norm": 1.3486785586005843, + "learning_rate": 2.2841552157891246e-07, + "loss": 0.2895, + "step": 31247 + }, + { + "epoch": 0.91, + "grad_norm": 1.5196526256603273, + "learning_rate": 2.2827519569903034e-07, + "loss": 0.2745, + "step": 31248 + }, + { + "epoch": 0.91, + "grad_norm": 1.4298338608704677, + "learning_rate": 2.2813491192967217e-07, + "loss": 0.3022, + "step": 31249 + }, + { + "epoch": 0.91, + "grad_norm": 1.2462584494183804, + "learning_rate": 2.27994670272077e-07, + "loss": 0.2824, + "step": 31250 + }, + { + "epoch": 0.91, + "grad_norm": 1.2238067617287804, + "learning_rate": 2.2785447072748156e-07, + "loss": 0.2538, + "step": 31251 + }, + { + "epoch": 0.91, + "grad_norm": 1.56147739765572, + "learning_rate": 2.2771431329712324e-07, + "loss": 0.2573, + "step": 31252 + }, + { + "epoch": 0.91, + "grad_norm": 1.4459224535203268, + "learning_rate": 2.2757419798223933e-07, + "loss": 0.2669, + "step": 31253 + }, + { + "epoch": 0.91, + "grad_norm": 1.37176675424768, + "learning_rate": 2.2743412478406557e-07, + "loss": 0.2547, + "step": 31254 + }, + { + "epoch": 0.91, + "grad_norm": 1.4414954453431819, + "learning_rate": 2.272940937038387e-07, + "loss": 0.267, + "step": 31255 + }, + { + "epoch": 0.91, + "grad_norm": 1.352560142002468, + "learning_rate": 2.2715410474279554e-07, + "loss": 0.273, + "step": 31256 + }, + { + "epoch": 0.91, + "grad_norm": 1.3089933040632824, + "learning_rate": 2.270141579021695e-07, + "loss": 0.2737, + "step": 31257 + }, + { + "epoch": 0.91, + "grad_norm": 1.2906177194190853, + "learning_rate": 2.2687425318319633e-07, + "loss": 0.2545, + "step": 31258 + }, + { + "epoch": 0.91, + "grad_norm": 1.8666751634848242, + "learning_rate": 2.2673439058711055e-07, + "loss": 0.2911, + "step": 31259 + }, + { + "epoch": 0.91, + "grad_norm": 1.5026192856944434, + "learning_rate": 2.2659457011514786e-07, + "loss": 0.2626, + "step": 31260 + }, + { + "epoch": 0.91, + "grad_norm": 1.310784434085015, + "learning_rate": 2.2645479176854003e-07, + "loss": 0.2512, + "step": 31261 + }, + { + "epoch": 0.91, + "grad_norm": 1.3646738539999068, + "learning_rate": 2.2631505554852216e-07, + "loss": 0.2421, + "step": 31262 + }, + { + "epoch": 0.91, + "grad_norm": 1.796079310746462, + "learning_rate": 2.2617536145632612e-07, + "loss": 0.3038, + "step": 31263 + }, + { + "epoch": 0.91, + "grad_norm": 1.6330031983355011, + "learning_rate": 2.2603570949318586e-07, + "loss": 0.2726, + "step": 31264 + }, + { + "epoch": 0.91, + "grad_norm": 1.3412354079399038, + "learning_rate": 2.2589609966033377e-07, + "loss": 0.2954, + "step": 31265 + }, + { + "epoch": 0.91, + "grad_norm": 1.807812424329768, + "learning_rate": 2.257565319590016e-07, + "loss": 0.2546, + "step": 31266 + }, + { + "epoch": 0.91, + "grad_norm": 1.3262412846713523, + "learning_rate": 2.2561700639042062e-07, + "loss": 0.268, + "step": 31267 + }, + { + "epoch": 0.91, + "grad_norm": 1.5558528805222014, + "learning_rate": 2.254775229558226e-07, + "loss": 0.2682, + "step": 31268 + }, + { + "epoch": 0.91, + "grad_norm": 1.6957122363397772, + "learning_rate": 2.2533808165643823e-07, + "loss": 0.2906, + "step": 31269 + }, + { + "epoch": 0.91, + "grad_norm": 1.514712937430935, + "learning_rate": 2.2519868249349875e-07, + "loss": 0.2653, + "step": 31270 + }, + { + "epoch": 0.91, + "grad_norm": 2.142842696019358, + "learning_rate": 2.2505932546823316e-07, + "loss": 0.2633, + "step": 31271 + }, + { + "epoch": 0.91, + "grad_norm": 1.38950652646571, + "learning_rate": 2.2492001058187218e-07, + "loss": 0.2693, + "step": 31272 + }, + { + "epoch": 0.91, + "grad_norm": 1.3158009198774627, + "learning_rate": 2.2478073783564536e-07, + "loss": 0.2769, + "step": 31273 + }, + { + "epoch": 0.91, + "grad_norm": 1.336155165890942, + "learning_rate": 2.2464150723078172e-07, + "loss": 0.2595, + "step": 31274 + }, + { + "epoch": 0.91, + "grad_norm": 2.2524248595920757, + "learning_rate": 2.245023187685097e-07, + "loss": 0.2732, + "step": 31275 + }, + { + "epoch": 0.91, + "grad_norm": 1.35576483361612, + "learning_rate": 2.2436317245005724e-07, + "loss": 0.2685, + "step": 31276 + }, + { + "epoch": 0.91, + "grad_norm": 1.2739650175861044, + "learning_rate": 2.2422406827665334e-07, + "loss": 0.26, + "step": 31277 + }, + { + "epoch": 0.91, + "grad_norm": 1.3094086715379665, + "learning_rate": 2.2408500624952424e-07, + "loss": 0.2973, + "step": 31278 + }, + { + "epoch": 0.91, + "grad_norm": 1.3790947934802773, + "learning_rate": 2.239459863698984e-07, + "loss": 0.2701, + "step": 31279 + }, + { + "epoch": 0.91, + "grad_norm": 1.3597757939363622, + "learning_rate": 2.2380700863900207e-07, + "loss": 0.2664, + "step": 31280 + }, + { + "epoch": 0.91, + "grad_norm": 1.3618311121003877, + "learning_rate": 2.2366807305806203e-07, + "loss": 0.2781, + "step": 31281 + }, + { + "epoch": 0.91, + "grad_norm": 1.350036325952491, + "learning_rate": 2.23529179628304e-07, + "loss": 0.2526, + "step": 31282 + }, + { + "epoch": 0.91, + "grad_norm": 1.2442712458419902, + "learning_rate": 2.2339032835095475e-07, + "loss": 0.2663, + "step": 31283 + }, + { + "epoch": 0.91, + "grad_norm": 1.3435603232827598, + "learning_rate": 2.2325151922723831e-07, + "loss": 0.2712, + "step": 31284 + }, + { + "epoch": 0.91, + "grad_norm": 1.4054670673132257, + "learning_rate": 2.231127522583798e-07, + "loss": 0.2536, + "step": 31285 + }, + { + "epoch": 0.91, + "grad_norm": 1.3325916185057851, + "learning_rate": 2.2297402744560438e-07, + "loss": 0.2613, + "step": 31286 + }, + { + "epoch": 0.91, + "grad_norm": 1.4685417701199188, + "learning_rate": 2.2283534479013658e-07, + "loss": 0.2817, + "step": 31287 + }, + { + "epoch": 0.91, + "grad_norm": 1.3993776670767295, + "learning_rate": 2.2269670429319988e-07, + "loss": 0.2899, + "step": 31288 + }, + { + "epoch": 0.91, + "grad_norm": 1.2362619236451142, + "learning_rate": 2.2255810595601725e-07, + "loss": 0.2881, + "step": 31289 + }, + { + "epoch": 0.91, + "grad_norm": 1.4809478468428676, + "learning_rate": 2.224195497798126e-07, + "loss": 0.2695, + "step": 31290 + }, + { + "epoch": 0.91, + "grad_norm": 1.4432421957163395, + "learning_rate": 2.222810357658084e-07, + "loss": 0.2807, + "step": 31291 + }, + { + "epoch": 0.91, + "grad_norm": 1.2717221018240659, + "learning_rate": 2.2214256391522692e-07, + "loss": 0.2817, + "step": 31292 + }, + { + "epoch": 0.91, + "grad_norm": 1.392446366739751, + "learning_rate": 2.2200413422929057e-07, + "loss": 0.2786, + "step": 31293 + }, + { + "epoch": 0.91, + "grad_norm": 1.4448062689056353, + "learning_rate": 2.2186574670922112e-07, + "loss": 0.2838, + "step": 31294 + }, + { + "epoch": 0.91, + "grad_norm": 1.8702045699721481, + "learning_rate": 2.2172740135623872e-07, + "loss": 0.261, + "step": 31295 + }, + { + "epoch": 0.91, + "grad_norm": 1.2536222518606737, + "learning_rate": 2.2158909817156515e-07, + "loss": 0.2674, + "step": 31296 + }, + { + "epoch": 0.91, + "grad_norm": 1.284188919616605, + "learning_rate": 2.2145083715642113e-07, + "loss": 0.2836, + "step": 31297 + }, + { + "epoch": 0.91, + "grad_norm": 1.327134323958644, + "learning_rate": 2.2131261831202622e-07, + "loss": 0.3, + "step": 31298 + }, + { + "epoch": 0.91, + "grad_norm": 1.5701917507178775, + "learning_rate": 2.211744416396e-07, + "loss": 0.2673, + "step": 31299 + }, + { + "epoch": 0.91, + "grad_norm": 1.4278588683332032, + "learning_rate": 2.2103630714036318e-07, + "loss": 0.2726, + "step": 31300 + }, + { + "epoch": 0.91, + "grad_norm": 1.395196156161175, + "learning_rate": 2.2089821481553365e-07, + "loss": 0.2618, + "step": 31301 + }, + { + "epoch": 0.91, + "grad_norm": 1.371504706587602, + "learning_rate": 2.2076016466633044e-07, + "loss": 0.2812, + "step": 31302 + }, + { + "epoch": 0.91, + "grad_norm": 2.014192775914106, + "learning_rate": 2.2062215669397201e-07, + "loss": 0.2577, + "step": 31303 + }, + { + "epoch": 0.91, + "grad_norm": 1.213297607048204, + "learning_rate": 2.2048419089967576e-07, + "loss": 0.2632, + "step": 31304 + }, + { + "epoch": 0.91, + "grad_norm": 1.456473509343891, + "learning_rate": 2.2034626728465958e-07, + "loss": 0.2854, + "step": 31305 + }, + { + "epoch": 0.91, + "grad_norm": 1.3668770406083877, + "learning_rate": 2.2020838585014026e-07, + "loss": 0.2636, + "step": 31306 + }, + { + "epoch": 0.91, + "grad_norm": 1.3688776511970016, + "learning_rate": 2.2007054659733518e-07, + "loss": 0.2687, + "step": 31307 + }, + { + "epoch": 0.91, + "grad_norm": 1.296959491341146, + "learning_rate": 2.1993274952746058e-07, + "loss": 0.2789, + "step": 31308 + }, + { + "epoch": 0.91, + "grad_norm": 1.3543260508848654, + "learning_rate": 2.197949946417327e-07, + "loss": 0.268, + "step": 31309 + }, + { + "epoch": 0.91, + "grad_norm": 1.3692653165063413, + "learning_rate": 2.196572819413667e-07, + "loss": 0.2905, + "step": 31310 + }, + { + "epoch": 0.91, + "grad_norm": 1.41076709377737, + "learning_rate": 2.1951961142757883e-07, + "loss": 0.2786, + "step": 31311 + }, + { + "epoch": 0.91, + "grad_norm": 1.4347074522821386, + "learning_rate": 2.193819831015831e-07, + "loss": 0.2776, + "step": 31312 + }, + { + "epoch": 0.91, + "grad_norm": 1.6399222592239544, + "learning_rate": 2.192443969645941e-07, + "loss": 0.2794, + "step": 31313 + }, + { + "epoch": 0.91, + "grad_norm": 1.264806934332129, + "learning_rate": 2.1910685301782642e-07, + "loss": 0.2502, + "step": 31314 + }, + { + "epoch": 0.91, + "grad_norm": 1.2968753114442202, + "learning_rate": 2.189693512624941e-07, + "loss": 0.2591, + "step": 31315 + }, + { + "epoch": 0.91, + "grad_norm": 1.353675556773844, + "learning_rate": 2.1883189169981056e-07, + "loss": 0.2627, + "step": 31316 + }, + { + "epoch": 0.91, + "grad_norm": 1.5047922300641527, + "learning_rate": 2.1869447433098822e-07, + "loss": 0.2799, + "step": 31317 + }, + { + "epoch": 0.91, + "grad_norm": 1.2594961048950253, + "learning_rate": 2.1855709915724e-07, + "loss": 0.2705, + "step": 31318 + }, + { + "epoch": 0.91, + "grad_norm": 1.3815345111280894, + "learning_rate": 2.184197661797788e-07, + "loss": 0.2716, + "step": 31319 + }, + { + "epoch": 0.91, + "grad_norm": 1.4228462267523347, + "learning_rate": 2.1828247539981583e-07, + "loss": 0.2653, + "step": 31320 + }, + { + "epoch": 0.91, + "grad_norm": 1.3136754714129846, + "learning_rate": 2.1814522681856353e-07, + "loss": 0.2609, + "step": 31321 + }, + { + "epoch": 0.91, + "grad_norm": 1.5398706996882985, + "learning_rate": 2.1800802043723258e-07, + "loss": 0.3283, + "step": 31322 + }, + { + "epoch": 0.91, + "grad_norm": 1.3322712416011542, + "learning_rate": 2.178708562570331e-07, + "loss": 0.2765, + "step": 31323 + }, + { + "epoch": 0.91, + "grad_norm": 1.3430027041943768, + "learning_rate": 2.1773373427917687e-07, + "loss": 0.2759, + "step": 31324 + }, + { + "epoch": 0.91, + "grad_norm": 1.3713620185264208, + "learning_rate": 2.1759665450487354e-07, + "loss": 0.2721, + "step": 31325 + }, + { + "epoch": 0.91, + "grad_norm": 1.4007182456930543, + "learning_rate": 2.174596169353327e-07, + "loss": 0.2639, + "step": 31326 + }, + { + "epoch": 0.91, + "grad_norm": 1.2580786840361216, + "learning_rate": 2.1732262157176386e-07, + "loss": 0.2595, + "step": 31327 + }, + { + "epoch": 0.91, + "grad_norm": 1.3190869264296197, + "learning_rate": 2.1718566841537613e-07, + "loss": 0.2575, + "step": 31328 + }, + { + "epoch": 0.91, + "grad_norm": 1.2753739546758045, + "learning_rate": 2.1704875746737742e-07, + "loss": 0.2588, + "step": 31329 + }, + { + "epoch": 0.91, + "grad_norm": 1.87862801849168, + "learning_rate": 2.1691188872897727e-07, + "loss": 0.2554, + "step": 31330 + }, + { + "epoch": 0.91, + "grad_norm": 1.2774901006972577, + "learning_rate": 2.1677506220138255e-07, + "loss": 0.265, + "step": 31331 + }, + { + "epoch": 0.91, + "grad_norm": 1.4845627000290456, + "learning_rate": 2.166382778858006e-07, + "loss": 0.2791, + "step": 31332 + }, + { + "epoch": 0.91, + "grad_norm": 1.5790276403945813, + "learning_rate": 2.165015357834388e-07, + "loss": 0.2774, + "step": 31333 + }, + { + "epoch": 0.91, + "grad_norm": 1.4572829774410774, + "learning_rate": 2.1636483589550393e-07, + "loss": 0.2774, + "step": 31334 + }, + { + "epoch": 0.91, + "grad_norm": 1.2996230867128125, + "learning_rate": 2.1622817822320285e-07, + "loss": 0.261, + "step": 31335 + }, + { + "epoch": 0.91, + "grad_norm": 1.3182071834364277, + "learning_rate": 2.1609156276774124e-07, + "loss": 0.2935, + "step": 31336 + }, + { + "epoch": 0.91, + "grad_norm": 1.033069017928, + "learning_rate": 2.1595498953032422e-07, + "loss": 0.5718, + "step": 31337 + }, + { + "epoch": 0.91, + "grad_norm": 1.4519846874492273, + "learning_rate": 2.1581845851215866e-07, + "loss": 0.2694, + "step": 31338 + }, + { + "epoch": 0.91, + "grad_norm": 1.3315634130380587, + "learning_rate": 2.1568196971444689e-07, + "loss": 0.2728, + "step": 31339 + }, + { + "epoch": 0.91, + "grad_norm": 1.6451334239329565, + "learning_rate": 2.1554552313839516e-07, + "loss": 0.2737, + "step": 31340 + }, + { + "epoch": 0.91, + "grad_norm": 1.3870928454346036, + "learning_rate": 2.1540911878520753e-07, + "loss": 0.2668, + "step": 31341 + }, + { + "epoch": 0.91, + "grad_norm": 1.4621545897205785, + "learning_rate": 2.152727566560875e-07, + "loss": 0.2761, + "step": 31342 + }, + { + "epoch": 0.91, + "grad_norm": 1.3943733248758838, + "learning_rate": 2.1513643675223794e-07, + "loss": 0.2565, + "step": 31343 + }, + { + "epoch": 0.91, + "grad_norm": 1.4030576586518069, + "learning_rate": 2.150001590748635e-07, + "loss": 0.2896, + "step": 31344 + }, + { + "epoch": 0.91, + "grad_norm": 1.3798352851527576, + "learning_rate": 2.148639236251654e-07, + "loss": 0.286, + "step": 31345 + }, + { + "epoch": 0.91, + "grad_norm": 1.3098887076726646, + "learning_rate": 2.1472773040434546e-07, + "loss": 0.261, + "step": 31346 + }, + { + "epoch": 0.91, + "grad_norm": 1.3691690133461933, + "learning_rate": 2.1459157941360664e-07, + "loss": 0.2852, + "step": 31347 + }, + { + "epoch": 0.91, + "grad_norm": 1.3634158076154765, + "learning_rate": 2.1445547065415074e-07, + "loss": 0.2809, + "step": 31348 + }, + { + "epoch": 0.91, + "grad_norm": 1.4158709032141197, + "learning_rate": 2.1431940412717843e-07, + "loss": 0.2773, + "step": 31349 + }, + { + "epoch": 0.91, + "grad_norm": 1.313976901684275, + "learning_rate": 2.1418337983388994e-07, + "loss": 0.2756, + "step": 31350 + }, + { + "epoch": 0.91, + "grad_norm": 1.6769541862568427, + "learning_rate": 2.1404739777548645e-07, + "loss": 0.2689, + "step": 31351 + }, + { + "epoch": 0.91, + "grad_norm": 1.8154738232321421, + "learning_rate": 2.1391145795316758e-07, + "loss": 0.3089, + "step": 31352 + }, + { + "epoch": 0.91, + "grad_norm": 1.652674951908352, + "learning_rate": 2.1377556036813353e-07, + "loss": 0.275, + "step": 31353 + }, + { + "epoch": 0.91, + "grad_norm": 1.6341390397745412, + "learning_rate": 2.136397050215827e-07, + "loss": 0.2861, + "step": 31354 + }, + { + "epoch": 0.91, + "grad_norm": 1.4279155663795513, + "learning_rate": 2.1350389191471478e-07, + "loss": 0.292, + "step": 31355 + }, + { + "epoch": 0.91, + "grad_norm": 1.3737514451163428, + "learning_rate": 2.1336812104872816e-07, + "loss": 0.2665, + "step": 31356 + }, + { + "epoch": 0.91, + "grad_norm": 1.3970435523411848, + "learning_rate": 2.1323239242482197e-07, + "loss": 0.269, + "step": 31357 + }, + { + "epoch": 0.91, + "grad_norm": 1.1991093724268724, + "learning_rate": 2.1309670604419186e-07, + "loss": 0.2501, + "step": 31358 + }, + { + "epoch": 0.91, + "grad_norm": 1.3448763841786342, + "learning_rate": 2.129610619080369e-07, + "loss": 0.2796, + "step": 31359 + }, + { + "epoch": 0.91, + "grad_norm": 1.259433281103455, + "learning_rate": 2.1282546001755333e-07, + "loss": 0.2856, + "step": 31360 + }, + { + "epoch": 0.91, + "grad_norm": 1.392810996875615, + "learning_rate": 2.12689900373938e-07, + "loss": 0.2861, + "step": 31361 + }, + { + "epoch": 0.91, + "grad_norm": 1.339711164862268, + "learning_rate": 2.1255438297838771e-07, + "loss": 0.2549, + "step": 31362 + }, + { + "epoch": 0.91, + "grad_norm": 1.2729160755317428, + "learning_rate": 2.1241890783209817e-07, + "loss": 0.2755, + "step": 31363 + }, + { + "epoch": 0.91, + "grad_norm": 1.3373608340247574, + "learning_rate": 2.1228347493626455e-07, + "loss": 0.2584, + "step": 31364 + }, + { + "epoch": 0.91, + "grad_norm": 1.5839476272396416, + "learning_rate": 2.1214808429208255e-07, + "loss": 0.2544, + "step": 31365 + }, + { + "epoch": 0.91, + "grad_norm": 1.4186363973285656, + "learning_rate": 2.1201273590074733e-07, + "loss": 0.2654, + "step": 31366 + }, + { + "epoch": 0.91, + "grad_norm": 1.34164295791694, + "learning_rate": 2.1187742976345237e-07, + "loss": 0.2672, + "step": 31367 + }, + { + "epoch": 0.91, + "grad_norm": 1.5409766346350062, + "learning_rate": 2.117421658813923e-07, + "loss": 0.2691, + "step": 31368 + }, + { + "epoch": 0.91, + "grad_norm": 1.3429954495182108, + "learning_rate": 2.1160694425576057e-07, + "loss": 0.2549, + "step": 31369 + }, + { + "epoch": 0.91, + "grad_norm": 1.3937681702719702, + "learning_rate": 2.1147176488775066e-07, + "loss": 0.2717, + "step": 31370 + }, + { + "epoch": 0.91, + "grad_norm": 0.9586105586834083, + "learning_rate": 2.1133662777855558e-07, + "loss": 0.5981, + "step": 31371 + }, + { + "epoch": 0.91, + "grad_norm": 1.3570935879951358, + "learning_rate": 2.1120153292936874e-07, + "loss": 0.2573, + "step": 31372 + }, + { + "epoch": 0.91, + "grad_norm": 0.9103141459391546, + "learning_rate": 2.1106648034138034e-07, + "loss": 0.5977, + "step": 31373 + }, + { + "epoch": 0.91, + "grad_norm": 0.9712095245454289, + "learning_rate": 2.1093147001578383e-07, + "loss": 0.526, + "step": 31374 + }, + { + "epoch": 0.91, + "grad_norm": 1.4125440012655275, + "learning_rate": 2.1079650195377e-07, + "loss": 0.263, + "step": 31375 + }, + { + "epoch": 0.91, + "grad_norm": 1.301861396872329, + "learning_rate": 2.1066157615653117e-07, + "loss": 0.2663, + "step": 31376 + }, + { + "epoch": 0.91, + "grad_norm": 1.5575433657464908, + "learning_rate": 2.1052669262525583e-07, + "loss": 0.266, + "step": 31377 + }, + { + "epoch": 0.91, + "grad_norm": 1.3682936242994093, + "learning_rate": 2.1039185136113582e-07, + "loss": 0.2443, + "step": 31378 + }, + { + "epoch": 0.91, + "grad_norm": 1.400862967094851, + "learning_rate": 2.1025705236536131e-07, + "loss": 0.2561, + "step": 31379 + }, + { + "epoch": 0.91, + "grad_norm": 1.461707008161729, + "learning_rate": 2.101222956391208e-07, + "loss": 0.2709, + "step": 31380 + }, + { + "epoch": 0.91, + "grad_norm": 1.3378642998785475, + "learning_rate": 2.099875811836044e-07, + "loss": 0.2916, + "step": 31381 + }, + { + "epoch": 0.91, + "grad_norm": 1.1696309681625185, + "learning_rate": 2.0985290900000067e-07, + "loss": 0.271, + "step": 31382 + }, + { + "epoch": 0.91, + "grad_norm": 1.284107938171074, + "learning_rate": 2.0971827908949804e-07, + "loss": 0.2631, + "step": 31383 + }, + { + "epoch": 0.91, + "grad_norm": 1.469475181644934, + "learning_rate": 2.0958369145328505e-07, + "loss": 0.2599, + "step": 31384 + }, + { + "epoch": 0.91, + "grad_norm": 1.437157926124265, + "learning_rate": 2.0944914609254964e-07, + "loss": 0.2784, + "step": 31385 + }, + { + "epoch": 0.91, + "grad_norm": 1.9007606486556912, + "learning_rate": 2.0931464300847803e-07, + "loss": 0.266, + "step": 31386 + }, + { + "epoch": 0.91, + "grad_norm": 1.510287252257351, + "learning_rate": 2.0918018220225823e-07, + "loss": 0.2617, + "step": 31387 + }, + { + "epoch": 0.91, + "grad_norm": 1.4137047271223075, + "learning_rate": 2.090457636750759e-07, + "loss": 0.2754, + "step": 31388 + }, + { + "epoch": 0.91, + "grad_norm": 1.3153681850338983, + "learning_rate": 2.0891138742811789e-07, + "loss": 0.2664, + "step": 31389 + }, + { + "epoch": 0.91, + "grad_norm": 1.3886204277423737, + "learning_rate": 2.087770534625705e-07, + "loss": 0.2796, + "step": 31390 + }, + { + "epoch": 0.91, + "grad_norm": 1.2890161833996296, + "learning_rate": 2.086427617796183e-07, + "loss": 0.2603, + "step": 31391 + }, + { + "epoch": 0.91, + "grad_norm": 1.381413219986109, + "learning_rate": 2.08508512380447e-07, + "loss": 0.2554, + "step": 31392 + }, + { + "epoch": 0.91, + "grad_norm": 1.5627341176761036, + "learning_rate": 2.083743052662418e-07, + "loss": 0.2571, + "step": 31393 + }, + { + "epoch": 0.91, + "grad_norm": 1.5515421973319423, + "learning_rate": 2.0824014043818618e-07, + "loss": 0.2536, + "step": 31394 + }, + { + "epoch": 0.91, + "grad_norm": 1.3223170895946406, + "learning_rate": 2.081060178974642e-07, + "loss": 0.2541, + "step": 31395 + }, + { + "epoch": 0.91, + "grad_norm": 1.33199014004003, + "learning_rate": 2.0797193764525936e-07, + "loss": 0.2469, + "step": 31396 + }, + { + "epoch": 0.91, + "grad_norm": 1.2265695314916352, + "learning_rate": 2.0783789968275624e-07, + "loss": 0.2805, + "step": 31397 + }, + { + "epoch": 0.91, + "grad_norm": 0.9846839249416276, + "learning_rate": 2.0770390401113617e-07, + "loss": 0.5594, + "step": 31398 + }, + { + "epoch": 0.91, + "grad_norm": 9.821495832041238, + "learning_rate": 2.0756995063158258e-07, + "loss": 0.2604, + "step": 31399 + }, + { + "epoch": 0.91, + "grad_norm": 1.341223031998642, + "learning_rate": 2.0743603954527792e-07, + "loss": 0.2829, + "step": 31400 + }, + { + "epoch": 0.91, + "grad_norm": 0.9557817094951477, + "learning_rate": 2.0730217075340287e-07, + "loss": 0.5741, + "step": 31401 + }, + { + "epoch": 0.91, + "grad_norm": 1.3073403703321045, + "learning_rate": 2.0716834425713927e-07, + "loss": 0.267, + "step": 31402 + }, + { + "epoch": 0.91, + "grad_norm": 1.697040087971537, + "learning_rate": 2.0703456005766788e-07, + "loss": 0.2621, + "step": 31403 + }, + { + "epoch": 0.91, + "grad_norm": 1.5021211438479625, + "learning_rate": 2.0690081815617046e-07, + "loss": 0.3271, + "step": 31404 + }, + { + "epoch": 0.91, + "grad_norm": 1.39372989257783, + "learning_rate": 2.0676711855382613e-07, + "loss": 0.2525, + "step": 31405 + }, + { + "epoch": 0.91, + "grad_norm": 1.266430924485109, + "learning_rate": 2.0663346125181506e-07, + "loss": 0.2632, + "step": 31406 + }, + { + "epoch": 0.91, + "grad_norm": 1.215326884409952, + "learning_rate": 2.064998462513168e-07, + "loss": 0.261, + "step": 31407 + }, + { + "epoch": 0.91, + "grad_norm": 1.4515682710570823, + "learning_rate": 2.0636627355351046e-07, + "loss": 0.2672, + "step": 31408 + }, + { + "epoch": 0.91, + "grad_norm": 1.4628141839229876, + "learning_rate": 2.062327431595751e-07, + "loss": 0.2677, + "step": 31409 + }, + { + "epoch": 0.91, + "grad_norm": 1.3261003293493385, + "learning_rate": 2.0609925507068863e-07, + "loss": 0.2687, + "step": 31410 + }, + { + "epoch": 0.91, + "grad_norm": 1.2327235878475489, + "learning_rate": 2.0596580928802956e-07, + "loss": 0.2808, + "step": 31411 + }, + { + "epoch": 0.91, + "grad_norm": 1.3875184641102885, + "learning_rate": 2.0583240581277642e-07, + "loss": 0.2975, + "step": 31412 + }, + { + "epoch": 0.91, + "grad_norm": 1.4942824317934988, + "learning_rate": 2.0569904464610434e-07, + "loss": 0.2867, + "step": 31413 + }, + { + "epoch": 0.91, + "grad_norm": 2.8436204022984906, + "learning_rate": 2.0556572578919188e-07, + "loss": 0.2681, + "step": 31414 + }, + { + "epoch": 0.91, + "grad_norm": 1.2135615883983835, + "learning_rate": 2.054324492432147e-07, + "loss": 0.2706, + "step": 31415 + }, + { + "epoch": 0.91, + "grad_norm": 1.747856539158701, + "learning_rate": 2.0529921500934969e-07, + "loss": 0.2807, + "step": 31416 + }, + { + "epoch": 0.91, + "grad_norm": 1.2889318371602634, + "learning_rate": 2.05166023088772e-07, + "loss": 0.2598, + "step": 31417 + }, + { + "epoch": 0.91, + "grad_norm": 1.605037842620682, + "learning_rate": 2.0503287348265732e-07, + "loss": 0.278, + "step": 31418 + }, + { + "epoch": 0.91, + "grad_norm": 1.3487334979235979, + "learning_rate": 2.0489976619218088e-07, + "loss": 0.2565, + "step": 31419 + }, + { + "epoch": 0.91, + "grad_norm": 1.3508219629904115, + "learning_rate": 2.047667012185167e-07, + "loss": 0.274, + "step": 31420 + }, + { + "epoch": 0.91, + "grad_norm": 1.2572508951534573, + "learning_rate": 2.046336785628411e-07, + "loss": 0.2652, + "step": 31421 + }, + { + "epoch": 0.91, + "grad_norm": 1.252095262649917, + "learning_rate": 2.0450069822632535e-07, + "loss": 0.2581, + "step": 31422 + }, + { + "epoch": 0.91, + "grad_norm": 1.241704962536772, + "learning_rate": 2.0436776021014403e-07, + "loss": 0.2416, + "step": 31423 + }, + { + "epoch": 0.91, + "grad_norm": 1.5013931259431335, + "learning_rate": 2.042348645154707e-07, + "loss": 0.289, + "step": 31424 + }, + { + "epoch": 0.91, + "grad_norm": 1.4013411679556131, + "learning_rate": 2.0410201114347772e-07, + "loss": 0.2889, + "step": 31425 + }, + { + "epoch": 0.91, + "grad_norm": 1.3984131141745193, + "learning_rate": 2.039692000953375e-07, + "loss": 0.2787, + "step": 31426 + }, + { + "epoch": 0.91, + "grad_norm": 1.4844192788089532, + "learning_rate": 2.038364313722224e-07, + "loss": 0.2735, + "step": 31427 + }, + { + "epoch": 0.91, + "grad_norm": 1.3860104532127657, + "learning_rate": 2.0370370497530488e-07, + "loss": 0.2706, + "step": 31428 + }, + { + "epoch": 0.91, + "grad_norm": 1.4332121651783485, + "learning_rate": 2.0357102090575452e-07, + "loss": 0.2501, + "step": 31429 + }, + { + "epoch": 0.91, + "grad_norm": 1.425534162248904, + "learning_rate": 2.0343837916474375e-07, + "loss": 0.2701, + "step": 31430 + }, + { + "epoch": 0.91, + "grad_norm": 1.2659528142164798, + "learning_rate": 2.0330577975344213e-07, + "loss": 0.2709, + "step": 31431 + }, + { + "epoch": 0.91, + "grad_norm": 1.518302790637527, + "learning_rate": 2.0317322267302043e-07, + "loss": 0.2654, + "step": 31432 + }, + { + "epoch": 0.91, + "grad_norm": 1.7782762608610532, + "learning_rate": 2.030407079246477e-07, + "loss": 0.2553, + "step": 31433 + }, + { + "epoch": 0.91, + "grad_norm": 1.2694760449776683, + "learning_rate": 2.0290823550949411e-07, + "loss": 0.256, + "step": 31434 + }, + { + "epoch": 0.91, + "grad_norm": 1.5418348631139684, + "learning_rate": 2.0277580542872877e-07, + "loss": 0.2571, + "step": 31435 + }, + { + "epoch": 0.91, + "grad_norm": 1.3964682628138352, + "learning_rate": 2.026434176835196e-07, + "loss": 0.285, + "step": 31436 + }, + { + "epoch": 0.91, + "grad_norm": 1.4177220249306588, + "learning_rate": 2.0251107227503618e-07, + "loss": 0.2784, + "step": 31437 + }, + { + "epoch": 0.91, + "grad_norm": 1.331492739524392, + "learning_rate": 2.0237876920444543e-07, + "loss": 0.2625, + "step": 31438 + }, + { + "epoch": 0.91, + "grad_norm": 1.4260597835883073, + "learning_rate": 2.0224650847291526e-07, + "loss": 0.2618, + "step": 31439 + }, + { + "epoch": 0.91, + "grad_norm": 1.2996147909460574, + "learning_rate": 2.021142900816131e-07, + "loss": 0.2611, + "step": 31440 + }, + { + "epoch": 0.91, + "grad_norm": 1.3406793127252017, + "learning_rate": 2.019821140317052e-07, + "loss": 0.2761, + "step": 31441 + }, + { + "epoch": 0.91, + "grad_norm": 0.965007831593008, + "learning_rate": 2.01849980324359e-07, + "loss": 0.583, + "step": 31442 + }, + { + "epoch": 0.91, + "grad_norm": 1.1855072595730802, + "learning_rate": 2.0171788896073907e-07, + "loss": 0.2693, + "step": 31443 + }, + { + "epoch": 0.91, + "grad_norm": 1.2726480961099784, + "learning_rate": 2.015858399420123e-07, + "loss": 0.2667, + "step": 31444 + }, + { + "epoch": 0.91, + "grad_norm": 1.8287151391543293, + "learning_rate": 2.0145383326934388e-07, + "loss": 0.25, + "step": 31445 + }, + { + "epoch": 0.91, + "grad_norm": 1.2797389877805077, + "learning_rate": 2.0132186894389893e-07, + "loss": 0.2651, + "step": 31446 + }, + { + "epoch": 0.91, + "grad_norm": 1.5099092127765694, + "learning_rate": 2.0118994696684103e-07, + "loss": 0.2661, + "step": 31447 + }, + { + "epoch": 0.91, + "grad_norm": 1.4792094740267419, + "learning_rate": 2.0105806733933586e-07, + "loss": 0.2738, + "step": 31448 + }, + { + "epoch": 0.91, + "grad_norm": 1.3096854308306891, + "learning_rate": 2.0092623006254642e-07, + "loss": 0.2712, + "step": 31449 + }, + { + "epoch": 0.91, + "grad_norm": 1.3958890770348167, + "learning_rate": 2.0079443513763564e-07, + "loss": 0.2645, + "step": 31450 + }, + { + "epoch": 0.91, + "grad_norm": 1.4173458569539374, + "learning_rate": 2.006626825657676e-07, + "loss": 0.2699, + "step": 31451 + }, + { + "epoch": 0.91, + "grad_norm": 1.958775698627264, + "learning_rate": 2.0053097234810414e-07, + "loss": 0.2783, + "step": 31452 + }, + { + "epoch": 0.91, + "grad_norm": 1.27006676618605, + "learning_rate": 2.0039930448580824e-07, + "loss": 0.2541, + "step": 31453 + }, + { + "epoch": 0.91, + "grad_norm": 3.094968073426766, + "learning_rate": 2.0026767898004174e-07, + "loss": 0.2578, + "step": 31454 + }, + { + "epoch": 0.91, + "grad_norm": 0.9289305784705649, + "learning_rate": 2.0013609583196647e-07, + "loss": 0.6054, + "step": 31455 + }, + { + "epoch": 0.91, + "grad_norm": 1.3327121593817235, + "learning_rate": 2.0000455504274374e-07, + "loss": 0.2648, + "step": 31456 + }, + { + "epoch": 0.91, + "grad_norm": 1.3895663105852103, + "learning_rate": 1.9987305661353373e-07, + "loss": 0.2687, + "step": 31457 + }, + { + "epoch": 0.91, + "grad_norm": 1.3006338523294378, + "learning_rate": 1.9974160054549718e-07, + "loss": 0.2645, + "step": 31458 + }, + { + "epoch": 0.91, + "grad_norm": 1.4417926646547756, + "learning_rate": 1.9961018683979482e-07, + "loss": 0.2904, + "step": 31459 + }, + { + "epoch": 0.91, + "grad_norm": 1.343447621486742, + "learning_rate": 1.994788154975852e-07, + "loss": 0.2539, + "step": 31460 + }, + { + "epoch": 0.91, + "grad_norm": 1.6010739410960522, + "learning_rate": 1.9934748652002845e-07, + "loss": 0.273, + "step": 31461 + }, + { + "epoch": 0.91, + "grad_norm": 1.4166933371403783, + "learning_rate": 1.9921619990828313e-07, + "loss": 0.2991, + "step": 31462 + }, + { + "epoch": 0.91, + "grad_norm": 1.5679861318636437, + "learning_rate": 1.9908495566350829e-07, + "loss": 0.2626, + "step": 31463 + }, + { + "epoch": 0.91, + "grad_norm": 1.3032339531460044, + "learning_rate": 1.9895375378686131e-07, + "loss": 0.2799, + "step": 31464 + }, + { + "epoch": 0.91, + "grad_norm": 1.3015592093237074, + "learning_rate": 1.9882259427950135e-07, + "loss": 0.3082, + "step": 31465 + }, + { + "epoch": 0.91, + "grad_norm": 1.3176067447793438, + "learning_rate": 1.9869147714258518e-07, + "loss": 0.2665, + "step": 31466 + }, + { + "epoch": 0.91, + "grad_norm": 1.3198362586725385, + "learning_rate": 1.985604023772708e-07, + "loss": 0.2819, + "step": 31467 + }, + { + "epoch": 0.91, + "grad_norm": 1.4543743011014036, + "learning_rate": 1.984293699847134e-07, + "loss": 0.2536, + "step": 31468 + }, + { + "epoch": 0.91, + "grad_norm": 1.3761135952759171, + "learning_rate": 1.9829837996606981e-07, + "loss": 0.2666, + "step": 31469 + }, + { + "epoch": 0.91, + "grad_norm": 3.7041242273834656, + "learning_rate": 1.9816743232249637e-07, + "loss": 0.2811, + "step": 31470 + }, + { + "epoch": 0.91, + "grad_norm": 1.792572595084921, + "learning_rate": 1.9803652705514875e-07, + "loss": 0.3125, + "step": 31471 + }, + { + "epoch": 0.91, + "grad_norm": 1.4539525854649598, + "learning_rate": 1.979056641651822e-07, + "loss": 0.254, + "step": 31472 + }, + { + "epoch": 0.91, + "grad_norm": 1.4523103146098635, + "learning_rate": 1.9777484365375077e-07, + "loss": 0.2863, + "step": 31473 + }, + { + "epoch": 0.91, + "grad_norm": 1.5189816462265955, + "learning_rate": 1.976440655220102e-07, + "loss": 0.2844, + "step": 31474 + }, + { + "epoch": 0.91, + "grad_norm": 1.3417639034237165, + "learning_rate": 1.9751332977111402e-07, + "loss": 0.2784, + "step": 31475 + }, + { + "epoch": 0.91, + "grad_norm": 1.5032627811307244, + "learning_rate": 1.9738263640221634e-07, + "loss": 0.2759, + "step": 31476 + }, + { + "epoch": 0.91, + "grad_norm": 1.7857575253950821, + "learning_rate": 1.9725198541647006e-07, + "loss": 0.2889, + "step": 31477 + }, + { + "epoch": 0.91, + "grad_norm": 1.8256588520708559, + "learning_rate": 1.9712137681502764e-07, + "loss": 0.2602, + "step": 31478 + }, + { + "epoch": 0.91, + "grad_norm": 1.343278592112497, + "learning_rate": 1.9699081059904258e-07, + "loss": 0.2564, + "step": 31479 + }, + { + "epoch": 0.91, + "grad_norm": 1.3202257017789067, + "learning_rate": 1.968602867696673e-07, + "loss": 0.263, + "step": 31480 + }, + { + "epoch": 0.91, + "grad_norm": 1.528980622835728, + "learning_rate": 1.9672980532805252e-07, + "loss": 0.2755, + "step": 31481 + }, + { + "epoch": 0.91, + "grad_norm": 1.223163887406764, + "learning_rate": 1.9659936627535125e-07, + "loss": 0.2674, + "step": 31482 + }, + { + "epoch": 0.91, + "grad_norm": 1.2415281365921689, + "learning_rate": 1.9646896961271312e-07, + "loss": 0.2788, + "step": 31483 + }, + { + "epoch": 0.91, + "grad_norm": 1.3988673931518356, + "learning_rate": 1.9633861534129107e-07, + "loss": 0.2848, + "step": 31484 + }, + { + "epoch": 0.91, + "grad_norm": 1.3757431853683308, + "learning_rate": 1.9620830346223252e-07, + "loss": 0.2897, + "step": 31485 + }, + { + "epoch": 0.91, + "grad_norm": 1.3330316405823142, + "learning_rate": 1.9607803397669046e-07, + "loss": 0.259, + "step": 31486 + }, + { + "epoch": 0.91, + "grad_norm": 1.5843727280986526, + "learning_rate": 1.9594780688581172e-07, + "loss": 0.2769, + "step": 31487 + }, + { + "epoch": 0.91, + "grad_norm": 1.4200481923204016, + "learning_rate": 1.9581762219074707e-07, + "loss": 0.2874, + "step": 31488 + }, + { + "epoch": 0.91, + "grad_norm": 1.3243789210955068, + "learning_rate": 1.9568747989264558e-07, + "loss": 0.2784, + "step": 31489 + }, + { + "epoch": 0.91, + "grad_norm": 0.8873821702500558, + "learning_rate": 1.9555737999265467e-07, + "loss": 0.5624, + "step": 31490 + }, + { + "epoch": 0.91, + "grad_norm": 1.402274491494559, + "learning_rate": 1.95427322491924e-07, + "loss": 0.2684, + "step": 31491 + }, + { + "epoch": 0.91, + "grad_norm": 1.3166535338681142, + "learning_rate": 1.952973073915998e-07, + "loss": 0.2552, + "step": 31492 + }, + { + "epoch": 0.91, + "grad_norm": 1.3514206230957104, + "learning_rate": 1.9516733469283066e-07, + "loss": 0.2778, + "step": 31493 + }, + { + "epoch": 0.91, + "grad_norm": 1.716616852671194, + "learning_rate": 1.950374043967629e-07, + "loss": 0.3108, + "step": 31494 + }, + { + "epoch": 0.91, + "grad_norm": 1.607296430447535, + "learning_rate": 1.9490751650454386e-07, + "loss": 0.2782, + "step": 31495 + }, + { + "epoch": 0.91, + "grad_norm": 1.3657583563494584, + "learning_rate": 1.947776710173188e-07, + "loss": 0.2464, + "step": 31496 + }, + { + "epoch": 0.91, + "grad_norm": 1.2418008193633612, + "learning_rate": 1.9464786793623402e-07, + "loss": 0.2605, + "step": 31497 + }, + { + "epoch": 0.91, + "grad_norm": 1.3995249252637252, + "learning_rate": 1.9451810726243525e-07, + "loss": 0.2796, + "step": 31498 + }, + { + "epoch": 0.91, + "grad_norm": 1.2208915651448338, + "learning_rate": 1.943883889970677e-07, + "loss": 0.2609, + "step": 31499 + }, + { + "epoch": 0.91, + "grad_norm": 1.3368194131369424, + "learning_rate": 1.9425871314127542e-07, + "loss": 0.2688, + "step": 31500 + }, + { + "epoch": 0.91, + "grad_norm": 1.4294626470295075, + "learning_rate": 1.9412907969620364e-07, + "loss": 0.2808, + "step": 31501 + }, + { + "epoch": 0.91, + "grad_norm": 1.6100708554616363, + "learning_rate": 1.939994886629959e-07, + "loss": 0.2591, + "step": 31502 + }, + { + "epoch": 0.91, + "grad_norm": 1.3122656466951117, + "learning_rate": 1.938699400427968e-07, + "loss": 0.2768, + "step": 31503 + }, + { + "epoch": 0.91, + "grad_norm": 1.3234673401878134, + "learning_rate": 1.9374043383674769e-07, + "loss": 0.3074, + "step": 31504 + }, + { + "epoch": 0.91, + "grad_norm": 1.5337642264609526, + "learning_rate": 1.9361097004599317e-07, + "loss": 0.2971, + "step": 31505 + }, + { + "epoch": 0.91, + "grad_norm": 1.5763932299148031, + "learning_rate": 1.9348154867167456e-07, + "loss": 0.2714, + "step": 31506 + }, + { + "epoch": 0.91, + "grad_norm": 2.578007816134049, + "learning_rate": 1.9335216971493488e-07, + "loss": 0.2364, + "step": 31507 + }, + { + "epoch": 0.91, + "grad_norm": 1.385278383732548, + "learning_rate": 1.9322283317691592e-07, + "loss": 0.2742, + "step": 31508 + }, + { + "epoch": 0.91, + "grad_norm": 1.319248150547913, + "learning_rate": 1.930935390587585e-07, + "loss": 0.266, + "step": 31509 + }, + { + "epoch": 0.91, + "grad_norm": 1.385749760977806, + "learning_rate": 1.9296428736160389e-07, + "loss": 0.2671, + "step": 31510 + }, + { + "epoch": 0.91, + "grad_norm": 1.3283059938584474, + "learning_rate": 1.9283507808659285e-07, + "loss": 0.2641, + "step": 31511 + }, + { + "epoch": 0.91, + "grad_norm": 1.2710375898084758, + "learning_rate": 1.9270591123486614e-07, + "loss": 0.252, + "step": 31512 + }, + { + "epoch": 0.91, + "grad_norm": 2.0066233615659748, + "learning_rate": 1.9257678680756342e-07, + "loss": 0.2761, + "step": 31513 + }, + { + "epoch": 0.91, + "grad_norm": 1.2230728117923155, + "learning_rate": 1.9244770480582264e-07, + "loss": 0.2512, + "step": 31514 + }, + { + "epoch": 0.91, + "grad_norm": 1.3366290208116, + "learning_rate": 1.9231866523078457e-07, + "loss": 0.2586, + "step": 31515 + }, + { + "epoch": 0.91, + "grad_norm": 1.9647269984204379, + "learning_rate": 1.9218966808358774e-07, + "loss": 0.2722, + "step": 31516 + }, + { + "epoch": 0.91, + "grad_norm": 1.2515220412869936, + "learning_rate": 1.9206071336537013e-07, + "loss": 0.2922, + "step": 31517 + }, + { + "epoch": 0.91, + "grad_norm": 2.4869246354233097, + "learning_rate": 1.9193180107727028e-07, + "loss": 0.2591, + "step": 31518 + }, + { + "epoch": 0.91, + "grad_norm": 1.3203298018945155, + "learning_rate": 1.9180293122042558e-07, + "loss": 0.2631, + "step": 31519 + }, + { + "epoch": 0.91, + "grad_norm": 1.4198816693768965, + "learning_rate": 1.916741037959735e-07, + "loss": 0.2883, + "step": 31520 + }, + { + "epoch": 0.91, + "grad_norm": 1.2520888792559561, + "learning_rate": 1.9154531880505034e-07, + "loss": 0.2788, + "step": 31521 + }, + { + "epoch": 0.91, + "grad_norm": 1.2949263003210099, + "learning_rate": 1.9141657624879406e-07, + "loss": 0.2589, + "step": 31522 + }, + { + "epoch": 0.91, + "grad_norm": 1.3067193240384425, + "learning_rate": 1.9128787612833932e-07, + "loss": 0.279, + "step": 31523 + }, + { + "epoch": 0.91, + "grad_norm": 1.572891861515246, + "learning_rate": 1.911592184448219e-07, + "loss": 0.289, + "step": 31524 + }, + { + "epoch": 0.91, + "grad_norm": 2.3416933581773525, + "learning_rate": 1.9103060319937806e-07, + "loss": 0.2961, + "step": 31525 + }, + { + "epoch": 0.91, + "grad_norm": 1.2728415393101344, + "learning_rate": 1.9090203039314248e-07, + "loss": 0.258, + "step": 31526 + }, + { + "epoch": 0.91, + "grad_norm": 1.507970559463657, + "learning_rate": 1.9077350002724982e-07, + "loss": 0.2745, + "step": 31527 + }, + { + "epoch": 0.91, + "grad_norm": 1.367828491126245, + "learning_rate": 1.906450121028347e-07, + "loss": 0.2694, + "step": 31528 + }, + { + "epoch": 0.91, + "grad_norm": 1.1837253294502237, + "learning_rate": 1.9051656662103013e-07, + "loss": 0.2445, + "step": 31529 + }, + { + "epoch": 0.91, + "grad_norm": 1.437657226666444, + "learning_rate": 1.9038816358297075e-07, + "loss": 0.2758, + "step": 31530 + }, + { + "epoch": 0.91, + "grad_norm": 1.2838842180624916, + "learning_rate": 1.9025980298978953e-07, + "loss": 0.2642, + "step": 31531 + }, + { + "epoch": 0.91, + "grad_norm": 1.343047886464757, + "learning_rate": 1.9013148484261835e-07, + "loss": 0.2864, + "step": 31532 + }, + { + "epoch": 0.91, + "grad_norm": 1.3802735032623383, + "learning_rate": 1.900032091425902e-07, + "loss": 0.2743, + "step": 31533 + }, + { + "epoch": 0.91, + "grad_norm": 1.4449518140624686, + "learning_rate": 1.8987497589083693e-07, + "loss": 0.2689, + "step": 31534 + }, + { + "epoch": 0.91, + "grad_norm": 1.2992729102349798, + "learning_rate": 1.8974678508849043e-07, + "loss": 0.2845, + "step": 31535 + }, + { + "epoch": 0.91, + "grad_norm": 1.2687564066186992, + "learning_rate": 1.8961863673668203e-07, + "loss": 0.2595, + "step": 31536 + }, + { + "epoch": 0.91, + "grad_norm": 1.531394074339337, + "learning_rate": 1.894905308365419e-07, + "loss": 0.2724, + "step": 31537 + }, + { + "epoch": 0.91, + "grad_norm": 1.437375836350898, + "learning_rate": 1.8936246738920195e-07, + "loss": 0.2992, + "step": 31538 + }, + { + "epoch": 0.91, + "grad_norm": 1.3832301714355724, + "learning_rate": 1.892344463957918e-07, + "loss": 0.26, + "step": 31539 + }, + { + "epoch": 0.91, + "grad_norm": 1.4074505154885044, + "learning_rate": 1.8910646785744057e-07, + "loss": 0.2723, + "step": 31540 + }, + { + "epoch": 0.91, + "grad_norm": 1.6757800604725501, + "learning_rate": 1.8897853177527902e-07, + "loss": 0.2935, + "step": 31541 + }, + { + "epoch": 0.91, + "grad_norm": 1.3930918093642792, + "learning_rate": 1.8885063815043457e-07, + "loss": 0.2973, + "step": 31542 + }, + { + "epoch": 0.91, + "grad_norm": 1.656068829618521, + "learning_rate": 1.8872278698403635e-07, + "loss": 0.2841, + "step": 31543 + }, + { + "epoch": 0.91, + "grad_norm": 1.3139610118638725, + "learning_rate": 1.8859497827721284e-07, + "loss": 0.2649, + "step": 31544 + }, + { + "epoch": 0.91, + "grad_norm": 1.3525183689073443, + "learning_rate": 1.8846721203109208e-07, + "loss": 0.2665, + "step": 31545 + }, + { + "epoch": 0.91, + "grad_norm": 1.5844772882846705, + "learning_rate": 1.8833948824680203e-07, + "loss": 0.2681, + "step": 31546 + }, + { + "epoch": 0.92, + "grad_norm": 2.1330379965138904, + "learning_rate": 1.8821180692546902e-07, + "loss": 0.2718, + "step": 31547 + }, + { + "epoch": 0.92, + "grad_norm": 1.320747959493603, + "learning_rate": 1.8808416806821993e-07, + "loss": 0.2571, + "step": 31548 + }, + { + "epoch": 0.92, + "grad_norm": 1.3458753414131568, + "learning_rate": 1.879565716761822e-07, + "loss": 0.2579, + "step": 31549 + }, + { + "epoch": 0.92, + "grad_norm": 1.3896212583879712, + "learning_rate": 1.8782901775048045e-07, + "loss": 0.271, + "step": 31550 + }, + { + "epoch": 0.92, + "grad_norm": 1.8929656816374754, + "learning_rate": 1.877015062922416e-07, + "loss": 0.2491, + "step": 31551 + }, + { + "epoch": 0.92, + "grad_norm": 1.3244586400340825, + "learning_rate": 1.8757403730258972e-07, + "loss": 0.2864, + "step": 31552 + }, + { + "epoch": 0.92, + "grad_norm": 1.9012898887881424, + "learning_rate": 1.874466107826506e-07, + "loss": 0.265, + "step": 31553 + }, + { + "epoch": 0.92, + "grad_norm": 1.3797541405342242, + "learning_rate": 1.873192267335483e-07, + "loss": 0.2773, + "step": 31554 + }, + { + "epoch": 0.92, + "grad_norm": 1.4487936957442535, + "learning_rate": 1.8719188515640752e-07, + "loss": 0.2996, + "step": 31555 + }, + { + "epoch": 0.92, + "grad_norm": 1.4448171227279905, + "learning_rate": 1.8706458605235124e-07, + "loss": 0.2693, + "step": 31556 + }, + { + "epoch": 0.92, + "grad_norm": 0.9941253896069098, + "learning_rate": 1.869373294225041e-07, + "loss": 0.6079, + "step": 31557 + }, + { + "epoch": 0.92, + "grad_norm": 1.6155366652888068, + "learning_rate": 1.8681011526798853e-07, + "loss": 0.2765, + "step": 31558 + }, + { + "epoch": 0.92, + "grad_norm": 1.5544451783063458, + "learning_rate": 1.8668294358992646e-07, + "loss": 0.2932, + "step": 31559 + }, + { + "epoch": 0.92, + "grad_norm": 1.5727126220309107, + "learning_rate": 1.8655581438944138e-07, + "loss": 0.2995, + "step": 31560 + }, + { + "epoch": 0.92, + "grad_norm": 1.7735436018789645, + "learning_rate": 1.8642872766765408e-07, + "loss": 0.2645, + "step": 31561 + }, + { + "epoch": 0.92, + "grad_norm": 1.3412826687768338, + "learning_rate": 1.8630168342568645e-07, + "loss": 0.2658, + "step": 31562 + }, + { + "epoch": 0.92, + "grad_norm": 1.3013954520558442, + "learning_rate": 1.8617468166466034e-07, + "loss": 0.2856, + "step": 31563 + }, + { + "epoch": 0.92, + "grad_norm": 1.8850057323560585, + "learning_rate": 1.8604772238569603e-07, + "loss": 0.2799, + "step": 31564 + }, + { + "epoch": 0.92, + "grad_norm": 1.3916665289690915, + "learning_rate": 1.8592080558991366e-07, + "loss": 0.2527, + "step": 31565 + }, + { + "epoch": 0.92, + "grad_norm": 1.3026230206939038, + "learning_rate": 1.8579393127843404e-07, + "loss": 0.266, + "step": 31566 + }, + { + "epoch": 0.92, + "grad_norm": 1.2801225715829474, + "learning_rate": 1.8566709945237682e-07, + "loss": 0.264, + "step": 31567 + }, + { + "epoch": 0.92, + "grad_norm": 1.3101418454372569, + "learning_rate": 1.8554031011286e-07, + "loss": 0.2763, + "step": 31568 + }, + { + "epoch": 0.92, + "grad_norm": 1.422994055631658, + "learning_rate": 1.8541356326100436e-07, + "loss": 0.2672, + "step": 31569 + }, + { + "epoch": 0.92, + "grad_norm": 1.4119869166890342, + "learning_rate": 1.8528685889792618e-07, + "loss": 0.2844, + "step": 31570 + }, + { + "epoch": 0.92, + "grad_norm": 2.140696093363142, + "learning_rate": 1.8516019702474575e-07, + "loss": 0.252, + "step": 31571 + }, + { + "epoch": 0.92, + "grad_norm": 1.711758958111953, + "learning_rate": 1.8503357764257935e-07, + "loss": 0.2661, + "step": 31572 + }, + { + "epoch": 0.92, + "grad_norm": 1.4055484731546262, + "learning_rate": 1.8490700075254497e-07, + "loss": 0.2609, + "step": 31573 + }, + { + "epoch": 0.92, + "grad_norm": 1.3182194471846458, + "learning_rate": 1.8478046635576007e-07, + "loss": 0.2683, + "step": 31574 + }, + { + "epoch": 0.92, + "grad_norm": 1.959927609163234, + "learning_rate": 1.84653974453341e-07, + "loss": 0.2687, + "step": 31575 + }, + { + "epoch": 0.92, + "grad_norm": 1.3175293433564925, + "learning_rate": 1.8452752504640403e-07, + "loss": 0.2851, + "step": 31576 + }, + { + "epoch": 0.92, + "grad_norm": 1.2793867854246985, + "learning_rate": 1.8440111813606554e-07, + "loss": 0.267, + "step": 31577 + }, + { + "epoch": 0.92, + "grad_norm": 0.9822073957262096, + "learning_rate": 1.8427475372344018e-07, + "loss": 0.5551, + "step": 31578 + }, + { + "epoch": 0.92, + "grad_norm": 1.411867499660722, + "learning_rate": 1.8414843180964316e-07, + "loss": 0.2604, + "step": 31579 + }, + { + "epoch": 0.92, + "grad_norm": 1.3778256019455375, + "learning_rate": 1.840221523957897e-07, + "loss": 0.2734, + "step": 31580 + }, + { + "epoch": 0.92, + "grad_norm": 1.8795932138291738, + "learning_rate": 1.8389591548299447e-07, + "loss": 0.2604, + "step": 31581 + }, + { + "epoch": 0.92, + "grad_norm": 1.2620818980850903, + "learning_rate": 1.83769721072371e-07, + "loss": 0.2748, + "step": 31582 + }, + { + "epoch": 0.92, + "grad_norm": 1.4529588872153427, + "learning_rate": 1.8364356916503346e-07, + "loss": 0.2675, + "step": 31583 + }, + { + "epoch": 0.92, + "grad_norm": 1.378327321212167, + "learning_rate": 1.835174597620948e-07, + "loss": 0.2794, + "step": 31584 + }, + { + "epoch": 0.92, + "grad_norm": 1.4446711021204508, + "learning_rate": 1.83391392864668e-07, + "loss": 0.2486, + "step": 31585 + }, + { + "epoch": 0.92, + "grad_norm": 1.2663293082194358, + "learning_rate": 1.8326536847386555e-07, + "loss": 0.2654, + "step": 31586 + }, + { + "epoch": 0.92, + "grad_norm": 1.4694056214824933, + "learning_rate": 1.8313938659079988e-07, + "loss": 0.2673, + "step": 31587 + }, + { + "epoch": 0.92, + "grad_norm": 1.284553730411811, + "learning_rate": 1.8301344721658175e-07, + "loss": 0.2744, + "step": 31588 + }, + { + "epoch": 0.92, + "grad_norm": 1.7279172286697642, + "learning_rate": 1.828875503523242e-07, + "loss": 0.2702, + "step": 31589 + }, + { + "epoch": 0.92, + "grad_norm": 1.7007101893267769, + "learning_rate": 1.8276169599913684e-07, + "loss": 0.276, + "step": 31590 + }, + { + "epoch": 0.92, + "grad_norm": 1.2495079777387488, + "learning_rate": 1.826358841581316e-07, + "loss": 0.2524, + "step": 31591 + }, + { + "epoch": 0.92, + "grad_norm": 1.3643805478179762, + "learning_rate": 1.8251011483041758e-07, + "loss": 0.2839, + "step": 31592 + }, + { + "epoch": 0.92, + "grad_norm": 1.440823729935159, + "learning_rate": 1.8238438801710557e-07, + "loss": 0.2765, + "step": 31593 + }, + { + "epoch": 0.92, + "grad_norm": 1.2493955763529407, + "learning_rate": 1.822587037193052e-07, + "loss": 0.2951, + "step": 31594 + }, + { + "epoch": 0.92, + "grad_norm": 1.822347779078548, + "learning_rate": 1.8213306193812453e-07, + "loss": 0.2639, + "step": 31595 + }, + { + "epoch": 0.92, + "grad_norm": 1.338293923068045, + "learning_rate": 1.8200746267467318e-07, + "loss": 0.2477, + "step": 31596 + }, + { + "epoch": 0.92, + "grad_norm": 1.3277758980834082, + "learning_rate": 1.818819059300597e-07, + "loss": 0.2768, + "step": 31597 + }, + { + "epoch": 0.92, + "grad_norm": 0.9431390000826927, + "learning_rate": 1.8175639170539161e-07, + "loss": 0.6205, + "step": 31598 + }, + { + "epoch": 0.92, + "grad_norm": 1.2311715236089482, + "learning_rate": 1.8163092000177686e-07, + "loss": 0.273, + "step": 31599 + }, + { + "epoch": 0.92, + "grad_norm": 1.2652195565520943, + "learning_rate": 1.815054908203223e-07, + "loss": 0.2817, + "step": 31600 + }, + { + "epoch": 0.92, + "grad_norm": 1.2037722681129415, + "learning_rate": 1.8138010416213548e-07, + "loss": 0.2539, + "step": 31601 + }, + { + "epoch": 0.92, + "grad_norm": 1.2714217088034423, + "learning_rate": 1.8125476002832321e-07, + "loss": 0.2654, + "step": 31602 + }, + { + "epoch": 0.92, + "grad_norm": 1.5055777587435146, + "learning_rate": 1.811294584199902e-07, + "loss": 0.2964, + "step": 31603 + }, + { + "epoch": 0.92, + "grad_norm": 1.3538263681723492, + "learning_rate": 1.8100419933824443e-07, + "loss": 0.2562, + "step": 31604 + }, + { + "epoch": 0.92, + "grad_norm": 1.218539140396059, + "learning_rate": 1.8087898278418948e-07, + "loss": 0.249, + "step": 31605 + }, + { + "epoch": 0.92, + "grad_norm": 1.5253251558815009, + "learning_rate": 1.8075380875893056e-07, + "loss": 0.2846, + "step": 31606 + }, + { + "epoch": 0.92, + "grad_norm": 1.4821121675851614, + "learning_rate": 1.8062867726357292e-07, + "loss": 0.2645, + "step": 31607 + }, + { + "epoch": 0.92, + "grad_norm": 1.2651454996977978, + "learning_rate": 1.8050358829922066e-07, + "loss": 0.2603, + "step": 31608 + }, + { + "epoch": 0.92, + "grad_norm": 1.6319765980514953, + "learning_rate": 1.803785418669779e-07, + "loss": 0.266, + "step": 31609 + }, + { + "epoch": 0.92, + "grad_norm": 1.289237259334171, + "learning_rate": 1.8025353796794765e-07, + "loss": 0.2605, + "step": 31610 + }, + { + "epoch": 0.92, + "grad_norm": 1.4050811643012868, + "learning_rate": 1.8012857660323347e-07, + "loss": 0.2853, + "step": 31611 + }, + { + "epoch": 0.92, + "grad_norm": 1.4577900794441734, + "learning_rate": 1.800036577739378e-07, + "loss": 0.2546, + "step": 31612 + }, + { + "epoch": 0.92, + "grad_norm": 1.5472247830997106, + "learning_rate": 1.7987878148116423e-07, + "loss": 0.258, + "step": 31613 + }, + { + "epoch": 0.92, + "grad_norm": 1.279326037541556, + "learning_rate": 1.7975394772601351e-07, + "loss": 0.258, + "step": 31614 + }, + { + "epoch": 0.92, + "grad_norm": 1.249391677205768, + "learning_rate": 1.7962915650958758e-07, + "loss": 0.2698, + "step": 31615 + }, + { + "epoch": 0.92, + "grad_norm": 0.9965329897190456, + "learning_rate": 1.7950440783298772e-07, + "loss": 0.6022, + "step": 31616 + }, + { + "epoch": 0.92, + "grad_norm": 1.30693330680212, + "learning_rate": 1.7937970169731477e-07, + "loss": 0.2912, + "step": 31617 + }, + { + "epoch": 0.92, + "grad_norm": 1.3252638731893296, + "learning_rate": 1.792550381036695e-07, + "loss": 0.2891, + "step": 31618 + }, + { + "epoch": 0.92, + "grad_norm": 1.4273426111106888, + "learning_rate": 1.7913041705315216e-07, + "loss": 0.2753, + "step": 31619 + }, + { + "epoch": 0.92, + "grad_norm": 1.6139344499603443, + "learning_rate": 1.7900583854686237e-07, + "loss": 0.2657, + "step": 31620 + }, + { + "epoch": 0.92, + "grad_norm": 1.3103092524186017, + "learning_rate": 1.7888130258589987e-07, + "loss": 0.2762, + "step": 31621 + }, + { + "epoch": 0.92, + "grad_norm": 0.9788744619193711, + "learning_rate": 1.7875680917136317e-07, + "loss": 0.5974, + "step": 31622 + }, + { + "epoch": 0.92, + "grad_norm": 1.3887540060938155, + "learning_rate": 1.7863235830435089e-07, + "loss": 0.2705, + "step": 31623 + }, + { + "epoch": 0.92, + "grad_norm": 1.2277831840385507, + "learning_rate": 1.7850794998596154e-07, + "loss": 0.2642, + "step": 31624 + }, + { + "epoch": 0.92, + "grad_norm": 1.3957200464028043, + "learning_rate": 1.7838358421729375e-07, + "loss": 0.2785, + "step": 31625 + }, + { + "epoch": 0.92, + "grad_norm": 1.2934193128461469, + "learning_rate": 1.7825926099944324e-07, + "loss": 0.245, + "step": 31626 + }, + { + "epoch": 0.92, + "grad_norm": 1.3813955478781599, + "learning_rate": 1.7813498033350862e-07, + "loss": 0.257, + "step": 31627 + }, + { + "epoch": 0.92, + "grad_norm": 1.513061755552366, + "learning_rate": 1.7801074222058623e-07, + "loss": 0.27, + "step": 31628 + }, + { + "epoch": 0.92, + "grad_norm": 1.3961117488532242, + "learning_rate": 1.7788654666177242e-07, + "loss": 0.2903, + "step": 31629 + }, + { + "epoch": 0.92, + "grad_norm": 1.3610891455803362, + "learning_rate": 1.777623936581635e-07, + "loss": 0.305, + "step": 31630 + }, + { + "epoch": 0.92, + "grad_norm": 1.4887120958967814, + "learning_rate": 1.776382832108553e-07, + "loss": 0.2739, + "step": 31631 + }, + { + "epoch": 0.92, + "grad_norm": 1.3744169476351786, + "learning_rate": 1.7751421532094247e-07, + "loss": 0.2733, + "step": 31632 + }, + { + "epoch": 0.92, + "grad_norm": 1.401953060126466, + "learning_rate": 1.773901899895203e-07, + "loss": 0.2708, + "step": 31633 + }, + { + "epoch": 0.92, + "grad_norm": 1.436760501865262, + "learning_rate": 1.7726620721768284e-07, + "loss": 0.2514, + "step": 31634 + }, + { + "epoch": 0.92, + "grad_norm": 1.2736829322881578, + "learning_rate": 1.771422670065248e-07, + "loss": 0.274, + "step": 31635 + }, + { + "epoch": 0.92, + "grad_norm": 1.4240966953774232, + "learning_rate": 1.770183693571398e-07, + "loss": 0.275, + "step": 31636 + }, + { + "epoch": 0.92, + "grad_norm": 1.2468203646360334, + "learning_rate": 1.7689451427062076e-07, + "loss": 0.2494, + "step": 31637 + }, + { + "epoch": 0.92, + "grad_norm": 1.4018877792715254, + "learning_rate": 1.7677070174806187e-07, + "loss": 0.2962, + "step": 31638 + }, + { + "epoch": 0.92, + "grad_norm": 1.4826078435604437, + "learning_rate": 1.7664693179055448e-07, + "loss": 0.2836, + "step": 31639 + }, + { + "epoch": 0.92, + "grad_norm": 2.355624280871888, + "learning_rate": 1.7652320439919212e-07, + "loss": 0.2878, + "step": 31640 + }, + { + "epoch": 0.92, + "grad_norm": 0.9754670572945687, + "learning_rate": 1.7639951957506562e-07, + "loss": 0.5177, + "step": 31641 + }, + { + "epoch": 0.92, + "grad_norm": 1.4074819568313548, + "learning_rate": 1.7627587731926687e-07, + "loss": 0.2704, + "step": 31642 + }, + { + "epoch": 0.92, + "grad_norm": 1.2669195020819852, + "learning_rate": 1.7615227763288666e-07, + "loss": 0.2802, + "step": 31643 + }, + { + "epoch": 0.92, + "grad_norm": 1.9324301069673873, + "learning_rate": 1.7602872051701635e-07, + "loss": 0.2655, + "step": 31644 + }, + { + "epoch": 0.92, + "grad_norm": 1.53610654537265, + "learning_rate": 1.7590520597274618e-07, + "loss": 0.2592, + "step": 31645 + }, + { + "epoch": 0.92, + "grad_norm": 2.167361021754859, + "learning_rate": 1.7578173400116583e-07, + "loss": 0.2672, + "step": 31646 + }, + { + "epoch": 0.92, + "grad_norm": 1.4506791325371795, + "learning_rate": 1.7565830460336498e-07, + "loss": 0.3103, + "step": 31647 + }, + { + "epoch": 0.92, + "grad_norm": 1.586234204987964, + "learning_rate": 1.7553491778043385e-07, + "loss": 0.27, + "step": 31648 + }, + { + "epoch": 0.92, + "grad_norm": 1.5606625100363571, + "learning_rate": 1.754115735334605e-07, + "loss": 0.293, + "step": 31649 + }, + { + "epoch": 0.92, + "grad_norm": 1.360406351371384, + "learning_rate": 1.7528827186353347e-07, + "loss": 0.2639, + "step": 31650 + }, + { + "epoch": 0.92, + "grad_norm": 1.3065978928523965, + "learning_rate": 1.7516501277174025e-07, + "loss": 0.2728, + "step": 31651 + }, + { + "epoch": 0.92, + "grad_norm": 1.4002095443164353, + "learning_rate": 1.7504179625916994e-07, + "loss": 0.2542, + "step": 31652 + }, + { + "epoch": 0.92, + "grad_norm": 1.273312791713774, + "learning_rate": 1.7491862232691005e-07, + "loss": 0.2575, + "step": 31653 + }, + { + "epoch": 0.92, + "grad_norm": 1.3786208212881566, + "learning_rate": 1.7479549097604575e-07, + "loss": 0.2836, + "step": 31654 + }, + { + "epoch": 0.92, + "grad_norm": 1.3896236154494894, + "learning_rate": 1.7467240220766513e-07, + "loss": 0.2807, + "step": 31655 + }, + { + "epoch": 0.92, + "grad_norm": 2.1685833002671413, + "learning_rate": 1.7454935602285393e-07, + "loss": 0.265, + "step": 31656 + }, + { + "epoch": 0.92, + "grad_norm": 1.4075650989445958, + "learning_rate": 1.74426352422698e-07, + "loss": 0.286, + "step": 31657 + }, + { + "epoch": 0.92, + "grad_norm": 1.3677800764817734, + "learning_rate": 1.7430339140828313e-07, + "loss": 0.2694, + "step": 31658 + }, + { + "epoch": 0.92, + "grad_norm": 1.577141018982839, + "learning_rate": 1.7418047298069507e-07, + "loss": 0.28, + "step": 31659 + }, + { + "epoch": 0.92, + "grad_norm": 1.3744049027417582, + "learning_rate": 1.7405759714101744e-07, + "loss": 0.2599, + "step": 31660 + }, + { + "epoch": 0.92, + "grad_norm": 1.5411741938734993, + "learning_rate": 1.7393476389033493e-07, + "loss": 0.2755, + "step": 31661 + }, + { + "epoch": 0.92, + "grad_norm": 1.6206646594162442, + "learning_rate": 1.738119732297311e-07, + "loss": 0.2447, + "step": 31662 + }, + { + "epoch": 0.92, + "grad_norm": 1.3807565709661074, + "learning_rate": 1.7368922516029062e-07, + "loss": 0.2696, + "step": 31663 + }, + { + "epoch": 0.92, + "grad_norm": 1.4735191052411372, + "learning_rate": 1.7356651968309656e-07, + "loss": 0.2585, + "step": 31664 + }, + { + "epoch": 0.92, + "grad_norm": 1.3645305786625008, + "learning_rate": 1.7344385679923137e-07, + "loss": 0.2609, + "step": 31665 + }, + { + "epoch": 0.92, + "grad_norm": 1.401221253196409, + "learning_rate": 1.733212365097775e-07, + "loss": 0.2705, + "step": 31666 + }, + { + "epoch": 0.92, + "grad_norm": 1.9445958825415934, + "learning_rate": 1.7319865881581745e-07, + "loss": 0.2677, + "step": 31667 + }, + { + "epoch": 0.92, + "grad_norm": 1.3383267381551722, + "learning_rate": 1.7307612371843307e-07, + "loss": 0.2783, + "step": 31668 + }, + { + "epoch": 0.92, + "grad_norm": 1.5913977833494322, + "learning_rate": 1.7295363121870524e-07, + "loss": 0.2624, + "step": 31669 + }, + { + "epoch": 0.92, + "grad_norm": 2.650898199492578, + "learning_rate": 1.7283118131771472e-07, + "loss": 0.2856, + "step": 31670 + }, + { + "epoch": 0.92, + "grad_norm": 1.2899382462177738, + "learning_rate": 1.7270877401654283e-07, + "loss": 0.2694, + "step": 31671 + }, + { + "epoch": 0.92, + "grad_norm": 1.2939436523395667, + "learning_rate": 1.7258640931626935e-07, + "loss": 0.2462, + "step": 31672 + }, + { + "epoch": 0.92, + "grad_norm": 0.959886043846489, + "learning_rate": 1.7246408721797448e-07, + "loss": 0.5805, + "step": 31673 + }, + { + "epoch": 0.92, + "grad_norm": 1.2759434332109167, + "learning_rate": 1.723418077227379e-07, + "loss": 0.2756, + "step": 31674 + }, + { + "epoch": 0.92, + "grad_norm": 1.2988831901010178, + "learning_rate": 1.722195708316382e-07, + "loss": 0.2581, + "step": 31675 + }, + { + "epoch": 0.92, + "grad_norm": 1.412628165238186, + "learning_rate": 1.720973765457551e-07, + "loss": 0.2636, + "step": 31676 + }, + { + "epoch": 0.92, + "grad_norm": 1.399056086420023, + "learning_rate": 1.7197522486616546e-07, + "loss": 0.2717, + "step": 31677 + }, + { + "epoch": 0.92, + "grad_norm": 1.298367461922204, + "learning_rate": 1.7185311579394793e-07, + "loss": 0.2723, + "step": 31678 + }, + { + "epoch": 0.92, + "grad_norm": 1.484764796450184, + "learning_rate": 1.717310493301799e-07, + "loss": 0.2908, + "step": 31679 + }, + { + "epoch": 0.92, + "grad_norm": 1.4656876013484799, + "learning_rate": 1.716090254759395e-07, + "loss": 0.2771, + "step": 31680 + }, + { + "epoch": 0.92, + "grad_norm": 1.7931640281788737, + "learning_rate": 1.7148704423230356e-07, + "loss": 0.2689, + "step": 31681 + }, + { + "epoch": 0.92, + "grad_norm": 1.3496149278254845, + "learning_rate": 1.7136510560034737e-07, + "loss": 0.2667, + "step": 31682 + }, + { + "epoch": 0.92, + "grad_norm": 1.511674269271546, + "learning_rate": 1.7124320958114725e-07, + "loss": 0.2693, + "step": 31683 + }, + { + "epoch": 0.92, + "grad_norm": 1.675404770045342, + "learning_rate": 1.7112135617577962e-07, + "loss": 0.2703, + "step": 31684 + }, + { + "epoch": 0.92, + "grad_norm": 1.1844442283553438, + "learning_rate": 1.709995453853197e-07, + "loss": 0.2562, + "step": 31685 + }, + { + "epoch": 0.92, + "grad_norm": 1.3005710687456202, + "learning_rate": 1.7087777721084274e-07, + "loss": 0.3033, + "step": 31686 + }, + { + "epoch": 0.92, + "grad_norm": 1.4800302148592048, + "learning_rate": 1.7075605165342292e-07, + "loss": 0.2764, + "step": 31687 + }, + { + "epoch": 0.92, + "grad_norm": 1.2892134636664587, + "learning_rate": 1.706343687141343e-07, + "loss": 0.2537, + "step": 31688 + }, + { + "epoch": 0.92, + "grad_norm": 1.4159902866690472, + "learning_rate": 1.7051272839405053e-07, + "loss": 0.2865, + "step": 31689 + }, + { + "epoch": 0.92, + "grad_norm": 1.408095296908771, + "learning_rate": 1.7039113069424628e-07, + "loss": 0.2622, + "step": 31690 + }, + { + "epoch": 0.92, + "grad_norm": 1.6587237651747806, + "learning_rate": 1.7026957561579295e-07, + "loss": 0.2699, + "step": 31691 + }, + { + "epoch": 0.92, + "grad_norm": 1.2565107275868919, + "learning_rate": 1.7014806315976461e-07, + "loss": 0.2806, + "step": 31692 + }, + { + "epoch": 0.92, + "grad_norm": 1.4933278652624342, + "learning_rate": 1.7002659332723326e-07, + "loss": 0.2571, + "step": 31693 + }, + { + "epoch": 0.92, + "grad_norm": 1.5280879773367453, + "learning_rate": 1.6990516611927077e-07, + "loss": 0.2406, + "step": 31694 + }, + { + "epoch": 0.92, + "grad_norm": 1.415953353332787, + "learning_rate": 1.6978378153694962e-07, + "loss": 0.2646, + "step": 31695 + }, + { + "epoch": 0.92, + "grad_norm": 1.672942941999934, + "learning_rate": 1.6966243958133955e-07, + "loss": 0.2731, + "step": 31696 + }, + { + "epoch": 0.92, + "grad_norm": 1.6734398540195676, + "learning_rate": 1.6954114025351132e-07, + "loss": 0.2666, + "step": 31697 + }, + { + "epoch": 0.92, + "grad_norm": 1.371442965682367, + "learning_rate": 1.6941988355453687e-07, + "loss": 0.3245, + "step": 31698 + }, + { + "epoch": 0.92, + "grad_norm": 1.5026392876324461, + "learning_rate": 1.6929866948548535e-07, + "loss": 0.2917, + "step": 31699 + }, + { + "epoch": 0.92, + "grad_norm": 1.331371143043977, + "learning_rate": 1.6917749804742645e-07, + "loss": 0.2577, + "step": 31700 + }, + { + "epoch": 0.92, + "grad_norm": 2.33112661733337, + "learning_rate": 1.6905636924142987e-07, + "loss": 0.2661, + "step": 31701 + }, + { + "epoch": 0.92, + "grad_norm": 1.4391620447859308, + "learning_rate": 1.689352830685642e-07, + "loss": 0.2777, + "step": 31702 + }, + { + "epoch": 0.92, + "grad_norm": 1.3339021660808374, + "learning_rate": 1.688142395298986e-07, + "loss": 0.2681, + "step": 31703 + }, + { + "epoch": 0.92, + "grad_norm": 1.228077675470296, + "learning_rate": 1.6869323862650168e-07, + "loss": 0.2444, + "step": 31704 + }, + { + "epoch": 0.92, + "grad_norm": 1.3334395170720594, + "learning_rate": 1.6857228035943972e-07, + "loss": 0.2638, + "step": 31705 + }, + { + "epoch": 0.92, + "grad_norm": 1.3826386300654796, + "learning_rate": 1.6845136472978085e-07, + "loss": 0.2579, + "step": 31706 + }, + { + "epoch": 0.92, + "grad_norm": 1.6815239983668482, + "learning_rate": 1.6833049173859196e-07, + "loss": 0.2464, + "step": 31707 + }, + { + "epoch": 0.92, + "grad_norm": 1.4068091787638557, + "learning_rate": 1.6820966138694052e-07, + "loss": 0.2702, + "step": 31708 + }, + { + "epoch": 0.92, + "grad_norm": 1.3465105925578673, + "learning_rate": 1.6808887367589288e-07, + "loss": 0.2729, + "step": 31709 + }, + { + "epoch": 0.92, + "grad_norm": 1.2385688507380976, + "learning_rate": 1.6796812860651434e-07, + "loss": 0.2518, + "step": 31710 + }, + { + "epoch": 0.92, + "grad_norm": 1.4130522256360485, + "learning_rate": 1.6784742617987016e-07, + "loss": 0.266, + "step": 31711 + }, + { + "epoch": 0.92, + "grad_norm": 1.3096628466722293, + "learning_rate": 1.6772676639702613e-07, + "loss": 0.2484, + "step": 31712 + }, + { + "epoch": 0.92, + "grad_norm": 1.3640300498784572, + "learning_rate": 1.6760614925904751e-07, + "loss": 0.2708, + "step": 31713 + }, + { + "epoch": 0.92, + "grad_norm": 1.373728469729803, + "learning_rate": 1.674855747669979e-07, + "loss": 0.2634, + "step": 31714 + }, + { + "epoch": 0.92, + "grad_norm": 1.353710608920806, + "learning_rate": 1.67365042921942e-07, + "loss": 0.2821, + "step": 31715 + }, + { + "epoch": 0.92, + "grad_norm": 1.3298675745758148, + "learning_rate": 1.6724455372494287e-07, + "loss": 0.2636, + "step": 31716 + }, + { + "epoch": 0.92, + "grad_norm": 1.3231832609233103, + "learning_rate": 1.6712410717706406e-07, + "loss": 0.2755, + "step": 31717 + }, + { + "epoch": 0.92, + "grad_norm": 1.3734396922496015, + "learning_rate": 1.6700370327936866e-07, + "loss": 0.2608, + "step": 31718 + }, + { + "epoch": 0.92, + "grad_norm": 1.253158545970521, + "learning_rate": 1.668833420329191e-07, + "loss": 0.2718, + "step": 31719 + }, + { + "epoch": 0.92, + "grad_norm": 1.5092359804059303, + "learning_rate": 1.6676302343877792e-07, + "loss": 0.2493, + "step": 31720 + }, + { + "epoch": 0.92, + "grad_norm": 1.233193448622882, + "learning_rate": 1.6664274749800646e-07, + "loss": 0.2663, + "step": 31721 + }, + { + "epoch": 0.92, + "grad_norm": 1.3474125982498115, + "learning_rate": 1.665225142116661e-07, + "loss": 0.2628, + "step": 31722 + }, + { + "epoch": 0.92, + "grad_norm": 1.3483714831889164, + "learning_rate": 1.6640232358081932e-07, + "loss": 0.265, + "step": 31723 + }, + { + "epoch": 0.92, + "grad_norm": 1.397987974339326, + "learning_rate": 1.662821756065247e-07, + "loss": 0.2565, + "step": 31724 + }, + { + "epoch": 0.92, + "grad_norm": 1.374713846865494, + "learning_rate": 1.6616207028984365e-07, + "loss": 0.2663, + "step": 31725 + }, + { + "epoch": 0.92, + "grad_norm": 1.4180604803128107, + "learning_rate": 1.6604200763183586e-07, + "loss": 0.2846, + "step": 31726 + }, + { + "epoch": 0.92, + "grad_norm": 1.7796847351465563, + "learning_rate": 1.6592198763356103e-07, + "loss": 0.2747, + "step": 31727 + }, + { + "epoch": 0.92, + "grad_norm": 1.4872657256253015, + "learning_rate": 1.6580201029607833e-07, + "loss": 0.2723, + "step": 31728 + }, + { + "epoch": 0.92, + "grad_norm": 1.8363375219347666, + "learning_rate": 1.6568207562044692e-07, + "loss": 0.276, + "step": 31729 + }, + { + "epoch": 0.92, + "grad_norm": 1.4340881885692833, + "learning_rate": 1.6556218360772424e-07, + "loss": 0.2628, + "step": 31730 + }, + { + "epoch": 0.92, + "grad_norm": 1.3035222768728436, + "learning_rate": 1.654423342589695e-07, + "loss": 0.2754, + "step": 31731 + }, + { + "epoch": 0.92, + "grad_norm": 1.3644953929661146, + "learning_rate": 1.653225275752396e-07, + "loss": 0.2512, + "step": 31732 + }, + { + "epoch": 0.92, + "grad_norm": 1.6528935552376174, + "learning_rate": 1.65202763557592e-07, + "loss": 0.2663, + "step": 31733 + }, + { + "epoch": 0.92, + "grad_norm": 1.3377058420661303, + "learning_rate": 1.6508304220708316e-07, + "loss": 0.2651, + "step": 31734 + }, + { + "epoch": 0.92, + "grad_norm": 4.6263768948658806, + "learning_rate": 1.649633635247705e-07, + "loss": 0.29, + "step": 31735 + }, + { + "epoch": 0.92, + "grad_norm": 1.253733879403828, + "learning_rate": 1.6484372751170984e-07, + "loss": 0.2736, + "step": 31736 + }, + { + "epoch": 0.92, + "grad_norm": 1.0087980492751387, + "learning_rate": 1.647241341689565e-07, + "loss": 0.5306, + "step": 31737 + }, + { + "epoch": 0.92, + "grad_norm": 1.336364496145183, + "learning_rate": 1.6460458349756736e-07, + "loss": 0.2629, + "step": 31738 + }, + { + "epoch": 0.92, + "grad_norm": 1.2714079649586487, + "learning_rate": 1.644850754985955e-07, + "loss": 0.2783, + "step": 31739 + }, + { + "epoch": 0.92, + "grad_norm": 1.349815769624122, + "learning_rate": 1.643656101730967e-07, + "loss": 0.2784, + "step": 31740 + }, + { + "epoch": 0.92, + "grad_norm": 1.3545002860543691, + "learning_rate": 1.6424618752212518e-07, + "loss": 0.2734, + "step": 31741 + }, + { + "epoch": 0.92, + "grad_norm": 1.4028570475663162, + "learning_rate": 1.6412680754673506e-07, + "loss": 0.2578, + "step": 31742 + }, + { + "epoch": 0.92, + "grad_norm": 1.3851661241994326, + "learning_rate": 1.640074702479788e-07, + "loss": 0.2727, + "step": 31743 + }, + { + "epoch": 0.92, + "grad_norm": 1.3895181056120314, + "learning_rate": 1.6388817562691062e-07, + "loss": 0.2829, + "step": 31744 + }, + { + "epoch": 0.92, + "grad_norm": 1.3867688035290413, + "learning_rate": 1.637689236845824e-07, + "loss": 0.2537, + "step": 31745 + }, + { + "epoch": 0.92, + "grad_norm": 1.393295507138222, + "learning_rate": 1.6364971442204725e-07, + "loss": 0.2704, + "step": 31746 + }, + { + "epoch": 0.92, + "grad_norm": 1.4752358388604296, + "learning_rate": 1.635305478403576e-07, + "loss": 0.2644, + "step": 31747 + }, + { + "epoch": 0.92, + "grad_norm": 1.47753497011999, + "learning_rate": 1.6341142394056376e-07, + "loss": 0.2728, + "step": 31748 + }, + { + "epoch": 0.92, + "grad_norm": 1.3685440826001254, + "learning_rate": 1.6329234272371819e-07, + "loss": 0.2715, + "step": 31749 + }, + { + "epoch": 0.92, + "grad_norm": 1.3495191613547741, + "learning_rate": 1.6317330419087175e-07, + "loss": 0.2823, + "step": 31750 + }, + { + "epoch": 0.92, + "grad_norm": 1.4403124953602733, + "learning_rate": 1.6305430834307413e-07, + "loss": 0.2843, + "step": 31751 + }, + { + "epoch": 0.92, + "grad_norm": 1.3039080060598833, + "learning_rate": 1.6293535518137616e-07, + "loss": 0.2677, + "step": 31752 + }, + { + "epoch": 0.92, + "grad_norm": 1.3682597904689136, + "learning_rate": 1.6281644470682701e-07, + "loss": 0.2737, + "step": 31753 + }, + { + "epoch": 0.92, + "grad_norm": 1.8042491155382059, + "learning_rate": 1.6269757692047638e-07, + "loss": 0.2665, + "step": 31754 + }, + { + "epoch": 0.92, + "grad_norm": 1.3145528125758912, + "learning_rate": 1.625787518233729e-07, + "loss": 0.2665, + "step": 31755 + }, + { + "epoch": 0.92, + "grad_norm": 1.286893277741605, + "learning_rate": 1.6245996941656628e-07, + "loss": 0.2814, + "step": 31756 + }, + { + "epoch": 0.92, + "grad_norm": 1.2988761383697822, + "learning_rate": 1.62341229701104e-07, + "loss": 0.2499, + "step": 31757 + }, + { + "epoch": 0.92, + "grad_norm": 1.5048264232533657, + "learning_rate": 1.6222253267803357e-07, + "loss": 0.2575, + "step": 31758 + }, + { + "epoch": 0.92, + "grad_norm": 1.2682354498552526, + "learning_rate": 1.6210387834840412e-07, + "loss": 0.25, + "step": 31759 + }, + { + "epoch": 0.92, + "grad_norm": 1.3129244515319813, + "learning_rate": 1.6198526671326041e-07, + "loss": 0.2764, + "step": 31760 + }, + { + "epoch": 0.92, + "grad_norm": 1.3775827626732076, + "learning_rate": 1.6186669777365106e-07, + "loss": 0.2594, + "step": 31761 + }, + { + "epoch": 0.92, + "grad_norm": 1.431406528200753, + "learning_rate": 1.617481715306213e-07, + "loss": 0.2846, + "step": 31762 + }, + { + "epoch": 0.92, + "grad_norm": 1.2814097657824126, + "learning_rate": 1.616296879852175e-07, + "loss": 0.2662, + "step": 31763 + }, + { + "epoch": 0.92, + "grad_norm": 1.4416845270131986, + "learning_rate": 1.6151124713848554e-07, + "loss": 0.2733, + "step": 31764 + }, + { + "epoch": 0.92, + "grad_norm": 1.3869136643025668, + "learning_rate": 1.6139284899147067e-07, + "loss": 0.2866, + "step": 31765 + }, + { + "epoch": 0.92, + "grad_norm": 1.302968198338526, + "learning_rate": 1.6127449354521817e-07, + "loss": 0.2698, + "step": 31766 + }, + { + "epoch": 0.92, + "grad_norm": 1.6241504406974712, + "learning_rate": 1.611561808007711e-07, + "loss": 0.2768, + "step": 31767 + }, + { + "epoch": 0.92, + "grad_norm": 1.4308501368261473, + "learning_rate": 1.6103791075917475e-07, + "loss": 0.2809, + "step": 31768 + }, + { + "epoch": 0.92, + "grad_norm": 1.6989442563930315, + "learning_rate": 1.609196834214727e-07, + "loss": 0.3007, + "step": 31769 + }, + { + "epoch": 0.92, + "grad_norm": 2.015564307751107, + "learning_rate": 1.60801498788708e-07, + "loss": 0.2748, + "step": 31770 + }, + { + "epoch": 0.92, + "grad_norm": 1.2690201035425919, + "learning_rate": 1.606833568619237e-07, + "loss": 0.2649, + "step": 31771 + }, + { + "epoch": 0.92, + "grad_norm": 1.4594888080522317, + "learning_rate": 1.605652576421618e-07, + "loss": 0.2753, + "step": 31772 + }, + { + "epoch": 0.92, + "grad_norm": 1.2809436083914305, + "learning_rate": 1.6044720113046586e-07, + "loss": 0.2914, + "step": 31773 + }, + { + "epoch": 0.92, + "grad_norm": 1.409410419591653, + "learning_rate": 1.6032918732787674e-07, + "loss": 0.276, + "step": 31774 + }, + { + "epoch": 0.92, + "grad_norm": 1.241507401848981, + "learning_rate": 1.6021121623543635e-07, + "loss": 0.2555, + "step": 31775 + }, + { + "epoch": 0.92, + "grad_norm": 1.3189812151544018, + "learning_rate": 1.600932878541861e-07, + "loss": 0.2602, + "step": 31776 + }, + { + "epoch": 0.92, + "grad_norm": 1.4780535857858241, + "learning_rate": 1.5997540218516572e-07, + "loss": 0.2589, + "step": 31777 + }, + { + "epoch": 0.92, + "grad_norm": 1.4033757841674988, + "learning_rate": 1.598575592294166e-07, + "loss": 0.2653, + "step": 31778 + }, + { + "epoch": 0.92, + "grad_norm": 1.3260172680120421, + "learning_rate": 1.597397589879779e-07, + "loss": 0.2656, + "step": 31779 + }, + { + "epoch": 0.92, + "grad_norm": 1.29166488490882, + "learning_rate": 1.5962200146188932e-07, + "loss": 0.2497, + "step": 31780 + }, + { + "epoch": 0.92, + "grad_norm": 1.7847941466594623, + "learning_rate": 1.5950428665219065e-07, + "loss": 0.2696, + "step": 31781 + }, + { + "epoch": 0.92, + "grad_norm": 2.483620187622207, + "learning_rate": 1.5938661455991988e-07, + "loss": 0.2682, + "step": 31782 + }, + { + "epoch": 0.92, + "grad_norm": 1.733665376565152, + "learning_rate": 1.5926898518611621e-07, + "loss": 0.2653, + "step": 31783 + }, + { + "epoch": 0.92, + "grad_norm": 1.3702437422877305, + "learning_rate": 1.5915139853181772e-07, + "loss": 0.2883, + "step": 31784 + }, + { + "epoch": 0.92, + "grad_norm": 1.450458650040949, + "learning_rate": 1.5903385459806132e-07, + "loss": 0.262, + "step": 31785 + }, + { + "epoch": 0.92, + "grad_norm": 1.3644841236878362, + "learning_rate": 1.5891635338588563e-07, + "loss": 0.2933, + "step": 31786 + }, + { + "epoch": 0.92, + "grad_norm": 0.9967564907942807, + "learning_rate": 1.5879889489632648e-07, + "loss": 0.5856, + "step": 31787 + }, + { + "epoch": 0.92, + "grad_norm": 1.2674545386313902, + "learning_rate": 1.5868147913042032e-07, + "loss": 0.2636, + "step": 31788 + }, + { + "epoch": 0.92, + "grad_norm": 1.4184100723483146, + "learning_rate": 1.5856410608920403e-07, + "loss": 0.2805, + "step": 31789 + }, + { + "epoch": 0.92, + "grad_norm": 1.5496285487343942, + "learning_rate": 1.5844677577371348e-07, + "loss": 0.2633, + "step": 31790 + }, + { + "epoch": 0.92, + "grad_norm": 1.8445322414515575, + "learning_rate": 1.583294881849834e-07, + "loss": 0.2778, + "step": 31791 + }, + { + "epoch": 0.92, + "grad_norm": 1.375945839157984, + "learning_rate": 1.582122433240496e-07, + "loss": 0.2697, + "step": 31792 + }, + { + "epoch": 0.92, + "grad_norm": 1.344664203661786, + "learning_rate": 1.580950411919463e-07, + "loss": 0.3154, + "step": 31793 + }, + { + "epoch": 0.92, + "grad_norm": 1.4976727331850583, + "learning_rate": 1.5797788178970875e-07, + "loss": 0.265, + "step": 31794 + }, + { + "epoch": 0.92, + "grad_norm": 1.371983699486886, + "learning_rate": 1.5786076511836945e-07, + "loss": 0.2682, + "step": 31795 + }, + { + "epoch": 0.92, + "grad_norm": 1.3163533355950432, + "learning_rate": 1.577436911789626e-07, + "loss": 0.254, + "step": 31796 + }, + { + "epoch": 0.92, + "grad_norm": 1.4223437398252579, + "learning_rate": 1.576266599725218e-07, + "loss": 0.2616, + "step": 31797 + }, + { + "epoch": 0.92, + "grad_norm": 1.4020818647601314, + "learning_rate": 1.57509671500079e-07, + "loss": 0.268, + "step": 31798 + }, + { + "epoch": 0.92, + "grad_norm": 1.6340027253572504, + "learning_rate": 1.573927257626673e-07, + "loss": 0.2723, + "step": 31799 + }, + { + "epoch": 0.92, + "grad_norm": 1.3630666942012821, + "learning_rate": 1.5727582276131803e-07, + "loss": 0.2905, + "step": 31800 + }, + { + "epoch": 0.92, + "grad_norm": 1.4998925543608634, + "learning_rate": 1.5715896249706374e-07, + "loss": 0.2602, + "step": 31801 + }, + { + "epoch": 0.92, + "grad_norm": 1.8625417788539047, + "learning_rate": 1.570421449709353e-07, + "loss": 0.2727, + "step": 31802 + }, + { + "epoch": 0.92, + "grad_norm": 1.4673255623014883, + "learning_rate": 1.569253701839635e-07, + "loss": 0.2891, + "step": 31803 + }, + { + "epoch": 0.92, + "grad_norm": 1.2669814225574203, + "learning_rate": 1.5680863813717918e-07, + "loss": 0.2471, + "step": 31804 + }, + { + "epoch": 0.92, + "grad_norm": 1.3583218223650813, + "learning_rate": 1.5669194883161264e-07, + "loss": 0.248, + "step": 31805 + }, + { + "epoch": 0.92, + "grad_norm": 1.3296502460185757, + "learning_rate": 1.565753022682931e-07, + "loss": 0.2714, + "step": 31806 + }, + { + "epoch": 0.92, + "grad_norm": 1.678604758496714, + "learning_rate": 1.5645869844825023e-07, + "loss": 0.269, + "step": 31807 + }, + { + "epoch": 0.92, + "grad_norm": 1.3762684659625786, + "learning_rate": 1.5634213737251324e-07, + "loss": 0.2684, + "step": 31808 + }, + { + "epoch": 0.92, + "grad_norm": 1.4695714388748524, + "learning_rate": 1.562256190421102e-07, + "loss": 0.2687, + "step": 31809 + }, + { + "epoch": 0.92, + "grad_norm": 1.3417166351436278, + "learning_rate": 1.561091434580697e-07, + "loss": 0.2683, + "step": 31810 + }, + { + "epoch": 0.92, + "grad_norm": 1.4486057612995904, + "learning_rate": 1.559927106214204e-07, + "loss": 0.2871, + "step": 31811 + }, + { + "epoch": 0.92, + "grad_norm": 1.4520598663292859, + "learning_rate": 1.558763205331887e-07, + "loss": 0.2574, + "step": 31812 + }, + { + "epoch": 0.92, + "grad_norm": 1.2659759210073516, + "learning_rate": 1.5575997319440205e-07, + "loss": 0.2667, + "step": 31813 + }, + { + "epoch": 0.92, + "grad_norm": 1.2075733572470453, + "learning_rate": 1.556436686060886e-07, + "loss": 0.2536, + "step": 31814 + }, + { + "epoch": 0.92, + "grad_norm": 1.3213593897203246, + "learning_rate": 1.5552740676927247e-07, + "loss": 0.2561, + "step": 31815 + }, + { + "epoch": 0.92, + "grad_norm": 1.2612554313808841, + "learning_rate": 1.5541118768498065e-07, + "loss": 0.2738, + "step": 31816 + }, + { + "epoch": 0.92, + "grad_norm": 1.2856416082347781, + "learning_rate": 1.5529501135423897e-07, + "loss": 0.2577, + "step": 31817 + }, + { + "epoch": 0.92, + "grad_norm": 1.3342519292252157, + "learning_rate": 1.5517887777807273e-07, + "loss": 0.2663, + "step": 31818 + }, + { + "epoch": 0.92, + "grad_norm": 1.2673567551265124, + "learning_rate": 1.5506278695750665e-07, + "loss": 0.2558, + "step": 31819 + }, + { + "epoch": 0.92, + "grad_norm": 1.3224492100136112, + "learning_rate": 1.5494673889356493e-07, + "loss": 0.2809, + "step": 31820 + }, + { + "epoch": 0.92, + "grad_norm": 1.3608579292760108, + "learning_rate": 1.548307335872723e-07, + "loss": 0.2707, + "step": 31821 + }, + { + "epoch": 0.92, + "grad_norm": 1.3909157268513992, + "learning_rate": 1.5471477103965237e-07, + "loss": 0.3011, + "step": 31822 + }, + { + "epoch": 0.92, + "grad_norm": 1.494374137860533, + "learning_rate": 1.545988512517277e-07, + "loss": 0.3039, + "step": 31823 + }, + { + "epoch": 0.92, + "grad_norm": 1.4611765153651344, + "learning_rate": 1.5448297422452298e-07, + "loss": 0.2723, + "step": 31824 + }, + { + "epoch": 0.92, + "grad_norm": 2.1418912181437224, + "learning_rate": 1.5436713995905906e-07, + "loss": 0.2818, + "step": 31825 + }, + { + "epoch": 0.92, + "grad_norm": 1.2185484115625993, + "learning_rate": 1.5425134845635903e-07, + "loss": 0.2764, + "step": 31826 + }, + { + "epoch": 0.92, + "grad_norm": 1.9703419010335756, + "learning_rate": 1.541355997174443e-07, + "loss": 0.2736, + "step": 31827 + }, + { + "epoch": 0.92, + "grad_norm": 1.271561386467686, + "learning_rate": 1.540198937433368e-07, + "loss": 0.2828, + "step": 31828 + }, + { + "epoch": 0.92, + "grad_norm": 1.4653061701064383, + "learning_rate": 1.5390423053505744e-07, + "loss": 0.2674, + "step": 31829 + }, + { + "epoch": 0.92, + "grad_norm": 1.479437606275555, + "learning_rate": 1.53788610093627e-07, + "loss": 0.2695, + "step": 31830 + }, + { + "epoch": 0.92, + "grad_norm": 1.314481186800244, + "learning_rate": 1.536730324200658e-07, + "loss": 0.2688, + "step": 31831 + }, + { + "epoch": 0.92, + "grad_norm": 1.2608344642241731, + "learning_rate": 1.5355749751539361e-07, + "loss": 0.2465, + "step": 31832 + }, + { + "epoch": 0.92, + "grad_norm": 1.2828124131680834, + "learning_rate": 1.5344200538063125e-07, + "loss": 0.2775, + "step": 31833 + }, + { + "epoch": 0.92, + "grad_norm": 1.5198875002713925, + "learning_rate": 1.5332655601679625e-07, + "loss": 0.2764, + "step": 31834 + }, + { + "epoch": 0.92, + "grad_norm": 1.3590341452798862, + "learning_rate": 1.5321114942490833e-07, + "loss": 0.2964, + "step": 31835 + }, + { + "epoch": 0.92, + "grad_norm": 1.3058406359191632, + "learning_rate": 1.5309578560598558e-07, + "loss": 0.2665, + "step": 31836 + }, + { + "epoch": 0.92, + "grad_norm": 1.3502112863277123, + "learning_rate": 1.529804645610461e-07, + "loss": 0.2752, + "step": 31837 + }, + { + "epoch": 0.92, + "grad_norm": 1.3640884579536399, + "learning_rate": 1.528651862911079e-07, + "loss": 0.2588, + "step": 31838 + }, + { + "epoch": 0.92, + "grad_norm": 1.3007009036627268, + "learning_rate": 1.5274995079718857e-07, + "loss": 0.2732, + "step": 31839 + }, + { + "epoch": 0.92, + "grad_norm": 1.2949119713277522, + "learning_rate": 1.5263475808030448e-07, + "loss": 0.2498, + "step": 31840 + }, + { + "epoch": 0.92, + "grad_norm": 1.7289407334483686, + "learning_rate": 1.5251960814147316e-07, + "loss": 0.2423, + "step": 31841 + }, + { + "epoch": 0.92, + "grad_norm": 1.3205169363085647, + "learning_rate": 1.5240450098170934e-07, + "loss": 0.2615, + "step": 31842 + }, + { + "epoch": 0.92, + "grad_norm": 1.2560001745595688, + "learning_rate": 1.5228943660202945e-07, + "loss": 0.2663, + "step": 31843 + }, + { + "epoch": 0.92, + "grad_norm": 1.2710793164412633, + "learning_rate": 1.5217441500344932e-07, + "loss": 0.2747, + "step": 31844 + }, + { + "epoch": 0.92, + "grad_norm": 1.3368023060172454, + "learning_rate": 1.5205943618698315e-07, + "loss": 0.2841, + "step": 31845 + }, + { + "epoch": 0.92, + "grad_norm": 1.417093518501486, + "learning_rate": 1.5194450015364681e-07, + "loss": 0.2591, + "step": 31846 + }, + { + "epoch": 0.92, + "grad_norm": 1.8645322265600104, + "learning_rate": 1.5182960690445392e-07, + "loss": 0.301, + "step": 31847 + }, + { + "epoch": 0.92, + "grad_norm": 1.3140922320815442, + "learning_rate": 1.5171475644041867e-07, + "loss": 0.2458, + "step": 31848 + }, + { + "epoch": 0.92, + "grad_norm": 1.3663136777679152, + "learning_rate": 1.5159994876255412e-07, + "loss": 0.2781, + "step": 31849 + }, + { + "epoch": 0.92, + "grad_norm": 1.4035310993415355, + "learning_rate": 1.514851838718745e-07, + "loss": 0.2791, + "step": 31850 + }, + { + "epoch": 0.92, + "grad_norm": 1.438936535517097, + "learning_rate": 1.513704617693923e-07, + "loss": 0.2757, + "step": 31851 + }, + { + "epoch": 0.92, + "grad_norm": 1.2482656808046861, + "learning_rate": 1.512557824561184e-07, + "loss": 0.2956, + "step": 31852 + }, + { + "epoch": 0.92, + "grad_norm": 1.5510058773277355, + "learning_rate": 1.511411459330664e-07, + "loss": 0.2624, + "step": 31853 + }, + { + "epoch": 0.92, + "grad_norm": 1.2832125915650163, + "learning_rate": 1.5102655220124773e-07, + "loss": 0.2692, + "step": 31854 + }, + { + "epoch": 0.92, + "grad_norm": 1.41279635709111, + "learning_rate": 1.5091200126167328e-07, + "loss": 0.2716, + "step": 31855 + }, + { + "epoch": 0.92, + "grad_norm": 1.3110758662674664, + "learning_rate": 1.5079749311535386e-07, + "loss": 0.2695, + "step": 31856 + }, + { + "epoch": 0.92, + "grad_norm": 1.427277750033339, + "learning_rate": 1.506830277633009e-07, + "loss": 0.2548, + "step": 31857 + }, + { + "epoch": 0.92, + "grad_norm": 1.3861076126920193, + "learning_rate": 1.505686052065236e-07, + "loss": 0.2543, + "step": 31858 + }, + { + "epoch": 0.92, + "grad_norm": 1.294178543627414, + "learning_rate": 1.5045422544603283e-07, + "loss": 0.2591, + "step": 31859 + }, + { + "epoch": 0.92, + "grad_norm": 1.3686512147124164, + "learning_rate": 1.503398884828372e-07, + "loss": 0.2697, + "step": 31860 + }, + { + "epoch": 0.92, + "grad_norm": 2.1907230236164708, + "learning_rate": 1.5022559431794538e-07, + "loss": 0.2907, + "step": 31861 + }, + { + "epoch": 0.92, + "grad_norm": 2.079678197340257, + "learning_rate": 1.5011134295236706e-07, + "loss": 0.2731, + "step": 31862 + }, + { + "epoch": 0.92, + "grad_norm": 0.9338742859031494, + "learning_rate": 1.4999713438710928e-07, + "loss": 0.5796, + "step": 31863 + }, + { + "epoch": 0.92, + "grad_norm": 1.2879568618813126, + "learning_rate": 1.4988296862318062e-07, + "loss": 0.2664, + "step": 31864 + }, + { + "epoch": 0.92, + "grad_norm": 1.2742684536397793, + "learning_rate": 1.4976884566158923e-07, + "loss": 0.2774, + "step": 31865 + }, + { + "epoch": 0.92, + "grad_norm": 1.7201632146939567, + "learning_rate": 1.496547655033409e-07, + "loss": 0.2834, + "step": 31866 + }, + { + "epoch": 0.92, + "grad_norm": 1.6732193844797472, + "learning_rate": 1.495407281494432e-07, + "loss": 0.2723, + "step": 31867 + }, + { + "epoch": 0.92, + "grad_norm": 1.5815395398362726, + "learning_rate": 1.4942673360090255e-07, + "loss": 0.2671, + "step": 31868 + }, + { + "epoch": 0.92, + "grad_norm": 1.2022189775900434, + "learning_rate": 1.4931278185872532e-07, + "loss": 0.2876, + "step": 31869 + }, + { + "epoch": 0.92, + "grad_norm": 1.4525485710048642, + "learning_rate": 1.4919887292391577e-07, + "loss": 0.2652, + "step": 31870 + }, + { + "epoch": 0.92, + "grad_norm": 1.2419240249876091, + "learning_rate": 1.4908500679748029e-07, + "loss": 0.2555, + "step": 31871 + }, + { + "epoch": 0.92, + "grad_norm": 3.72878513999178, + "learning_rate": 1.489711834804236e-07, + "loss": 0.2635, + "step": 31872 + }, + { + "epoch": 0.92, + "grad_norm": 1.4280834711015236, + "learning_rate": 1.4885740297374995e-07, + "loss": 0.3148, + "step": 31873 + }, + { + "epoch": 0.92, + "grad_norm": 1.7916201663988192, + "learning_rate": 1.4874366527846296e-07, + "loss": 0.2751, + "step": 31874 + }, + { + "epoch": 0.92, + "grad_norm": 1.5732140373408676, + "learning_rate": 1.4862997039556738e-07, + "loss": 0.2814, + "step": 31875 + }, + { + "epoch": 0.92, + "grad_norm": 1.2649173616265834, + "learning_rate": 1.4851631832606628e-07, + "loss": 0.2821, + "step": 31876 + }, + { + "epoch": 0.92, + "grad_norm": 1.3128231569121915, + "learning_rate": 1.4840270907096276e-07, + "loss": 0.2607, + "step": 31877 + }, + { + "epoch": 0.92, + "grad_norm": 1.5355310032061646, + "learning_rate": 1.4828914263125882e-07, + "loss": 0.275, + "step": 31878 + }, + { + "epoch": 0.92, + "grad_norm": 1.377568428340233, + "learning_rate": 1.4817561900795751e-07, + "loss": 0.2694, + "step": 31879 + }, + { + "epoch": 0.92, + "grad_norm": 1.2581161794236075, + "learning_rate": 1.4806213820205973e-07, + "loss": 0.2571, + "step": 31880 + }, + { + "epoch": 0.92, + "grad_norm": 1.5294157262700323, + "learning_rate": 1.4794870021456686e-07, + "loss": 0.2738, + "step": 31881 + }, + { + "epoch": 0.92, + "grad_norm": 1.413208369331515, + "learning_rate": 1.4783530504648093e-07, + "loss": 0.2721, + "step": 31882 + }, + { + "epoch": 0.92, + "grad_norm": 1.6002715804302616, + "learning_rate": 1.477219526988022e-07, + "loss": 0.2678, + "step": 31883 + }, + { + "epoch": 0.92, + "grad_norm": 1.3642435744197225, + "learning_rate": 1.4760864317253154e-07, + "loss": 0.2731, + "step": 31884 + }, + { + "epoch": 0.92, + "grad_norm": 1.399448373860458, + "learning_rate": 1.4749537646866762e-07, + "loss": 0.2627, + "step": 31885 + }, + { + "epoch": 0.92, + "grad_norm": 1.3102412494485163, + "learning_rate": 1.4738215258821132e-07, + "loss": 0.2697, + "step": 31886 + }, + { + "epoch": 0.92, + "grad_norm": 1.2771225393422163, + "learning_rate": 1.472689715321618e-07, + "loss": 0.2613, + "step": 31887 + }, + { + "epoch": 0.92, + "grad_norm": 1.4328331376145464, + "learning_rate": 1.4715583330151718e-07, + "loss": 0.2763, + "step": 31888 + }, + { + "epoch": 0.92, + "grad_norm": 1.29724642589964, + "learning_rate": 1.4704273789727553e-07, + "loss": 0.3221, + "step": 31889 + }, + { + "epoch": 0.92, + "grad_norm": 1.195215649499575, + "learning_rate": 1.4692968532043606e-07, + "loss": 0.2645, + "step": 31890 + }, + { + "epoch": 0.92, + "grad_norm": 1.2791701372043596, + "learning_rate": 1.4681667557199576e-07, + "loss": 0.2799, + "step": 31891 + }, + { + "epoch": 0.93, + "grad_norm": 1.3595357982757874, + "learning_rate": 1.4670370865295214e-07, + "loss": 0.2739, + "step": 31892 + }, + { + "epoch": 0.93, + "grad_norm": 1.4560038481633548, + "learning_rate": 1.465907845643022e-07, + "loss": 0.2853, + "step": 31893 + }, + { + "epoch": 0.93, + "grad_norm": 1.3591489005438697, + "learning_rate": 1.4647790330704236e-07, + "loss": 0.2528, + "step": 31894 + }, + { + "epoch": 0.93, + "grad_norm": 1.4172120950789402, + "learning_rate": 1.4636506488216906e-07, + "loss": 0.2789, + "step": 31895 + }, + { + "epoch": 0.93, + "grad_norm": 1.500151948387348, + "learning_rate": 1.4625226929067815e-07, + "loss": 0.2655, + "step": 31896 + }, + { + "epoch": 0.93, + "grad_norm": 1.2934680921425603, + "learning_rate": 1.461395165335644e-07, + "loss": 0.2628, + "step": 31897 + }, + { + "epoch": 0.93, + "grad_norm": 1.2981167246238932, + "learning_rate": 1.4602680661182368e-07, + "loss": 0.2672, + "step": 31898 + }, + { + "epoch": 0.93, + "grad_norm": 1.3152104060962693, + "learning_rate": 1.459141395264496e-07, + "loss": 0.2603, + "step": 31899 + }, + { + "epoch": 0.93, + "grad_norm": 1.7536557210259196, + "learning_rate": 1.4580151527843756e-07, + "loss": 0.296, + "step": 31900 + }, + { + "epoch": 0.93, + "grad_norm": 1.6591755399411248, + "learning_rate": 1.4568893386878057e-07, + "loss": 0.2786, + "step": 31901 + }, + { + "epoch": 0.93, + "grad_norm": 1.3244328534490273, + "learning_rate": 1.455763952984729e-07, + "loss": 0.2575, + "step": 31902 + }, + { + "epoch": 0.93, + "grad_norm": 1.2483328357833168, + "learning_rate": 1.4546389956850704e-07, + "loss": 0.2833, + "step": 31903 + }, + { + "epoch": 0.93, + "grad_norm": 1.4248709905220613, + "learning_rate": 1.4535144667987665e-07, + "loss": 0.2447, + "step": 31904 + }, + { + "epoch": 0.93, + "grad_norm": 0.9773467706170755, + "learning_rate": 1.4523903663357376e-07, + "loss": 0.5212, + "step": 31905 + }, + { + "epoch": 0.93, + "grad_norm": 1.4182289566305546, + "learning_rate": 1.4512666943059027e-07, + "loss": 0.2696, + "step": 31906 + }, + { + "epoch": 0.93, + "grad_norm": 1.259892239789811, + "learning_rate": 1.4501434507191713e-07, + "loss": 0.2567, + "step": 31907 + }, + { + "epoch": 0.93, + "grad_norm": 1.278316885672127, + "learning_rate": 1.4490206355854687e-07, + "loss": 0.2679, + "step": 31908 + }, + { + "epoch": 0.93, + "grad_norm": 1.5582104709578581, + "learning_rate": 1.447898248914692e-07, + "loss": 0.272, + "step": 31909 + }, + { + "epoch": 0.93, + "grad_norm": 1.2597142822891836, + "learning_rate": 1.4467762907167505e-07, + "loss": 0.2641, + "step": 31910 + }, + { + "epoch": 0.93, + "grad_norm": 1.3110436999043733, + "learning_rate": 1.4456547610015524e-07, + "loss": 0.2712, + "step": 31911 + }, + { + "epoch": 0.93, + "grad_norm": 1.523628106340498, + "learning_rate": 1.444533659778985e-07, + "loss": 0.2736, + "step": 31912 + }, + { + "epoch": 0.93, + "grad_norm": 1.310420196222209, + "learning_rate": 1.4434129870589452e-07, + "loss": 0.2427, + "step": 31913 + }, + { + "epoch": 0.93, + "grad_norm": 1.4151151985978132, + "learning_rate": 1.442292742851331e-07, + "loss": 0.2582, + "step": 31914 + }, + { + "epoch": 0.93, + "grad_norm": 1.5119403591319172, + "learning_rate": 1.4411729271660236e-07, + "loss": 0.2849, + "step": 31915 + }, + { + "epoch": 0.93, + "grad_norm": 1.4370055906036858, + "learning_rate": 1.4400535400128978e-07, + "loss": 0.2586, + "step": 31916 + }, + { + "epoch": 0.93, + "grad_norm": 1.3422209567156462, + "learning_rate": 1.4389345814018408e-07, + "loss": 0.2758, + "step": 31917 + }, + { + "epoch": 0.93, + "grad_norm": 1.4555455062848874, + "learning_rate": 1.437816051342722e-07, + "loss": 0.2828, + "step": 31918 + }, + { + "epoch": 0.93, + "grad_norm": 2.482858109847286, + "learning_rate": 1.436697949845417e-07, + "loss": 0.274, + "step": 31919 + }, + { + "epoch": 0.93, + "grad_norm": 1.7316531484846924, + "learning_rate": 1.4355802769197903e-07, + "loss": 0.2595, + "step": 31920 + }, + { + "epoch": 0.93, + "grad_norm": 1.5568457482054794, + "learning_rate": 1.4344630325757058e-07, + "loss": 0.2911, + "step": 31921 + }, + { + "epoch": 0.93, + "grad_norm": 1.4036849441616428, + "learning_rate": 1.4333462168230228e-07, + "loss": 0.2754, + "step": 31922 + }, + { + "epoch": 0.93, + "grad_norm": 1.57704446659818, + "learning_rate": 1.4322298296716054e-07, + "loss": 0.2926, + "step": 31923 + }, + { + "epoch": 0.93, + "grad_norm": 1.3423449375955692, + "learning_rate": 1.4311138711312955e-07, + "loss": 0.2536, + "step": 31924 + }, + { + "epoch": 0.93, + "grad_norm": 1.4476754347484446, + "learning_rate": 1.429998341211941e-07, + "loss": 0.3009, + "step": 31925 + }, + { + "epoch": 0.93, + "grad_norm": 1.4248479495312232, + "learning_rate": 1.42888323992339e-07, + "loss": 0.2586, + "step": 31926 + }, + { + "epoch": 0.93, + "grad_norm": 1.5004735974853463, + "learning_rate": 1.4277685672754838e-07, + "loss": 0.2797, + "step": 31927 + }, + { + "epoch": 0.93, + "grad_norm": 1.2961402893031622, + "learning_rate": 1.4266543232780595e-07, + "loss": 0.2455, + "step": 31928 + }, + { + "epoch": 0.93, + "grad_norm": 1.2217148236623525, + "learning_rate": 1.4255405079409534e-07, + "loss": 0.2532, + "step": 31929 + }, + { + "epoch": 0.93, + "grad_norm": 1.4022285971888209, + "learning_rate": 1.424427121273986e-07, + "loss": 0.2699, + "step": 31930 + }, + { + "epoch": 0.93, + "grad_norm": 1.326972070718108, + "learning_rate": 1.4233141632869874e-07, + "loss": 0.2498, + "step": 31931 + }, + { + "epoch": 0.93, + "grad_norm": 1.485385322021579, + "learning_rate": 1.4222016339897893e-07, + "loss": 0.2606, + "step": 31932 + }, + { + "epoch": 0.93, + "grad_norm": 1.4628129736988542, + "learning_rate": 1.4210895333921947e-07, + "loss": 0.2689, + "step": 31933 + }, + { + "epoch": 0.93, + "grad_norm": 1.3465552753911307, + "learning_rate": 1.4199778615040295e-07, + "loss": 0.2799, + "step": 31934 + }, + { + "epoch": 0.93, + "grad_norm": 1.312034494519237, + "learning_rate": 1.4188666183350908e-07, + "loss": 0.2676, + "step": 31935 + }, + { + "epoch": 0.93, + "grad_norm": 1.5090005320841406, + "learning_rate": 1.4177558038951932e-07, + "loss": 0.2776, + "step": 31936 + }, + { + "epoch": 0.93, + "grad_norm": 1.3186106104854587, + "learning_rate": 1.4166454181941403e-07, + "loss": 0.2706, + "step": 31937 + }, + { + "epoch": 0.93, + "grad_norm": 1.490745693821302, + "learning_rate": 1.4155354612417293e-07, + "loss": 0.271, + "step": 31938 + }, + { + "epoch": 0.93, + "grad_norm": 1.3239047784830473, + "learning_rate": 1.4144259330477583e-07, + "loss": 0.2487, + "step": 31939 + }, + { + "epoch": 0.93, + "grad_norm": 2.068944802982114, + "learning_rate": 1.4133168336220193e-07, + "loss": 0.2908, + "step": 31940 + }, + { + "epoch": 0.93, + "grad_norm": 1.3924741679341965, + "learning_rate": 1.412208162974299e-07, + "loss": 0.2882, + "step": 31941 + }, + { + "epoch": 0.93, + "grad_norm": 1.4010394060746405, + "learning_rate": 1.4110999211143838e-07, + "loss": 0.2825, + "step": 31942 + }, + { + "epoch": 0.93, + "grad_norm": 1.383587770063918, + "learning_rate": 1.409992108052044e-07, + "loss": 0.2613, + "step": 31943 + }, + { + "epoch": 0.93, + "grad_norm": 1.3372172255469226, + "learning_rate": 1.408884723797066e-07, + "loss": 0.2784, + "step": 31944 + }, + { + "epoch": 0.93, + "grad_norm": 1.2940032147963856, + "learning_rate": 1.407777768359214e-07, + "loss": 0.2723, + "step": 31945 + }, + { + "epoch": 0.93, + "grad_norm": 1.293975748282592, + "learning_rate": 1.406671241748264e-07, + "loss": 0.2685, + "step": 31946 + }, + { + "epoch": 0.93, + "grad_norm": 1.2389292961475935, + "learning_rate": 1.405565143973986e-07, + "loss": 0.2784, + "step": 31947 + }, + { + "epoch": 0.93, + "grad_norm": 1.57935166619359, + "learning_rate": 1.404459475046127e-07, + "loss": 0.275, + "step": 31948 + }, + { + "epoch": 0.93, + "grad_norm": 1.858858938903065, + "learning_rate": 1.403354234974452e-07, + "loss": 0.2892, + "step": 31949 + }, + { + "epoch": 0.93, + "grad_norm": 1.3905488231770053, + "learning_rate": 1.40224942376872e-07, + "loss": 0.2643, + "step": 31950 + }, + { + "epoch": 0.93, + "grad_norm": 2.0786891656451294, + "learning_rate": 1.4011450414386784e-07, + "loss": 0.2539, + "step": 31951 + }, + { + "epoch": 0.93, + "grad_norm": 1.2289442562106252, + "learning_rate": 1.4000410879940697e-07, + "loss": 0.2573, + "step": 31952 + }, + { + "epoch": 0.93, + "grad_norm": 1.4870914248307718, + "learning_rate": 1.39893756344463e-07, + "loss": 0.2766, + "step": 31953 + }, + { + "epoch": 0.93, + "grad_norm": 1.4358673476308481, + "learning_rate": 1.397834467800113e-07, + "loss": 0.2488, + "step": 31954 + }, + { + "epoch": 0.93, + "grad_norm": 1.4170926682928253, + "learning_rate": 1.396731801070239e-07, + "loss": 0.2732, + "step": 31955 + }, + { + "epoch": 0.93, + "grad_norm": 1.4023215321838027, + "learning_rate": 1.39562956326475e-07, + "loss": 0.2677, + "step": 31956 + }, + { + "epoch": 0.93, + "grad_norm": 1.9111252353717993, + "learning_rate": 1.3945277543933712e-07, + "loss": 0.2816, + "step": 31957 + }, + { + "epoch": 0.93, + "grad_norm": 1.4185391139957388, + "learning_rate": 1.3934263744658228e-07, + "loss": 0.2558, + "step": 31958 + }, + { + "epoch": 0.93, + "grad_norm": 1.1917869377791022, + "learning_rate": 1.3923254234918305e-07, + "loss": 0.2386, + "step": 31959 + }, + { + "epoch": 0.93, + "grad_norm": 1.3302275034169648, + "learning_rate": 1.3912249014810974e-07, + "loss": 0.2676, + "step": 31960 + }, + { + "epoch": 0.93, + "grad_norm": 1.6777516748297157, + "learning_rate": 1.3901248084433493e-07, + "loss": 0.2554, + "step": 31961 + }, + { + "epoch": 0.93, + "grad_norm": 2.1112492413847774, + "learning_rate": 1.3890251443882897e-07, + "loss": 0.2751, + "step": 31962 + }, + { + "epoch": 0.93, + "grad_norm": 1.4107135955675003, + "learning_rate": 1.3879259093256214e-07, + "loss": 0.2651, + "step": 31963 + }, + { + "epoch": 0.93, + "grad_norm": 1.6549909121022697, + "learning_rate": 1.3868271032650426e-07, + "loss": 0.2524, + "step": 31964 + }, + { + "epoch": 0.93, + "grad_norm": 1.6474714825562555, + "learning_rate": 1.3857287262162567e-07, + "loss": 0.2924, + "step": 31965 + }, + { + "epoch": 0.93, + "grad_norm": 1.363934151923474, + "learning_rate": 1.3846307781889502e-07, + "loss": 0.2788, + "step": 31966 + }, + { + "epoch": 0.93, + "grad_norm": 1.6082558206093764, + "learning_rate": 1.383533259192821e-07, + "loss": 0.268, + "step": 31967 + }, + { + "epoch": 0.93, + "grad_norm": 1.6100756562620953, + "learning_rate": 1.3824361692375443e-07, + "loss": 0.2723, + "step": 31968 + }, + { + "epoch": 0.93, + "grad_norm": 1.3414968949830819, + "learning_rate": 1.381339508332813e-07, + "loss": 0.2535, + "step": 31969 + }, + { + "epoch": 0.93, + "grad_norm": 1.7255564988825274, + "learning_rate": 1.3802432764883022e-07, + "loss": 0.2611, + "step": 31970 + }, + { + "epoch": 0.93, + "grad_norm": 1.7428392986034942, + "learning_rate": 1.3791474737136824e-07, + "loss": 0.2762, + "step": 31971 + }, + { + "epoch": 0.93, + "grad_norm": 1.2169016639785244, + "learning_rate": 1.378052100018623e-07, + "loss": 0.2591, + "step": 31972 + }, + { + "epoch": 0.93, + "grad_norm": 1.4255893295941204, + "learning_rate": 1.3769571554127892e-07, + "loss": 0.2543, + "step": 31973 + }, + { + "epoch": 0.93, + "grad_norm": 1.5457097032244485, + "learning_rate": 1.375862639905856e-07, + "loss": 0.2584, + "step": 31974 + }, + { + "epoch": 0.93, + "grad_norm": 1.35991533370029, + "learning_rate": 1.3747685535074661e-07, + "loss": 0.2717, + "step": 31975 + }, + { + "epoch": 0.93, + "grad_norm": 1.3085661788737146, + "learning_rate": 1.3736748962272895e-07, + "loss": 0.285, + "step": 31976 + }, + { + "epoch": 0.93, + "grad_norm": 1.3244922495457707, + "learning_rate": 1.3725816680749683e-07, + "loss": 0.265, + "step": 31977 + }, + { + "epoch": 0.93, + "grad_norm": 1.5256462307941605, + "learning_rate": 1.3714888690601613e-07, + "loss": 0.2717, + "step": 31978 + }, + { + "epoch": 0.93, + "grad_norm": 1.4323622441718649, + "learning_rate": 1.3703964991924945e-07, + "loss": 0.2641, + "step": 31979 + }, + { + "epoch": 0.93, + "grad_norm": 1.4852239748968674, + "learning_rate": 1.3693045584816213e-07, + "loss": 0.2891, + "step": 31980 + }, + { + "epoch": 0.93, + "grad_norm": 1.36202949169626, + "learning_rate": 1.3682130469371723e-07, + "loss": 0.2567, + "step": 31981 + }, + { + "epoch": 0.93, + "grad_norm": 1.446003458857306, + "learning_rate": 1.367121964568785e-07, + "loss": 0.2704, + "step": 31982 + }, + { + "epoch": 0.93, + "grad_norm": 1.2466761403577362, + "learning_rate": 1.3660313113860902e-07, + "loss": 0.2913, + "step": 31983 + }, + { + "epoch": 0.93, + "grad_norm": 1.6511245275006603, + "learning_rate": 1.3649410873987023e-07, + "loss": 0.2848, + "step": 31984 + }, + { + "epoch": 0.93, + "grad_norm": 1.229771220521028, + "learning_rate": 1.363851292616253e-07, + "loss": 0.26, + "step": 31985 + }, + { + "epoch": 0.93, + "grad_norm": 1.3459996917583745, + "learning_rate": 1.3627619270483505e-07, + "loss": 0.2689, + "step": 31986 + }, + { + "epoch": 0.93, + "grad_norm": 1.346522363168147, + "learning_rate": 1.361672990704621e-07, + "loss": 0.2696, + "step": 31987 + }, + { + "epoch": 0.93, + "grad_norm": 1.6438785400311084, + "learning_rate": 1.3605844835946625e-07, + "loss": 0.2625, + "step": 31988 + }, + { + "epoch": 0.93, + "grad_norm": 1.4382757556681767, + "learning_rate": 1.3594964057280835e-07, + "loss": 0.2795, + "step": 31989 + }, + { + "epoch": 0.93, + "grad_norm": 2.375365999667802, + "learning_rate": 1.358408757114499e-07, + "loss": 0.2704, + "step": 31990 + }, + { + "epoch": 0.93, + "grad_norm": 1.5338926071808419, + "learning_rate": 1.357321537763484e-07, + "loss": 0.2344, + "step": 31991 + }, + { + "epoch": 0.93, + "grad_norm": 3.3033555432859694, + "learning_rate": 1.3562347476846537e-07, + "loss": 0.2614, + "step": 31992 + }, + { + "epoch": 0.93, + "grad_norm": 1.293811061411021, + "learning_rate": 1.3551483868875836e-07, + "loss": 0.2528, + "step": 31993 + }, + { + "epoch": 0.93, + "grad_norm": 1.5485316588984022, + "learning_rate": 1.3540624553818716e-07, + "loss": 0.2815, + "step": 31994 + }, + { + "epoch": 0.93, + "grad_norm": 1.2892577956049638, + "learning_rate": 1.3529769531770986e-07, + "loss": 0.2719, + "step": 31995 + }, + { + "epoch": 0.93, + "grad_norm": 1.3669797540263704, + "learning_rate": 1.3518918802828463e-07, + "loss": 0.2741, + "step": 31996 + }, + { + "epoch": 0.93, + "grad_norm": 1.3633160984870194, + "learning_rate": 1.3508072367086899e-07, + "loss": 0.2978, + "step": 31997 + }, + { + "epoch": 0.93, + "grad_norm": 2.3087635090115444, + "learning_rate": 1.3497230224641944e-07, + "loss": 0.2773, + "step": 31998 + }, + { + "epoch": 0.93, + "grad_norm": 1.5219363827200554, + "learning_rate": 1.3486392375589352e-07, + "loss": 0.2483, + "step": 31999 + }, + { + "epoch": 0.93, + "grad_norm": 1.340503811869445, + "learning_rate": 1.3475558820024715e-07, + "loss": 0.2728, + "step": 32000 + }, + { + "epoch": 0.93, + "grad_norm": 1.5393658190568569, + "learning_rate": 1.3464729558043678e-07, + "loss": 0.2573, + "step": 32001 + }, + { + "epoch": 0.93, + "grad_norm": 1.485160995860272, + "learning_rate": 1.345390458974183e-07, + "loss": 0.2671, + "step": 32002 + }, + { + "epoch": 0.93, + "grad_norm": 1.3415991687812765, + "learning_rate": 1.3443083915214649e-07, + "loss": 0.2611, + "step": 32003 + }, + { + "epoch": 0.93, + "grad_norm": 1.3067411905811481, + "learning_rate": 1.343226753455762e-07, + "loss": 0.2808, + "step": 32004 + }, + { + "epoch": 0.93, + "grad_norm": 1.3234863822651026, + "learning_rate": 1.3421455447866272e-07, + "loss": 0.2538, + "step": 32005 + }, + { + "epoch": 0.93, + "grad_norm": 1.4696403775280797, + "learning_rate": 1.341064765523603e-07, + "loss": 0.2972, + "step": 32006 + }, + { + "epoch": 0.93, + "grad_norm": 1.429022711513887, + "learning_rate": 1.33998441567621e-07, + "loss": 0.2664, + "step": 32007 + }, + { + "epoch": 0.93, + "grad_norm": 1.3741642837914032, + "learning_rate": 1.338904495254001e-07, + "loss": 0.2722, + "step": 32008 + }, + { + "epoch": 0.93, + "grad_norm": 1.5868101264612455, + "learning_rate": 1.3378250042664964e-07, + "loss": 0.2913, + "step": 32009 + }, + { + "epoch": 0.93, + "grad_norm": 1.638825800259279, + "learning_rate": 1.3367459427232275e-07, + "loss": 0.2819, + "step": 32010 + }, + { + "epoch": 0.93, + "grad_norm": 1.444857518614817, + "learning_rate": 1.3356673106337204e-07, + "loss": 0.2665, + "step": 32011 + }, + { + "epoch": 0.93, + "grad_norm": 1.3207974617304352, + "learning_rate": 1.3345891080074836e-07, + "loss": 0.2596, + "step": 32012 + }, + { + "epoch": 0.93, + "grad_norm": 1.641795209319031, + "learning_rate": 1.3335113348540374e-07, + "loss": 0.2461, + "step": 32013 + }, + { + "epoch": 0.93, + "grad_norm": 1.3084533934484834, + "learning_rate": 1.3324339911828965e-07, + "loss": 0.2529, + "step": 32014 + }, + { + "epoch": 0.93, + "grad_norm": 0.9211077729769278, + "learning_rate": 1.3313570770035644e-07, + "loss": 0.5587, + "step": 32015 + }, + { + "epoch": 0.93, + "grad_norm": 1.486981236502503, + "learning_rate": 1.3302805923255445e-07, + "loss": 0.2926, + "step": 32016 + }, + { + "epoch": 0.93, + "grad_norm": 1.6190150630604945, + "learning_rate": 1.3292045371583406e-07, + "loss": 0.2684, + "step": 32017 + }, + { + "epoch": 0.93, + "grad_norm": 1.4295309659660826, + "learning_rate": 1.328128911511445e-07, + "loss": 0.282, + "step": 32018 + }, + { + "epoch": 0.93, + "grad_norm": 1.6208321333990396, + "learning_rate": 1.3270537153943552e-07, + "loss": 0.3099, + "step": 32019 + }, + { + "epoch": 0.93, + "grad_norm": 1.4364224329331625, + "learning_rate": 1.325978948816553e-07, + "loss": 0.2726, + "step": 32020 + }, + { + "epoch": 0.93, + "grad_norm": 1.2970783186589425, + "learning_rate": 1.3249046117875253e-07, + "loss": 0.2625, + "step": 32021 + }, + { + "epoch": 0.93, + "grad_norm": 0.9466565483744467, + "learning_rate": 1.3238307043167585e-07, + "loss": 0.5927, + "step": 32022 + }, + { + "epoch": 0.93, + "grad_norm": 1.322422211658626, + "learning_rate": 1.3227572264137235e-07, + "loss": 0.2482, + "step": 32023 + }, + { + "epoch": 0.93, + "grad_norm": 1.4915274461091936, + "learning_rate": 1.3216841780878952e-07, + "loss": 0.2561, + "step": 32024 + }, + { + "epoch": 0.93, + "grad_norm": 1.3367644656301079, + "learning_rate": 1.3206115593487502e-07, + "loss": 0.3026, + "step": 32025 + }, + { + "epoch": 0.93, + "grad_norm": 1.3402950179140387, + "learning_rate": 1.3195393702057414e-07, + "loss": 0.2484, + "step": 32026 + }, + { + "epoch": 0.93, + "grad_norm": 1.3226402464155693, + "learning_rate": 1.3184676106683337e-07, + "loss": 0.2714, + "step": 32027 + }, + { + "epoch": 0.93, + "grad_norm": 1.3462609682078757, + "learning_rate": 1.3173962807459971e-07, + "loss": 0.2517, + "step": 32028 + }, + { + "epoch": 0.93, + "grad_norm": 1.5651123244856033, + "learning_rate": 1.3163253804481746e-07, + "loss": 0.2571, + "step": 32029 + }, + { + "epoch": 0.93, + "grad_norm": 1.2366803649138212, + "learning_rate": 1.3152549097843194e-07, + "loss": 0.544, + "step": 32030 + }, + { + "epoch": 0.93, + "grad_norm": 1.4138673048384696, + "learning_rate": 1.3141848687638847e-07, + "loss": 0.256, + "step": 32031 + }, + { + "epoch": 0.93, + "grad_norm": 1.687983347417439, + "learning_rate": 1.3131152573963023e-07, + "loss": 0.2868, + "step": 32032 + }, + { + "epoch": 0.93, + "grad_norm": 1.3927640770473293, + "learning_rate": 1.3120460756910258e-07, + "loss": 0.2571, + "step": 32033 + }, + { + "epoch": 0.93, + "grad_norm": 1.2703937926600104, + "learning_rate": 1.3109773236574807e-07, + "loss": 0.2652, + "step": 32034 + }, + { + "epoch": 0.93, + "grad_norm": 1.039289888029125, + "learning_rate": 1.3099090013050985e-07, + "loss": 0.5651, + "step": 32035 + }, + { + "epoch": 0.93, + "grad_norm": 1.3027708670744542, + "learning_rate": 1.3088411086433105e-07, + "loss": 0.307, + "step": 32036 + }, + { + "epoch": 0.93, + "grad_norm": 1.421075149475547, + "learning_rate": 1.307773645681537e-07, + "loss": 0.2811, + "step": 32037 + }, + { + "epoch": 0.93, + "grad_norm": 1.3952031710604367, + "learning_rate": 1.3067066124292037e-07, + "loss": 0.2852, + "step": 32038 + }, + { + "epoch": 0.93, + "grad_norm": 1.3456777345866124, + "learning_rate": 1.30564000889572e-07, + "loss": 0.2712, + "step": 32039 + }, + { + "epoch": 0.93, + "grad_norm": 3.4580983113432606, + "learning_rate": 1.3045738350905113e-07, + "loss": 0.2709, + "step": 32040 + }, + { + "epoch": 0.93, + "grad_norm": 1.3442838181463337, + "learning_rate": 1.3035080910229704e-07, + "loss": 0.2541, + "step": 32041 + }, + { + "epoch": 0.93, + "grad_norm": 1.3323816777065398, + "learning_rate": 1.3024427767025228e-07, + "loss": 0.2711, + "step": 32042 + }, + { + "epoch": 0.93, + "grad_norm": 1.9609848483455807, + "learning_rate": 1.30137789213855e-07, + "loss": 0.2835, + "step": 32043 + }, + { + "epoch": 0.93, + "grad_norm": 1.3087388958156325, + "learning_rate": 1.300313437340456e-07, + "loss": 0.2696, + "step": 32044 + }, + { + "epoch": 0.93, + "grad_norm": 1.7867062977118815, + "learning_rate": 1.299249412317638e-07, + "loss": 0.2571, + "step": 32045 + }, + { + "epoch": 0.93, + "grad_norm": 1.400919595508324, + "learning_rate": 1.2981858170794836e-07, + "loss": 0.2799, + "step": 32046 + }, + { + "epoch": 0.93, + "grad_norm": 1.2422541524925157, + "learning_rate": 1.2971226516353853e-07, + "loss": 0.2574, + "step": 32047 + }, + { + "epoch": 0.93, + "grad_norm": 2.078515483741236, + "learning_rate": 1.2960599159947184e-07, + "loss": 0.2776, + "step": 32048 + }, + { + "epoch": 0.93, + "grad_norm": 1.3352205152877379, + "learning_rate": 1.2949976101668594e-07, + "loss": 0.2965, + "step": 32049 + }, + { + "epoch": 0.93, + "grad_norm": 1.338794307048656, + "learning_rate": 1.2939357341611837e-07, + "loss": 0.2888, + "step": 32050 + }, + { + "epoch": 0.93, + "grad_norm": 1.290549903086209, + "learning_rate": 1.292874287987067e-07, + "loss": 0.2785, + "step": 32051 + }, + { + "epoch": 0.93, + "grad_norm": 1.353437672250533, + "learning_rate": 1.2918132716538855e-07, + "loss": 0.2736, + "step": 32052 + }, + { + "epoch": 0.93, + "grad_norm": 1.3278072510333772, + "learning_rate": 1.2907526851709818e-07, + "loss": 0.2799, + "step": 32053 + }, + { + "epoch": 0.93, + "grad_norm": 2.1770884749448474, + "learning_rate": 1.289692528547726e-07, + "loss": 0.2785, + "step": 32054 + }, + { + "epoch": 0.93, + "grad_norm": 1.2864541951982633, + "learning_rate": 1.2886328017934768e-07, + "loss": 0.3112, + "step": 32055 + }, + { + "epoch": 0.93, + "grad_norm": 1.453883857035641, + "learning_rate": 1.287573504917583e-07, + "loss": 0.2739, + "step": 32056 + }, + { + "epoch": 0.93, + "grad_norm": 1.3090270142277705, + "learning_rate": 1.2865146379293924e-07, + "loss": 0.2747, + "step": 32057 + }, + { + "epoch": 0.93, + "grad_norm": 1.3499813915278613, + "learning_rate": 1.2854562008382475e-07, + "loss": 0.2521, + "step": 32058 + }, + { + "epoch": 0.93, + "grad_norm": 1.41385571449623, + "learning_rate": 1.2843981936534967e-07, + "loss": 0.2774, + "step": 32059 + }, + { + "epoch": 0.93, + "grad_norm": 1.3471712344097126, + "learning_rate": 1.2833406163844709e-07, + "loss": 0.2653, + "step": 32060 + }, + { + "epoch": 0.93, + "grad_norm": 1.4492734720825653, + "learning_rate": 1.2822834690405127e-07, + "loss": 0.2771, + "step": 32061 + }, + { + "epoch": 0.93, + "grad_norm": 1.3033069558005335, + "learning_rate": 1.2812267516309318e-07, + "loss": 0.2428, + "step": 32062 + }, + { + "epoch": 0.93, + "grad_norm": 1.8164688152645267, + "learning_rate": 1.2801704641650702e-07, + "loss": 0.2645, + "step": 32063 + }, + { + "epoch": 0.93, + "grad_norm": 0.931382522265915, + "learning_rate": 1.279114606652243e-07, + "loss": 0.5446, + "step": 32064 + }, + { + "epoch": 0.93, + "grad_norm": 1.285554824301222, + "learning_rate": 1.2780591791017705e-07, + "loss": 0.2562, + "step": 32065 + }, + { + "epoch": 0.93, + "grad_norm": 1.532816850203727, + "learning_rate": 1.2770041815229673e-07, + "loss": 0.2452, + "step": 32066 + }, + { + "epoch": 0.93, + "grad_norm": 1.3542674984149312, + "learning_rate": 1.275949613925137e-07, + "loss": 0.2901, + "step": 32067 + }, + { + "epoch": 0.93, + "grad_norm": 1.3635650566338455, + "learning_rate": 1.2748954763175948e-07, + "loss": 0.2658, + "step": 32068 + }, + { + "epoch": 0.93, + "grad_norm": 0.9429339161921854, + "learning_rate": 1.2738417687096494e-07, + "loss": 0.5823, + "step": 32069 + }, + { + "epoch": 0.93, + "grad_norm": 1.2955110360066984, + "learning_rate": 1.2727884911105827e-07, + "loss": 0.2657, + "step": 32070 + }, + { + "epoch": 0.93, + "grad_norm": 1.4632264957613905, + "learning_rate": 1.271735643529698e-07, + "loss": 0.2752, + "step": 32071 + }, + { + "epoch": 0.93, + "grad_norm": 1.2795621563144204, + "learning_rate": 1.270683225976288e-07, + "loss": 0.2583, + "step": 32072 + }, + { + "epoch": 0.93, + "grad_norm": 2.5928762742358824, + "learning_rate": 1.2696312384596398e-07, + "loss": 0.2493, + "step": 32073 + }, + { + "epoch": 0.93, + "grad_norm": 1.7052636062085353, + "learning_rate": 1.2685796809890348e-07, + "loss": 0.2705, + "step": 32074 + }, + { + "epoch": 0.93, + "grad_norm": 1.308842872307246, + "learning_rate": 1.2675285535737604e-07, + "loss": 0.2804, + "step": 32075 + }, + { + "epoch": 0.93, + "grad_norm": 1.3848998665071406, + "learning_rate": 1.2664778562230806e-07, + "loss": 0.2599, + "step": 32076 + }, + { + "epoch": 0.93, + "grad_norm": 1.5593538943699172, + "learning_rate": 1.2654275889462775e-07, + "loss": 0.2756, + "step": 32077 + }, + { + "epoch": 0.93, + "grad_norm": 1.2965776684853338, + "learning_rate": 1.2643777517526156e-07, + "loss": 0.2502, + "step": 32078 + }, + { + "epoch": 0.93, + "grad_norm": 0.892477763928606, + "learning_rate": 1.26332834465136e-07, + "loss": 0.5251, + "step": 32079 + }, + { + "epoch": 0.93, + "grad_norm": 6.455420721243902, + "learning_rate": 1.2622793676517753e-07, + "loss": 0.29, + "step": 32080 + }, + { + "epoch": 0.93, + "grad_norm": 1.3442726407902348, + "learning_rate": 1.2612308207631153e-07, + "loss": 0.2769, + "step": 32081 + }, + { + "epoch": 0.93, + "grad_norm": 1.4268732051152027, + "learning_rate": 1.2601827039946279e-07, + "loss": 0.2812, + "step": 32082 + }, + { + "epoch": 0.93, + "grad_norm": 1.4040028738759829, + "learning_rate": 1.2591350173555729e-07, + "loss": 0.2429, + "step": 32083 + }, + { + "epoch": 0.93, + "grad_norm": 1.4435863413780494, + "learning_rate": 1.2580877608551922e-07, + "loss": 0.283, + "step": 32084 + }, + { + "epoch": 0.93, + "grad_norm": 1.2836643198067006, + "learning_rate": 1.257040934502729e-07, + "loss": 0.2545, + "step": 32085 + }, + { + "epoch": 0.93, + "grad_norm": 1.9619895887936527, + "learning_rate": 1.2559945383074145e-07, + "loss": 0.3077, + "step": 32086 + }, + { + "epoch": 0.93, + "grad_norm": 1.4795090516783713, + "learning_rate": 1.254948572278497e-07, + "loss": 0.2688, + "step": 32087 + }, + { + "epoch": 0.93, + "grad_norm": 1.4547817251205184, + "learning_rate": 1.253903036425197e-07, + "loss": 0.2504, + "step": 32088 + }, + { + "epoch": 0.93, + "grad_norm": 1.3223623226392105, + "learning_rate": 1.2528579307567402e-07, + "loss": 0.269, + "step": 32089 + }, + { + "epoch": 0.93, + "grad_norm": 1.5868536688679613, + "learning_rate": 1.2518132552823582e-07, + "loss": 0.2651, + "step": 32090 + }, + { + "epoch": 0.93, + "grad_norm": 1.4781153184871287, + "learning_rate": 1.25076901001126e-07, + "loss": 0.2786, + "step": 32091 + }, + { + "epoch": 0.93, + "grad_norm": 1.535589218241998, + "learning_rate": 1.2497251949526667e-07, + "loss": 0.2827, + "step": 32092 + }, + { + "epoch": 0.93, + "grad_norm": 1.3932186197430647, + "learning_rate": 1.2486818101157873e-07, + "loss": 0.2622, + "step": 32093 + }, + { + "epoch": 0.93, + "grad_norm": 1.3029823261422058, + "learning_rate": 1.2476388555098363e-07, + "loss": 0.257, + "step": 32094 + }, + { + "epoch": 0.93, + "grad_norm": 1.3713166856358203, + "learning_rate": 1.2465963311440122e-07, + "loss": 0.2635, + "step": 32095 + }, + { + "epoch": 0.93, + "grad_norm": 2.4490721386563066, + "learning_rate": 1.245554237027513e-07, + "loss": 0.272, + "step": 32096 + }, + { + "epoch": 0.93, + "grad_norm": 4.574302293425755, + "learning_rate": 1.2445125731695484e-07, + "loss": 0.2575, + "step": 32097 + }, + { + "epoch": 0.93, + "grad_norm": 1.3960198684070009, + "learning_rate": 1.2434713395792941e-07, + "loss": 0.2578, + "step": 32098 + }, + { + "epoch": 0.93, + "grad_norm": 1.0334577586882585, + "learning_rate": 1.2424305362659428e-07, + "loss": 0.5542, + "step": 32099 + }, + { + "epoch": 0.93, + "grad_norm": 1.3676578798827197, + "learning_rate": 1.2413901632386871e-07, + "loss": 0.2842, + "step": 32100 + }, + { + "epoch": 0.93, + "grad_norm": 1.5433268718962325, + "learning_rate": 1.2403502205067032e-07, + "loss": 0.2739, + "step": 32101 + }, + { + "epoch": 0.93, + "grad_norm": 1.4940371783119164, + "learning_rate": 1.2393107080791666e-07, + "loss": 0.2756, + "step": 32102 + }, + { + "epoch": 0.93, + "grad_norm": 2.04470708630273, + "learning_rate": 1.2382716259652594e-07, + "loss": 0.2889, + "step": 32103 + }, + { + "epoch": 0.93, + "grad_norm": 1.482498253419966, + "learning_rate": 1.2372329741741463e-07, + "loss": 0.2821, + "step": 32104 + }, + { + "epoch": 0.93, + "grad_norm": 1.3506244780990164, + "learning_rate": 1.2361947527149864e-07, + "loss": 0.2548, + "step": 32105 + }, + { + "epoch": 0.93, + "grad_norm": 1.4077757447655492, + "learning_rate": 1.235156961596956e-07, + "loss": 0.2978, + "step": 32106 + }, + { + "epoch": 0.93, + "grad_norm": 1.5883213860555452, + "learning_rate": 1.2341196008292032e-07, + "loss": 0.2732, + "step": 32107 + }, + { + "epoch": 0.93, + "grad_norm": 1.6832551346970277, + "learning_rate": 1.2330826704208875e-07, + "loss": 0.257, + "step": 32108 + }, + { + "epoch": 0.93, + "grad_norm": 1.2833334689516465, + "learning_rate": 1.232046170381157e-07, + "loss": 0.2597, + "step": 32109 + }, + { + "epoch": 0.93, + "grad_norm": 1.789274485942761, + "learning_rate": 1.23101010071916e-07, + "loss": 0.2568, + "step": 32110 + }, + { + "epoch": 0.93, + "grad_norm": 1.449451849617553, + "learning_rate": 1.229974461444039e-07, + "loss": 0.2755, + "step": 32111 + }, + { + "epoch": 0.93, + "grad_norm": 1.805740417364996, + "learning_rate": 1.2289392525649312e-07, + "loss": 0.2628, + "step": 32112 + }, + { + "epoch": 0.93, + "grad_norm": 2.0112642880015104, + "learning_rate": 1.2279044740909794e-07, + "loss": 0.2712, + "step": 32113 + }, + { + "epoch": 0.93, + "grad_norm": 1.282547575643111, + "learning_rate": 1.2268701260313098e-07, + "loss": 0.2692, + "step": 32114 + }, + { + "epoch": 0.93, + "grad_norm": 1.4751700582425689, + "learning_rate": 1.2258362083950537e-07, + "loss": 0.2716, + "step": 32115 + }, + { + "epoch": 0.93, + "grad_norm": 1.5229343945928293, + "learning_rate": 1.2248027211913426e-07, + "loss": 0.2796, + "step": 32116 + }, + { + "epoch": 0.93, + "grad_norm": 1.0483664046865793, + "learning_rate": 1.2237696644292752e-07, + "loss": 0.5785, + "step": 32117 + }, + { + "epoch": 0.93, + "grad_norm": 1.388927911808154, + "learning_rate": 1.2227370381179882e-07, + "loss": 0.2616, + "step": 32118 + }, + { + "epoch": 0.93, + "grad_norm": 1.317211885070341, + "learning_rate": 1.2217048422665855e-07, + "loss": 0.2709, + "step": 32119 + }, + { + "epoch": 0.93, + "grad_norm": 1.3774227624238438, + "learning_rate": 1.2206730768841822e-07, + "loss": 0.2508, + "step": 32120 + }, + { + "epoch": 0.93, + "grad_norm": 1.7743778598821693, + "learning_rate": 1.2196417419798767e-07, + "loss": 0.2616, + "step": 32121 + }, + { + "epoch": 0.93, + "grad_norm": 1.2524740359855502, + "learning_rate": 1.2186108375627725e-07, + "loss": 0.276, + "step": 32122 + }, + { + "epoch": 0.93, + "grad_norm": 1.5724785664992793, + "learning_rate": 1.2175803636419738e-07, + "loss": 0.2596, + "step": 32123 + }, + { + "epoch": 0.93, + "grad_norm": 0.9568220972362488, + "learning_rate": 1.2165503202265673e-07, + "loss": 0.5534, + "step": 32124 + }, + { + "epoch": 0.93, + "grad_norm": 1.574101274538689, + "learning_rate": 1.215520707325646e-07, + "loss": 0.2753, + "step": 32125 + }, + { + "epoch": 0.93, + "grad_norm": 1.365614441198071, + "learning_rate": 1.214491524948297e-07, + "loss": 0.2602, + "step": 32126 + }, + { + "epoch": 0.93, + "grad_norm": 1.363702815897733, + "learning_rate": 1.2134627731035963e-07, + "loss": 0.2674, + "step": 32127 + }, + { + "epoch": 0.93, + "grad_norm": 1.4291279532187877, + "learning_rate": 1.2124344518006316e-07, + "loss": 0.2883, + "step": 32128 + }, + { + "epoch": 0.93, + "grad_norm": 1.3977552066236534, + "learning_rate": 1.2114065610484726e-07, + "loss": 0.2723, + "step": 32129 + }, + { + "epoch": 0.93, + "grad_norm": 1.3871720002602776, + "learning_rate": 1.2103791008561904e-07, + "loss": 0.2693, + "step": 32130 + }, + { + "epoch": 0.93, + "grad_norm": 1.2523472994400986, + "learning_rate": 1.209352071232861e-07, + "loss": 0.2591, + "step": 32131 + }, + { + "epoch": 0.93, + "grad_norm": 1.2745569219034345, + "learning_rate": 1.2083254721875325e-07, + "loss": 0.2662, + "step": 32132 + }, + { + "epoch": 0.93, + "grad_norm": 1.4255037630178509, + "learning_rate": 1.2072993037292757e-07, + "loss": 0.2573, + "step": 32133 + }, + { + "epoch": 0.93, + "grad_norm": 1.2848505440748283, + "learning_rate": 1.2062735658671442e-07, + "loss": 0.2601, + "step": 32134 + }, + { + "epoch": 0.93, + "grad_norm": 1.7862076353657645, + "learning_rate": 1.2052482586101977e-07, + "loss": 0.2621, + "step": 32135 + }, + { + "epoch": 0.93, + "grad_norm": 1.2737361907594413, + "learning_rate": 1.2042233819674675e-07, + "loss": 0.264, + "step": 32136 + }, + { + "epoch": 0.93, + "grad_norm": 0.9520421641052399, + "learning_rate": 1.203198935948008e-07, + "loss": 0.5198, + "step": 32137 + }, + { + "epoch": 0.93, + "grad_norm": 1.332008460408012, + "learning_rate": 1.2021749205608614e-07, + "loss": 0.2605, + "step": 32138 + }, + { + "epoch": 0.93, + "grad_norm": 2.3935862787016844, + "learning_rate": 1.2011513358150595e-07, + "loss": 0.2761, + "step": 32139 + }, + { + "epoch": 0.93, + "grad_norm": 1.2856564036252436, + "learning_rate": 1.2001281817196397e-07, + "loss": 0.2443, + "step": 32140 + }, + { + "epoch": 0.93, + "grad_norm": 1.3287154316112144, + "learning_rate": 1.1991054582836282e-07, + "loss": 0.2758, + "step": 32141 + }, + { + "epoch": 0.93, + "grad_norm": 1.2846530765183606, + "learning_rate": 1.1980831655160563e-07, + "loss": 0.2481, + "step": 32142 + }, + { + "epoch": 0.93, + "grad_norm": 1.3599622980067736, + "learning_rate": 1.197061303425945e-07, + "loss": 0.2805, + "step": 32143 + }, + { + "epoch": 0.93, + "grad_norm": 1.3605795498430882, + "learning_rate": 1.1960398720223033e-07, + "loss": 0.2536, + "step": 32144 + }, + { + "epoch": 0.93, + "grad_norm": 1.4698018986735406, + "learning_rate": 1.195018871314152e-07, + "loss": 0.2777, + "step": 32145 + }, + { + "epoch": 0.93, + "grad_norm": 1.6053516299339, + "learning_rate": 1.1939983013104949e-07, + "loss": 0.2689, + "step": 32146 + }, + { + "epoch": 0.93, + "grad_norm": 1.3236628201627354, + "learning_rate": 1.1929781620203529e-07, + "loss": 0.2707, + "step": 32147 + }, + { + "epoch": 0.93, + "grad_norm": 1.438952440128901, + "learning_rate": 1.1919584534527184e-07, + "loss": 0.2529, + "step": 32148 + }, + { + "epoch": 0.93, + "grad_norm": 1.3132440077427003, + "learning_rate": 1.19093917561659e-07, + "loss": 0.2553, + "step": 32149 + }, + { + "epoch": 0.93, + "grad_norm": 3.1567747040404965, + "learning_rate": 1.1899203285209604e-07, + "loss": 0.2776, + "step": 32150 + }, + { + "epoch": 0.93, + "grad_norm": 1.285910961372216, + "learning_rate": 1.1889019121748335e-07, + "loss": 0.2699, + "step": 32151 + }, + { + "epoch": 0.93, + "grad_norm": 1.3557876912068727, + "learning_rate": 1.1878839265871855e-07, + "loss": 0.2843, + "step": 32152 + }, + { + "epoch": 0.93, + "grad_norm": 1.3397584982267163, + "learning_rate": 1.1868663717670037e-07, + "loss": 0.2746, + "step": 32153 + }, + { + "epoch": 0.93, + "grad_norm": 1.6316859086719302, + "learning_rate": 1.1858492477232697e-07, + "loss": 0.2752, + "step": 32154 + }, + { + "epoch": 0.93, + "grad_norm": 4.413865211548676, + "learning_rate": 1.1848325544649542e-07, + "loss": 0.2869, + "step": 32155 + }, + { + "epoch": 0.93, + "grad_norm": 1.4590582688363414, + "learning_rate": 1.1838162920010333e-07, + "loss": 0.2734, + "step": 32156 + }, + { + "epoch": 0.93, + "grad_norm": 1.4153969230319987, + "learning_rate": 1.1828004603404775e-07, + "loss": 0.2721, + "step": 32157 + }, + { + "epoch": 0.93, + "grad_norm": 1.4135846254501154, + "learning_rate": 1.1817850594922465e-07, + "loss": 0.2572, + "step": 32158 + }, + { + "epoch": 0.93, + "grad_norm": 1.6641395482528059, + "learning_rate": 1.1807700894653107e-07, + "loss": 0.2712, + "step": 32159 + }, + { + "epoch": 0.93, + "grad_norm": 1.4645344870116446, + "learning_rate": 1.179755550268613e-07, + "loss": 0.2489, + "step": 32160 + }, + { + "epoch": 0.93, + "grad_norm": 2.401076149944746, + "learning_rate": 1.1787414419111131e-07, + "loss": 0.2517, + "step": 32161 + }, + { + "epoch": 0.93, + "grad_norm": 1.3142150342519718, + "learning_rate": 1.1777277644017704e-07, + "loss": 0.2765, + "step": 32162 + }, + { + "epoch": 0.93, + "grad_norm": 1.5603914750465355, + "learning_rate": 1.1767145177495165e-07, + "loss": 0.2545, + "step": 32163 + }, + { + "epoch": 0.93, + "grad_norm": 1.4619832716861494, + "learning_rate": 1.1757017019632943e-07, + "loss": 0.2644, + "step": 32164 + }, + { + "epoch": 0.93, + "grad_norm": 1.4242545790692624, + "learning_rate": 1.1746893170520524e-07, + "loss": 0.253, + "step": 32165 + }, + { + "epoch": 0.93, + "grad_norm": 1.8693687125788703, + "learning_rate": 1.1736773630247112e-07, + "loss": 0.2437, + "step": 32166 + }, + { + "epoch": 0.93, + "grad_norm": 1.305851187594647, + "learning_rate": 1.1726658398902135e-07, + "loss": 0.2678, + "step": 32167 + }, + { + "epoch": 0.93, + "grad_norm": 1.608050168418617, + "learning_rate": 1.1716547476574857e-07, + "loss": 0.274, + "step": 32168 + }, + { + "epoch": 0.93, + "grad_norm": 1.4324857683048766, + "learning_rate": 1.170644086335443e-07, + "loss": 0.2748, + "step": 32169 + }, + { + "epoch": 0.93, + "grad_norm": 0.9602943410338134, + "learning_rate": 1.1696338559330056e-07, + "loss": 0.5385, + "step": 32170 + }, + { + "epoch": 0.93, + "grad_norm": 1.596598802695917, + "learning_rate": 1.1686240564590945e-07, + "loss": 0.2639, + "step": 32171 + }, + { + "epoch": 0.93, + "grad_norm": 1.678417990503338, + "learning_rate": 1.1676146879226192e-07, + "loss": 0.275, + "step": 32172 + }, + { + "epoch": 0.93, + "grad_norm": 1.3094009939663598, + "learning_rate": 1.1666057503324834e-07, + "loss": 0.2991, + "step": 32173 + }, + { + "epoch": 0.93, + "grad_norm": 1.3154256413001764, + "learning_rate": 1.1655972436975915e-07, + "loss": 0.2622, + "step": 32174 + }, + { + "epoch": 0.93, + "grad_norm": 1.2945591015767768, + "learning_rate": 1.1645891680268473e-07, + "loss": 0.2844, + "step": 32175 + }, + { + "epoch": 0.93, + "grad_norm": 1.3104184646410464, + "learning_rate": 1.1635815233291436e-07, + "loss": 0.2567, + "step": 32176 + }, + { + "epoch": 0.93, + "grad_norm": 2.326464188262218, + "learning_rate": 1.162574309613379e-07, + "loss": 0.2642, + "step": 32177 + }, + { + "epoch": 0.93, + "grad_norm": 1.4055227258382745, + "learning_rate": 1.1615675268884352e-07, + "loss": 0.2631, + "step": 32178 + }, + { + "epoch": 0.93, + "grad_norm": 1.3237795481945984, + "learning_rate": 1.160561175163205e-07, + "loss": 0.2854, + "step": 32179 + }, + { + "epoch": 0.93, + "grad_norm": 1.6152416258023428, + "learning_rate": 1.1595552544465593e-07, + "loss": 0.2569, + "step": 32180 + }, + { + "epoch": 0.93, + "grad_norm": 1.4554508442866492, + "learning_rate": 1.1585497647473798e-07, + "loss": 0.2917, + "step": 32181 + }, + { + "epoch": 0.93, + "grad_norm": 1.3478139896196677, + "learning_rate": 1.157544706074537e-07, + "loss": 0.2796, + "step": 32182 + }, + { + "epoch": 0.93, + "grad_norm": 1.3023987912767652, + "learning_rate": 1.1565400784369074e-07, + "loss": 0.2893, + "step": 32183 + }, + { + "epoch": 0.93, + "grad_norm": 1.4399432288927796, + "learning_rate": 1.155535881843356e-07, + "loss": 0.2696, + "step": 32184 + }, + { + "epoch": 0.93, + "grad_norm": 1.3246826735684065, + "learning_rate": 1.1545321163027368e-07, + "loss": 0.2655, + "step": 32185 + }, + { + "epoch": 0.93, + "grad_norm": 1.3547188156338483, + "learning_rate": 1.1535287818239205e-07, + "loss": 0.2759, + "step": 32186 + }, + { + "epoch": 0.93, + "grad_norm": 1.3230991502978724, + "learning_rate": 1.1525258784157556e-07, + "loss": 0.2694, + "step": 32187 + }, + { + "epoch": 0.93, + "grad_norm": 1.2299988504920534, + "learning_rate": 1.151523406087085e-07, + "loss": 0.2588, + "step": 32188 + }, + { + "epoch": 0.93, + "grad_norm": 1.487602886115454, + "learning_rate": 1.1505213648467684e-07, + "loss": 0.2951, + "step": 32189 + }, + { + "epoch": 0.93, + "grad_norm": 1.3434703989893453, + "learning_rate": 1.1495197547036375e-07, + "loss": 0.2561, + "step": 32190 + }, + { + "epoch": 0.93, + "grad_norm": 1.4111476220779957, + "learning_rate": 1.1485185756665407e-07, + "loss": 0.2868, + "step": 32191 + }, + { + "epoch": 0.93, + "grad_norm": 1.4236944715395126, + "learning_rate": 1.1475178277443044e-07, + "loss": 0.2651, + "step": 32192 + }, + { + "epoch": 0.93, + "grad_norm": 1.291121205207494, + "learning_rate": 1.146517510945766e-07, + "loss": 0.2638, + "step": 32193 + }, + { + "epoch": 0.93, + "grad_norm": 1.4153572634341058, + "learning_rate": 1.1455176252797573e-07, + "loss": 0.2653, + "step": 32194 + }, + { + "epoch": 0.93, + "grad_norm": 1.5252361841665087, + "learning_rate": 1.1445181707550934e-07, + "loss": 0.2616, + "step": 32195 + }, + { + "epoch": 0.93, + "grad_norm": 0.9795674683182074, + "learning_rate": 1.1435191473805951e-07, + "loss": 0.5427, + "step": 32196 + }, + { + "epoch": 0.93, + "grad_norm": 1.3109299676384267, + "learning_rate": 1.1425205551650887e-07, + "loss": 0.2677, + "step": 32197 + }, + { + "epoch": 0.93, + "grad_norm": 1.2096289728258007, + "learning_rate": 1.1415223941173837e-07, + "loss": 0.2608, + "step": 32198 + }, + { + "epoch": 0.93, + "grad_norm": 1.4471465361001685, + "learning_rate": 1.1405246642462786e-07, + "loss": 0.2809, + "step": 32199 + }, + { + "epoch": 0.93, + "grad_norm": 1.4148139638503545, + "learning_rate": 1.1395273655605888e-07, + "loss": 0.2759, + "step": 32200 + }, + { + "epoch": 0.93, + "grad_norm": 2.4948554290604927, + "learning_rate": 1.138530498069107e-07, + "loss": 0.2511, + "step": 32201 + }, + { + "epoch": 0.93, + "grad_norm": 1.3431482693697046, + "learning_rate": 1.1375340617806375e-07, + "loss": 0.3101, + "step": 32202 + }, + { + "epoch": 0.93, + "grad_norm": 1.2674963033822308, + "learning_rate": 1.136538056703973e-07, + "loss": 0.2913, + "step": 32203 + }, + { + "epoch": 0.93, + "grad_norm": 1.4684702490827974, + "learning_rate": 1.1355424828479067e-07, + "loss": 0.2969, + "step": 32204 + }, + { + "epoch": 0.93, + "grad_norm": 1.4244018540452854, + "learning_rate": 1.1345473402212148e-07, + "loss": 0.2608, + "step": 32205 + }, + { + "epoch": 0.93, + "grad_norm": 1.3617223075564628, + "learning_rate": 1.1335526288326848e-07, + "loss": 0.2668, + "step": 32206 + }, + { + "epoch": 0.93, + "grad_norm": 1.6574317699196437, + "learning_rate": 1.1325583486910985e-07, + "loss": 0.2581, + "step": 32207 + }, + { + "epoch": 0.93, + "grad_norm": 1.7958549511124287, + "learning_rate": 1.1315644998052266e-07, + "loss": 0.2802, + "step": 32208 + }, + { + "epoch": 0.93, + "grad_norm": 2.329388806155235, + "learning_rate": 1.1305710821838345e-07, + "loss": 0.2804, + "step": 32209 + }, + { + "epoch": 0.93, + "grad_norm": 1.50121368417131, + "learning_rate": 1.1295780958356983e-07, + "loss": 0.2634, + "step": 32210 + }, + { + "epoch": 0.93, + "grad_norm": 1.5051070911849953, + "learning_rate": 1.1285855407695779e-07, + "loss": 0.2648, + "step": 32211 + }, + { + "epoch": 0.93, + "grad_norm": 1.3776975747064728, + "learning_rate": 1.1275934169942326e-07, + "loss": 0.2686, + "step": 32212 + }, + { + "epoch": 0.93, + "grad_norm": 1.2448755109298608, + "learning_rate": 1.1266017245184169e-07, + "loss": 0.2565, + "step": 32213 + }, + { + "epoch": 0.93, + "grad_norm": 1.4275238401315973, + "learning_rate": 1.1256104633508791e-07, + "loss": 0.2804, + "step": 32214 + }, + { + "epoch": 0.93, + "grad_norm": 1.6432955137172043, + "learning_rate": 1.1246196335003789e-07, + "loss": 0.2855, + "step": 32215 + }, + { + "epoch": 0.93, + "grad_norm": 1.4842281696942357, + "learning_rate": 1.1236292349756483e-07, + "loss": 0.2736, + "step": 32216 + }, + { + "epoch": 0.93, + "grad_norm": 1.358425404358729, + "learning_rate": 1.1226392677854359e-07, + "loss": 0.2708, + "step": 32217 + }, + { + "epoch": 0.93, + "grad_norm": 1.5182209029255604, + "learning_rate": 1.1216497319384733e-07, + "loss": 0.2709, + "step": 32218 + }, + { + "epoch": 0.93, + "grad_norm": 1.5040616128901443, + "learning_rate": 1.1206606274434872e-07, + "loss": 0.2652, + "step": 32219 + }, + { + "epoch": 0.93, + "grad_norm": 1.9609942037234127, + "learning_rate": 1.1196719543092205e-07, + "loss": 0.2723, + "step": 32220 + }, + { + "epoch": 0.93, + "grad_norm": 1.351029166877415, + "learning_rate": 1.1186837125443884e-07, + "loss": 0.2716, + "step": 32221 + }, + { + "epoch": 0.93, + "grad_norm": 1.3384496986995182, + "learning_rate": 1.1176959021577116e-07, + "loss": 0.2549, + "step": 32222 + }, + { + "epoch": 0.93, + "grad_norm": 1.3323914503351075, + "learning_rate": 1.1167085231579111e-07, + "loss": 0.2729, + "step": 32223 + }, + { + "epoch": 0.93, + "grad_norm": 1.2731928348080532, + "learning_rate": 1.115721575553702e-07, + "loss": 0.273, + "step": 32224 + }, + { + "epoch": 0.93, + "grad_norm": 1.379528336129859, + "learning_rate": 1.1147350593537998e-07, + "loss": 0.2693, + "step": 32225 + }, + { + "epoch": 0.93, + "grad_norm": 1.5003553088938277, + "learning_rate": 1.1137489745668972e-07, + "loss": 0.2707, + "step": 32226 + }, + { + "epoch": 0.93, + "grad_norm": 1.5008603409787074, + "learning_rate": 1.1127633212016986e-07, + "loss": 0.2737, + "step": 32227 + }, + { + "epoch": 0.93, + "grad_norm": 1.320777303971541, + "learning_rate": 1.1117780992669081e-07, + "loss": 0.3047, + "step": 32228 + }, + { + "epoch": 0.93, + "grad_norm": 1.2886680550045189, + "learning_rate": 1.1107933087712186e-07, + "loss": 0.2843, + "step": 32229 + }, + { + "epoch": 0.93, + "grad_norm": 1.3756650335651144, + "learning_rate": 1.1098089497233177e-07, + "loss": 0.2814, + "step": 32230 + }, + { + "epoch": 0.93, + "grad_norm": 1.4672629258495922, + "learning_rate": 1.1088250221318986e-07, + "loss": 0.2807, + "step": 32231 + }, + { + "epoch": 0.93, + "grad_norm": 1.211198867719289, + "learning_rate": 1.107841526005643e-07, + "loss": 0.2636, + "step": 32232 + }, + { + "epoch": 0.93, + "grad_norm": 1.69593314955376, + "learning_rate": 1.1068584613532218e-07, + "loss": 0.2675, + "step": 32233 + }, + { + "epoch": 0.93, + "grad_norm": 1.2901358377692524, + "learning_rate": 1.1058758281833281e-07, + "loss": 0.2756, + "step": 32234 + }, + { + "epoch": 0.93, + "grad_norm": 2.781203582495052, + "learning_rate": 1.1048936265046162e-07, + "loss": 0.2812, + "step": 32235 + }, + { + "epoch": 0.94, + "grad_norm": 1.4646950012967586, + "learning_rate": 1.1039118563257567e-07, + "loss": 0.2681, + "step": 32236 + }, + { + "epoch": 0.94, + "grad_norm": 1.4154275676755417, + "learning_rate": 1.1029305176554206e-07, + "loss": 0.2758, + "step": 32237 + }, + { + "epoch": 0.94, + "grad_norm": 1.440429021098351, + "learning_rate": 1.101949610502262e-07, + "loss": 0.266, + "step": 32238 + }, + { + "epoch": 0.94, + "grad_norm": 1.2863148986489819, + "learning_rate": 1.1009691348749463e-07, + "loss": 0.2524, + "step": 32239 + }, + { + "epoch": 0.94, + "grad_norm": 1.447970994657798, + "learning_rate": 1.0999890907821165e-07, + "loss": 0.2753, + "step": 32240 + }, + { + "epoch": 0.94, + "grad_norm": 4.2598273435020975, + "learning_rate": 1.0990094782324268e-07, + "loss": 0.2709, + "step": 32241 + }, + { + "epoch": 0.94, + "grad_norm": 1.2825753567263927, + "learning_rate": 1.0980302972345202e-07, + "loss": 0.2524, + "step": 32242 + }, + { + "epoch": 0.94, + "grad_norm": 1.496356034285606, + "learning_rate": 1.0970515477970455e-07, + "loss": 0.2564, + "step": 32243 + }, + { + "epoch": 0.94, + "grad_norm": 1.3995821948705842, + "learning_rate": 1.0960732299286292e-07, + "loss": 0.2719, + "step": 32244 + }, + { + "epoch": 0.94, + "grad_norm": 1.3777669751225678, + "learning_rate": 1.0950953436379085e-07, + "loss": 0.2805, + "step": 32245 + }, + { + "epoch": 0.94, + "grad_norm": 1.6985557766584207, + "learning_rate": 1.0941178889335102e-07, + "loss": 0.2674, + "step": 32246 + }, + { + "epoch": 0.94, + "grad_norm": 1.2906519385476267, + "learning_rate": 1.0931408658240661e-07, + "loss": 0.2894, + "step": 32247 + }, + { + "epoch": 0.94, + "grad_norm": 1.3920248116649865, + "learning_rate": 1.092164274318197e-07, + "loss": 0.2724, + "step": 32248 + }, + { + "epoch": 0.94, + "grad_norm": 1.3959599368245912, + "learning_rate": 1.0911881144245184e-07, + "loss": 0.2889, + "step": 32249 + }, + { + "epoch": 0.94, + "grad_norm": 1.327443538367016, + "learning_rate": 1.0902123861516511e-07, + "loss": 0.2826, + "step": 32250 + }, + { + "epoch": 0.94, + "grad_norm": 1.346722167348554, + "learning_rate": 1.0892370895081994e-07, + "loss": 0.2546, + "step": 32251 + }, + { + "epoch": 0.94, + "grad_norm": 1.5105855579098297, + "learning_rate": 1.088262224502773e-07, + "loss": 0.2805, + "step": 32252 + }, + { + "epoch": 0.94, + "grad_norm": 1.3411708479608997, + "learning_rate": 1.0872877911439816e-07, + "loss": 0.2552, + "step": 32253 + }, + { + "epoch": 0.94, + "grad_norm": 1.4388022921152703, + "learning_rate": 1.086313789440413e-07, + "loss": 0.2593, + "step": 32254 + }, + { + "epoch": 0.94, + "grad_norm": 1.5794321016171762, + "learning_rate": 1.0853402194006657e-07, + "loss": 0.2766, + "step": 32255 + }, + { + "epoch": 0.94, + "grad_norm": 2.3054625228030057, + "learning_rate": 1.0843670810333329e-07, + "loss": 0.2651, + "step": 32256 + }, + { + "epoch": 0.94, + "grad_norm": 1.3896129167629818, + "learning_rate": 1.0833943743470076e-07, + "loss": 0.2772, + "step": 32257 + }, + { + "epoch": 0.94, + "grad_norm": 1.345915561348825, + "learning_rate": 1.0824220993502665e-07, + "loss": 0.2775, + "step": 32258 + }, + { + "epoch": 0.94, + "grad_norm": 1.274764597369378, + "learning_rate": 1.0814502560516915e-07, + "loss": 0.2545, + "step": 32259 + }, + { + "epoch": 0.94, + "grad_norm": 1.367637358800215, + "learning_rate": 1.0804788444598646e-07, + "loss": 0.261, + "step": 32260 + }, + { + "epoch": 0.94, + "grad_norm": 1.4872277141844614, + "learning_rate": 1.0795078645833567e-07, + "loss": 0.2656, + "step": 32261 + }, + { + "epoch": 0.94, + "grad_norm": 1.2495671020025578, + "learning_rate": 1.0785373164307333e-07, + "loss": 0.2695, + "step": 32262 + }, + { + "epoch": 0.94, + "grad_norm": 1.3505762702360495, + "learning_rate": 1.0775672000105541e-07, + "loss": 0.2772, + "step": 32263 + }, + { + "epoch": 0.94, + "grad_norm": 1.5948737013985301, + "learning_rate": 1.0765975153313957e-07, + "loss": 0.2915, + "step": 32264 + }, + { + "epoch": 0.94, + "grad_norm": 0.9585279480570029, + "learning_rate": 1.0756282624018011e-07, + "loss": 0.5455, + "step": 32265 + }, + { + "epoch": 0.94, + "grad_norm": 1.2586423790729473, + "learning_rate": 1.0746594412303302e-07, + "loss": 0.2748, + "step": 32266 + }, + { + "epoch": 0.94, + "grad_norm": 1.3979933977445071, + "learning_rate": 1.0736910518255317e-07, + "loss": 0.2426, + "step": 32267 + }, + { + "epoch": 0.94, + "grad_norm": 1.3724831244303883, + "learning_rate": 1.0727230941959543e-07, + "loss": 0.28, + "step": 32268 + }, + { + "epoch": 0.94, + "grad_norm": 1.3149972049225815, + "learning_rate": 1.0717555683501413e-07, + "loss": 0.2602, + "step": 32269 + }, + { + "epoch": 0.94, + "grad_norm": 1.3143971994258756, + "learning_rate": 1.0707884742966301e-07, + "loss": 0.2668, + "step": 32270 + }, + { + "epoch": 0.94, + "grad_norm": 0.985527166726045, + "learning_rate": 1.0698218120439475e-07, + "loss": 0.6265, + "step": 32271 + }, + { + "epoch": 0.94, + "grad_norm": 1.4300230482156442, + "learning_rate": 1.0688555816006362e-07, + "loss": 0.2599, + "step": 32272 + }, + { + "epoch": 0.94, + "grad_norm": 1.527605123929649, + "learning_rate": 1.0678897829752121e-07, + "loss": 0.2603, + "step": 32273 + }, + { + "epoch": 0.94, + "grad_norm": 2.9982639338254162, + "learning_rate": 1.0669244161762016e-07, + "loss": 0.2677, + "step": 32274 + }, + { + "epoch": 0.94, + "grad_norm": 1.4321615452092988, + "learning_rate": 1.065959481212131e-07, + "loss": 0.2687, + "step": 32275 + }, + { + "epoch": 0.94, + "grad_norm": 1.434819578595851, + "learning_rate": 1.0649949780915048e-07, + "loss": 0.271, + "step": 32276 + }, + { + "epoch": 0.94, + "grad_norm": 1.2624686900001318, + "learning_rate": 1.0640309068228438e-07, + "loss": 0.293, + "step": 32277 + }, + { + "epoch": 0.94, + "grad_norm": 1.2468735450470174, + "learning_rate": 1.0630672674146524e-07, + "loss": 0.27, + "step": 32278 + }, + { + "epoch": 0.94, + "grad_norm": 3.1263273237412266, + "learning_rate": 1.0621040598754351e-07, + "loss": 0.2823, + "step": 32279 + }, + { + "epoch": 0.94, + "grad_norm": 1.3727360165909877, + "learning_rate": 1.0611412842137014e-07, + "loss": 0.2581, + "step": 32280 + }, + { + "epoch": 0.94, + "grad_norm": 1.3241509192886676, + "learning_rate": 1.0601789404379281e-07, + "loss": 0.2593, + "step": 32281 + }, + { + "epoch": 0.94, + "grad_norm": 1.4569782028914482, + "learning_rate": 1.0592170285566194e-07, + "loss": 0.2621, + "step": 32282 + }, + { + "epoch": 0.94, + "grad_norm": 1.540935399453091, + "learning_rate": 1.0582555485782631e-07, + "loss": 0.266, + "step": 32283 + }, + { + "epoch": 0.94, + "grad_norm": 1.334332833808104, + "learning_rate": 1.0572945005113466e-07, + "loss": 0.265, + "step": 32284 + }, + { + "epoch": 0.94, + "grad_norm": 1.3305125486223788, + "learning_rate": 1.0563338843643523e-07, + "loss": 0.2832, + "step": 32285 + }, + { + "epoch": 0.94, + "grad_norm": 3.215292833925244, + "learning_rate": 1.0553737001457509e-07, + "loss": 0.2757, + "step": 32286 + }, + { + "epoch": 0.94, + "grad_norm": 1.2379440529701866, + "learning_rate": 1.0544139478640192e-07, + "loss": 0.2743, + "step": 32287 + }, + { + "epoch": 0.94, + "grad_norm": 1.3884956440727834, + "learning_rate": 1.053454627527628e-07, + "loss": 0.2724, + "step": 32288 + }, + { + "epoch": 0.94, + "grad_norm": 1.555312701129072, + "learning_rate": 1.052495739145043e-07, + "loss": 0.2511, + "step": 32289 + }, + { + "epoch": 0.94, + "grad_norm": 1.5791125516599653, + "learning_rate": 1.0515372827247295e-07, + "loss": 0.2647, + "step": 32290 + }, + { + "epoch": 0.94, + "grad_norm": 1.3257925617773234, + "learning_rate": 1.0505792582751362e-07, + "loss": 0.2497, + "step": 32291 + }, + { + "epoch": 0.94, + "grad_norm": 1.554859624739661, + "learning_rate": 1.0496216658047287e-07, + "loss": 0.2672, + "step": 32292 + }, + { + "epoch": 0.94, + "grad_norm": 1.5169783321000831, + "learning_rate": 1.0486645053219502e-07, + "loss": 0.2789, + "step": 32293 + }, + { + "epoch": 0.94, + "grad_norm": 1.3596288692699605, + "learning_rate": 1.0477077768352495e-07, + "loss": 0.2551, + "step": 32294 + }, + { + "epoch": 0.94, + "grad_norm": 1.316821974117315, + "learning_rate": 1.0467514803530699e-07, + "loss": 0.2695, + "step": 32295 + }, + { + "epoch": 0.94, + "grad_norm": 1.7891526878465414, + "learning_rate": 1.0457956158838545e-07, + "loss": 0.2915, + "step": 32296 + }, + { + "epoch": 0.94, + "grad_norm": 1.30662964750287, + "learning_rate": 1.0448401834360411e-07, + "loss": 0.2732, + "step": 32297 + }, + { + "epoch": 0.94, + "grad_norm": 1.2681927088189702, + "learning_rate": 1.0438851830180507e-07, + "loss": 0.2568, + "step": 32298 + }, + { + "epoch": 0.94, + "grad_norm": 2.6957598280869246, + "learning_rate": 1.0429306146383155e-07, + "loss": 0.3045, + "step": 32299 + }, + { + "epoch": 0.94, + "grad_norm": 1.2828789291462759, + "learning_rate": 1.0419764783052622e-07, + "loss": 0.2766, + "step": 32300 + }, + { + "epoch": 0.94, + "grad_norm": 1.5044684596603963, + "learning_rate": 1.0410227740273116e-07, + "loss": 0.2655, + "step": 32301 + }, + { + "epoch": 0.94, + "grad_norm": 1.7108890785609483, + "learning_rate": 1.0400695018128737e-07, + "loss": 0.2642, + "step": 32302 + }, + { + "epoch": 0.94, + "grad_norm": 1.4621015799259844, + "learning_rate": 1.0391166616703641e-07, + "loss": 0.2521, + "step": 32303 + }, + { + "epoch": 0.94, + "grad_norm": 1.2682485156873486, + "learning_rate": 1.0381642536081981e-07, + "loss": 0.271, + "step": 32304 + }, + { + "epoch": 0.94, + "grad_norm": 1.3026311983528651, + "learning_rate": 1.0372122776347748e-07, + "loss": 0.2682, + "step": 32305 + }, + { + "epoch": 0.94, + "grad_norm": 1.300212451294475, + "learning_rate": 1.0362607337584929e-07, + "loss": 0.2711, + "step": 32306 + }, + { + "epoch": 0.94, + "grad_norm": 1.3685695180326873, + "learning_rate": 1.0353096219877568e-07, + "loss": 0.2588, + "step": 32307 + }, + { + "epoch": 0.94, + "grad_norm": 1.5582060497935726, + "learning_rate": 1.0343589423309596e-07, + "loss": 0.2744, + "step": 32308 + }, + { + "epoch": 0.94, + "grad_norm": 1.4161773503285144, + "learning_rate": 1.0334086947964838e-07, + "loss": 0.2586, + "step": 32309 + }, + { + "epoch": 0.94, + "grad_norm": 1.4498909441004837, + "learning_rate": 1.0324588793927171e-07, + "loss": 0.2883, + "step": 32310 + }, + { + "epoch": 0.94, + "grad_norm": 1.309835308239954, + "learning_rate": 1.031509496128047e-07, + "loss": 0.256, + "step": 32311 + }, + { + "epoch": 0.94, + "grad_norm": 0.9459357510100694, + "learning_rate": 1.0305605450108503e-07, + "loss": 0.5258, + "step": 32312 + }, + { + "epoch": 0.94, + "grad_norm": 1.4166540394399088, + "learning_rate": 1.0296120260494979e-07, + "loss": 0.257, + "step": 32313 + }, + { + "epoch": 0.94, + "grad_norm": 1.2460765995178955, + "learning_rate": 1.0286639392523612e-07, + "loss": 0.2626, + "step": 32314 + }, + { + "epoch": 0.94, + "grad_norm": 1.4317068725736377, + "learning_rate": 1.027716284627811e-07, + "loss": 0.2559, + "step": 32315 + }, + { + "epoch": 0.94, + "grad_norm": 2.263789558721952, + "learning_rate": 1.0267690621842075e-07, + "loss": 0.269, + "step": 32316 + }, + { + "epoch": 0.94, + "grad_norm": 1.304258241065863, + "learning_rate": 1.0258222719299105e-07, + "loss": 0.2693, + "step": 32317 + }, + { + "epoch": 0.94, + "grad_norm": 1.559390554681999, + "learning_rate": 1.0248759138732745e-07, + "loss": 0.2554, + "step": 32318 + }, + { + "epoch": 0.94, + "grad_norm": 1.3395184842371, + "learning_rate": 1.0239299880226484e-07, + "loss": 0.2816, + "step": 32319 + }, + { + "epoch": 0.94, + "grad_norm": 1.5919151375917824, + "learning_rate": 1.0229844943863865e-07, + "loss": 0.2852, + "step": 32320 + }, + { + "epoch": 0.94, + "grad_norm": 1.255586610889811, + "learning_rate": 1.0220394329728323e-07, + "loss": 0.264, + "step": 32321 + }, + { + "epoch": 0.94, + "grad_norm": 1.3587549146830624, + "learning_rate": 1.0210948037903234e-07, + "loss": 0.2584, + "step": 32322 + }, + { + "epoch": 0.94, + "grad_norm": 1.5630114882515385, + "learning_rate": 1.0201506068471922e-07, + "loss": 0.2932, + "step": 32323 + }, + { + "epoch": 0.94, + "grad_norm": 1.3093489199050956, + "learning_rate": 1.0192068421517765e-07, + "loss": 0.2537, + "step": 32324 + }, + { + "epoch": 0.94, + "grad_norm": 1.322339477608347, + "learning_rate": 1.0182635097124083e-07, + "loss": 0.2661, + "step": 32325 + }, + { + "epoch": 0.94, + "grad_norm": 1.5110972147467192, + "learning_rate": 1.0173206095374032e-07, + "loss": 0.2751, + "step": 32326 + }, + { + "epoch": 0.94, + "grad_norm": 1.2542231612019448, + "learning_rate": 1.0163781416350882e-07, + "loss": 0.2685, + "step": 32327 + }, + { + "epoch": 0.94, + "grad_norm": 1.4993290355034454, + "learning_rate": 1.0154361060137841e-07, + "loss": 0.2594, + "step": 32328 + }, + { + "epoch": 0.94, + "grad_norm": 1.5067996161512314, + "learning_rate": 1.0144945026817954e-07, + "loss": 0.2739, + "step": 32329 + }, + { + "epoch": 0.94, + "grad_norm": 1.3955432147446187, + "learning_rate": 1.0135533316474322e-07, + "loss": 0.2698, + "step": 32330 + }, + { + "epoch": 0.94, + "grad_norm": 1.2805534612868203, + "learning_rate": 1.0126125929190101e-07, + "loss": 0.251, + "step": 32331 + }, + { + "epoch": 0.94, + "grad_norm": 1.3889086474355732, + "learning_rate": 1.0116722865048224e-07, + "loss": 0.2766, + "step": 32332 + }, + { + "epoch": 0.94, + "grad_norm": 1.673157692607636, + "learning_rate": 1.0107324124131678e-07, + "loss": 0.2512, + "step": 32333 + }, + { + "epoch": 0.94, + "grad_norm": 1.2669500545752705, + "learning_rate": 1.0097929706523457e-07, + "loss": 0.2775, + "step": 32334 + }, + { + "epoch": 0.94, + "grad_norm": 1.9561313037900967, + "learning_rate": 1.0088539612306491e-07, + "loss": 0.2631, + "step": 32335 + }, + { + "epoch": 0.94, + "grad_norm": 1.4245107170526445, + "learning_rate": 1.0079153841563549e-07, + "loss": 0.2721, + "step": 32336 + }, + { + "epoch": 0.94, + "grad_norm": 1.3347225331312844, + "learning_rate": 1.0069772394377509e-07, + "loss": 0.2742, + "step": 32337 + }, + { + "epoch": 0.94, + "grad_norm": 1.3321752049921345, + "learning_rate": 1.0060395270831136e-07, + "loss": 0.2737, + "step": 32338 + }, + { + "epoch": 0.94, + "grad_norm": 2.31496164159008, + "learning_rate": 1.0051022471007255e-07, + "loss": 0.2644, + "step": 32339 + }, + { + "epoch": 0.94, + "grad_norm": 1.2832349412480815, + "learning_rate": 1.0041653994988521e-07, + "loss": 0.2485, + "step": 32340 + }, + { + "epoch": 0.94, + "grad_norm": 1.327640653761505, + "learning_rate": 1.0032289842857645e-07, + "loss": 0.2431, + "step": 32341 + }, + { + "epoch": 0.94, + "grad_norm": 1.3279704646070833, + "learning_rate": 1.0022930014697174e-07, + "loss": 0.2578, + "step": 32342 + }, + { + "epoch": 0.94, + "grad_norm": 1.3235116592227107, + "learning_rate": 1.0013574510589874e-07, + "loss": 0.2461, + "step": 32343 + }, + { + "epoch": 0.94, + "grad_norm": 1.338350781938569, + "learning_rate": 1.0004223330618179e-07, + "loss": 0.2462, + "step": 32344 + }, + { + "epoch": 0.94, + "grad_norm": 1.2086477087176781, + "learning_rate": 9.994876474864635e-08, + "loss": 0.252, + "step": 32345 + }, + { + "epoch": 0.94, + "grad_norm": 1.2598121397421926, + "learning_rate": 9.985533943411785e-08, + "loss": 0.2417, + "step": 32346 + }, + { + "epoch": 0.94, + "grad_norm": 1.3536905572629987, + "learning_rate": 9.976195736341953e-08, + "loss": 0.2589, + "step": 32347 + }, + { + "epoch": 0.94, + "grad_norm": 1.4293540497304191, + "learning_rate": 9.966861853737686e-08, + "loss": 0.2488, + "step": 32348 + }, + { + "epoch": 0.94, + "grad_norm": 1.447978508614401, + "learning_rate": 9.957532295681304e-08, + "loss": 0.2696, + "step": 32349 + }, + { + "epoch": 0.94, + "grad_norm": 1.4430461005792965, + "learning_rate": 9.948207062255133e-08, + "loss": 0.2773, + "step": 32350 + }, + { + "epoch": 0.94, + "grad_norm": 1.3615441608135492, + "learning_rate": 9.938886153541439e-08, + "loss": 0.2657, + "step": 32351 + }, + { + "epoch": 0.94, + "grad_norm": 1.2963500968788466, + "learning_rate": 9.929569569622544e-08, + "loss": 0.2544, + "step": 32352 + }, + { + "epoch": 0.94, + "grad_norm": 1.4992013693912156, + "learning_rate": 9.920257310580661e-08, + "loss": 0.2498, + "step": 32353 + }, + { + "epoch": 0.94, + "grad_norm": 1.4246667462926188, + "learning_rate": 9.910949376497891e-08, + "loss": 0.2595, + "step": 32354 + }, + { + "epoch": 0.94, + "grad_norm": 1.3116554448021573, + "learning_rate": 9.901645767456447e-08, + "loss": 0.2649, + "step": 32355 + }, + { + "epoch": 0.94, + "grad_norm": 1.2667893199522229, + "learning_rate": 9.892346483538429e-08, + "loss": 0.2652, + "step": 32356 + }, + { + "epoch": 0.94, + "grad_norm": 0.9290109423107942, + "learning_rate": 9.883051524825881e-08, + "loss": 0.5741, + "step": 32357 + }, + { + "epoch": 0.94, + "grad_norm": 1.6235258072955296, + "learning_rate": 9.873760891400852e-08, + "loss": 0.3002, + "step": 32358 + }, + { + "epoch": 0.94, + "grad_norm": 1.282288104685244, + "learning_rate": 9.864474583345274e-08, + "loss": 0.2476, + "step": 32359 + }, + { + "epoch": 0.94, + "grad_norm": 1.2737950485177287, + "learning_rate": 9.855192600741193e-08, + "loss": 0.2529, + "step": 32360 + }, + { + "epoch": 0.94, + "grad_norm": 1.2790262853582586, + "learning_rate": 9.845914943670432e-08, + "loss": 0.3133, + "step": 32361 + }, + { + "epoch": 0.94, + "grad_norm": 1.3803998583291772, + "learning_rate": 9.836641612214925e-08, + "loss": 0.2556, + "step": 32362 + }, + { + "epoch": 0.94, + "grad_norm": 1.0089115925448895, + "learning_rate": 9.827372606456553e-08, + "loss": 0.6094, + "step": 32363 + }, + { + "epoch": 0.94, + "grad_norm": 2.2632203901969294, + "learning_rate": 9.818107926476972e-08, + "loss": 0.3421, + "step": 32364 + }, + { + "epoch": 0.94, + "grad_norm": 1.4117377574100922, + "learning_rate": 9.80884757235806e-08, + "loss": 0.2759, + "step": 32365 + }, + { + "epoch": 0.94, + "grad_norm": 1.4735165232434677, + "learning_rate": 9.799591544181475e-08, + "loss": 0.2856, + "step": 32366 + }, + { + "epoch": 0.94, + "grad_norm": 1.4364227832545913, + "learning_rate": 9.790339842028984e-08, + "loss": 0.2651, + "step": 32367 + }, + { + "epoch": 0.94, + "grad_norm": 1.3403749423939053, + "learning_rate": 9.781092465982133e-08, + "loss": 0.2744, + "step": 32368 + }, + { + "epoch": 0.94, + "grad_norm": 1.0015532654089632, + "learning_rate": 9.771849416122636e-08, + "loss": 0.5538, + "step": 32369 + }, + { + "epoch": 0.94, + "grad_norm": 1.6492284463236053, + "learning_rate": 9.762610692531927e-08, + "loss": 0.2701, + "step": 32370 + }, + { + "epoch": 0.94, + "grad_norm": 1.3564193462869039, + "learning_rate": 9.753376295291716e-08, + "loss": 0.2679, + "step": 32371 + }, + { + "epoch": 0.94, + "grad_norm": 1.31495628593542, + "learning_rate": 9.744146224483331e-08, + "loss": 0.2665, + "step": 32372 + }, + { + "epoch": 0.94, + "grad_norm": 5.304873956700528, + "learning_rate": 9.734920480188315e-08, + "loss": 0.2527, + "step": 32373 + }, + { + "epoch": 0.94, + "grad_norm": 1.573182592192894, + "learning_rate": 9.725699062488048e-08, + "loss": 0.2934, + "step": 32374 + }, + { + "epoch": 0.94, + "grad_norm": 1.3617648284568942, + "learning_rate": 9.716481971463965e-08, + "loss": 0.26, + "step": 32375 + }, + { + "epoch": 0.94, + "grad_norm": 1.4642865437504788, + "learning_rate": 9.707269207197334e-08, + "loss": 0.2809, + "step": 32376 + }, + { + "epoch": 0.94, + "grad_norm": 1.3369831796393832, + "learning_rate": 9.69806076976948e-08, + "loss": 0.2543, + "step": 32377 + }, + { + "epoch": 0.94, + "grad_norm": 1.2890686016599866, + "learning_rate": 9.688856659261669e-08, + "loss": 0.2855, + "step": 32378 + }, + { + "epoch": 0.94, + "grad_norm": 1.6630779303001109, + "learning_rate": 9.679656875755172e-08, + "loss": 0.2709, + "step": 32379 + }, + { + "epoch": 0.94, + "grad_norm": 1.2769525989243253, + "learning_rate": 9.670461419331201e-08, + "loss": 0.2558, + "step": 32380 + }, + { + "epoch": 0.94, + "grad_norm": 1.2581043733811594, + "learning_rate": 9.661270290070745e-08, + "loss": 0.2695, + "step": 32381 + }, + { + "epoch": 0.94, + "grad_norm": 0.9827261565993253, + "learning_rate": 9.652083488055076e-08, + "loss": 0.5892, + "step": 32382 + }, + { + "epoch": 0.94, + "grad_norm": 1.8671029043610785, + "learning_rate": 9.64290101336518e-08, + "loss": 0.3294, + "step": 32383 + }, + { + "epoch": 0.94, + "grad_norm": 1.282787348958051, + "learning_rate": 9.633722866082163e-08, + "loss": 0.2647, + "step": 32384 + }, + { + "epoch": 0.94, + "grad_norm": 1.3942335562274937, + "learning_rate": 9.624549046286957e-08, + "loss": 0.2731, + "step": 32385 + }, + { + "epoch": 0.94, + "grad_norm": 1.3089332516468462, + "learning_rate": 9.6153795540605e-08, + "loss": 0.2755, + "step": 32386 + }, + { + "epoch": 0.94, + "grad_norm": 1.691155354774123, + "learning_rate": 9.606214389483837e-08, + "loss": 0.2502, + "step": 32387 + }, + { + "epoch": 0.94, + "grad_norm": 1.3114470994501595, + "learning_rate": 9.59705355263768e-08, + "loss": 0.2904, + "step": 32388 + }, + { + "epoch": 0.94, + "grad_norm": 1.4026808966335498, + "learning_rate": 9.587897043603023e-08, + "loss": 0.2729, + "step": 32389 + }, + { + "epoch": 0.94, + "grad_norm": 1.8224929682720576, + "learning_rate": 9.578744862460687e-08, + "loss": 0.2845, + "step": 32390 + }, + { + "epoch": 0.94, + "grad_norm": 1.5769256609069207, + "learning_rate": 9.569597009291276e-08, + "loss": 0.2922, + "step": 32391 + }, + { + "epoch": 0.94, + "grad_norm": 2.1036053377651642, + "learning_rate": 9.560453484175614e-08, + "loss": 0.2538, + "step": 32392 + }, + { + "epoch": 0.94, + "grad_norm": 1.4035486118271074, + "learning_rate": 9.551314287194414e-08, + "loss": 0.2834, + "step": 32393 + }, + { + "epoch": 0.94, + "grad_norm": 1.4632394860998725, + "learning_rate": 9.542179418428276e-08, + "loss": 0.2726, + "step": 32394 + }, + { + "epoch": 0.94, + "grad_norm": 1.5481593044602924, + "learning_rate": 9.533048877957862e-08, + "loss": 0.2728, + "step": 32395 + }, + { + "epoch": 0.94, + "grad_norm": 1.2600087783449307, + "learning_rate": 9.523922665863716e-08, + "loss": 0.2762, + "step": 32396 + }, + { + "epoch": 0.94, + "grad_norm": 1.4459596249126865, + "learning_rate": 9.514800782226385e-08, + "loss": 0.2763, + "step": 32397 + }, + { + "epoch": 0.94, + "grad_norm": 1.3239915706863645, + "learning_rate": 9.505683227126417e-08, + "loss": 0.2746, + "step": 32398 + }, + { + "epoch": 0.94, + "grad_norm": 1.5788234579228344, + "learning_rate": 9.496570000644245e-08, + "loss": 0.2688, + "step": 32399 + }, + { + "epoch": 0.94, + "grad_norm": 1.3020183473615754, + "learning_rate": 9.487461102860307e-08, + "loss": 0.2551, + "step": 32400 + }, + { + "epoch": 0.94, + "grad_norm": 1.923064412171141, + "learning_rate": 9.478356533854871e-08, + "loss": 0.2558, + "step": 32401 + }, + { + "epoch": 0.94, + "grad_norm": 1.3116092372768435, + "learning_rate": 9.469256293708429e-08, + "loss": 0.2918, + "step": 32402 + }, + { + "epoch": 0.94, + "grad_norm": 2.290955817554981, + "learning_rate": 9.460160382501249e-08, + "loss": 0.2645, + "step": 32403 + }, + { + "epoch": 0.94, + "grad_norm": 1.0264918604775135, + "learning_rate": 9.4510688003136e-08, + "loss": 0.552, + "step": 32404 + }, + { + "epoch": 0.94, + "grad_norm": 1.5697452367181628, + "learning_rate": 9.441981547225698e-08, + "loss": 0.2656, + "step": 32405 + }, + { + "epoch": 0.94, + "grad_norm": 1.4612296305477188, + "learning_rate": 9.432898623317754e-08, + "loss": 0.2588, + "step": 32406 + }, + { + "epoch": 0.94, + "grad_norm": 1.2487942418136742, + "learning_rate": 9.423820028669983e-08, + "loss": 0.3015, + "step": 32407 + }, + { + "epoch": 0.94, + "grad_norm": 1.393298163843187, + "learning_rate": 9.414745763362376e-08, + "loss": 0.2773, + "step": 32408 + }, + { + "epoch": 0.94, + "grad_norm": 1.413524113970056, + "learning_rate": 9.405675827475036e-08, + "loss": 0.2825, + "step": 32409 + }, + { + "epoch": 0.94, + "grad_norm": 1.4473666378168772, + "learning_rate": 9.396610221088121e-08, + "loss": 0.2712, + "step": 32410 + }, + { + "epoch": 0.94, + "grad_norm": 3.9901022627498954, + "learning_rate": 9.38754894428151e-08, + "loss": 0.2857, + "step": 32411 + }, + { + "epoch": 0.94, + "grad_norm": 1.4373057740017594, + "learning_rate": 9.378491997135308e-08, + "loss": 0.2416, + "step": 32412 + }, + { + "epoch": 0.94, + "grad_norm": 1.5131045327427055, + "learning_rate": 9.369439379729283e-08, + "loss": 0.2905, + "step": 32413 + }, + { + "epoch": 0.94, + "grad_norm": 1.4011221757586272, + "learning_rate": 9.360391092143428e-08, + "loss": 0.2565, + "step": 32414 + }, + { + "epoch": 0.94, + "grad_norm": 1.2832676687218278, + "learning_rate": 9.351347134457566e-08, + "loss": 0.2591, + "step": 32415 + }, + { + "epoch": 0.94, + "grad_norm": 1.2183419258430517, + "learning_rate": 9.342307506751469e-08, + "loss": 0.2536, + "step": 32416 + }, + { + "epoch": 0.94, + "grad_norm": 0.9381224667890772, + "learning_rate": 9.33327220910496e-08, + "loss": 0.5605, + "step": 32417 + }, + { + "epoch": 0.94, + "grad_norm": 1.347842675300449, + "learning_rate": 9.324241241597864e-08, + "loss": 0.2786, + "step": 32418 + }, + { + "epoch": 0.94, + "grad_norm": 1.505589524591654, + "learning_rate": 9.315214604309674e-08, + "loss": 0.2664, + "step": 32419 + }, + { + "epoch": 0.94, + "grad_norm": 0.9753505091764346, + "learning_rate": 9.306192297320216e-08, + "loss": 0.619, + "step": 32420 + }, + { + "epoch": 0.94, + "grad_norm": 4.832167996789444, + "learning_rate": 9.297174320708979e-08, + "loss": 0.2746, + "step": 32421 + }, + { + "epoch": 0.94, + "grad_norm": 1.3486232552354558, + "learning_rate": 9.28816067455568e-08, + "loss": 0.2973, + "step": 32422 + }, + { + "epoch": 0.94, + "grad_norm": 1.5277296083071137, + "learning_rate": 9.279151358939809e-08, + "loss": 0.3011, + "step": 32423 + }, + { + "epoch": 0.94, + "grad_norm": 1.2704986357230263, + "learning_rate": 9.270146373940859e-08, + "loss": 0.2461, + "step": 32424 + }, + { + "epoch": 0.94, + "grad_norm": 1.2829923527971563, + "learning_rate": 9.261145719638265e-08, + "loss": 0.2667, + "step": 32425 + }, + { + "epoch": 0.94, + "grad_norm": 1.2834078796440682, + "learning_rate": 9.252149396111632e-08, + "loss": 0.2629, + "step": 32426 + }, + { + "epoch": 0.94, + "grad_norm": 1.6622477644519207, + "learning_rate": 9.243157403440118e-08, + "loss": 0.2649, + "step": 32427 + }, + { + "epoch": 0.94, + "grad_norm": 1.5101918137005284, + "learning_rate": 9.234169741703214e-08, + "loss": 0.2906, + "step": 32428 + }, + { + "epoch": 0.94, + "grad_norm": 1.2804169770749179, + "learning_rate": 9.225186410980192e-08, + "loss": 0.2721, + "step": 32429 + }, + { + "epoch": 0.94, + "grad_norm": 1.3150245935572946, + "learning_rate": 9.21620741135032e-08, + "loss": 0.2837, + "step": 32430 + }, + { + "epoch": 0.94, + "grad_norm": 2.1282419473945264, + "learning_rate": 9.207232742892924e-08, + "loss": 0.2605, + "step": 32431 + }, + { + "epoch": 0.94, + "grad_norm": 1.569262485119454, + "learning_rate": 9.198262405687109e-08, + "loss": 0.271, + "step": 32432 + }, + { + "epoch": 0.94, + "grad_norm": 3.1787256051962745, + "learning_rate": 9.189296399812086e-08, + "loss": 0.2614, + "step": 32433 + }, + { + "epoch": 0.94, + "grad_norm": 1.3683443433404092, + "learning_rate": 9.180334725346963e-08, + "loss": 0.2819, + "step": 32434 + }, + { + "epoch": 0.94, + "grad_norm": 1.2709268673476568, + "learning_rate": 9.171377382370838e-08, + "loss": 0.2684, + "step": 32435 + }, + { + "epoch": 0.94, + "grad_norm": 1.304865789363133, + "learning_rate": 9.162424370962764e-08, + "loss": 0.2551, + "step": 32436 + }, + { + "epoch": 0.94, + "grad_norm": 1.8352278990345579, + "learning_rate": 9.153475691201674e-08, + "loss": 0.2474, + "step": 32437 + }, + { + "epoch": 0.94, + "grad_norm": 1.4250171918874752, + "learning_rate": 9.144531343166674e-08, + "loss": 0.2621, + "step": 32438 + }, + { + "epoch": 0.94, + "grad_norm": 1.5331695859854162, + "learning_rate": 9.135591326936588e-08, + "loss": 0.2882, + "step": 32439 + }, + { + "epoch": 0.94, + "grad_norm": 1.3645107953042173, + "learning_rate": 9.12665564259041e-08, + "loss": 0.2832, + "step": 32440 + }, + { + "epoch": 0.94, + "grad_norm": 1.3376896679081929, + "learning_rate": 9.117724290206909e-08, + "loss": 0.265, + "step": 32441 + }, + { + "epoch": 0.94, + "grad_norm": 1.6021482857553284, + "learning_rate": 9.108797269864911e-08, + "loss": 0.289, + "step": 32442 + }, + { + "epoch": 0.94, + "grad_norm": 1.3730159520227772, + "learning_rate": 9.099874581643242e-08, + "loss": 0.2509, + "step": 32443 + }, + { + "epoch": 0.94, + "grad_norm": 1.507164096420926, + "learning_rate": 9.090956225620617e-08, + "loss": 0.2579, + "step": 32444 + }, + { + "epoch": 0.94, + "grad_norm": 1.8933559729357405, + "learning_rate": 9.082042201875807e-08, + "loss": 0.2779, + "step": 32445 + }, + { + "epoch": 0.94, + "grad_norm": 1.2932328937326383, + "learning_rate": 9.073132510487359e-08, + "loss": 0.2748, + "step": 32446 + }, + { + "epoch": 0.94, + "grad_norm": 1.5808764047678108, + "learning_rate": 9.064227151533989e-08, + "loss": 0.2626, + "step": 32447 + }, + { + "epoch": 0.94, + "grad_norm": 1.3678047428278086, + "learning_rate": 9.055326125094244e-08, + "loss": 0.265, + "step": 32448 + }, + { + "epoch": 0.94, + "grad_norm": 1.4227343337956266, + "learning_rate": 9.046429431246673e-08, + "loss": 0.3444, + "step": 32449 + }, + { + "epoch": 0.94, + "grad_norm": 2.7573102201480872, + "learning_rate": 9.037537070069824e-08, + "loss": 0.266, + "step": 32450 + }, + { + "epoch": 0.94, + "grad_norm": 1.3404608711269699, + "learning_rate": 9.028649041642191e-08, + "loss": 0.2694, + "step": 32451 + }, + { + "epoch": 0.94, + "grad_norm": 1.4412344739107297, + "learning_rate": 9.019765346042152e-08, + "loss": 0.2684, + "step": 32452 + }, + { + "epoch": 0.94, + "grad_norm": 1.4885020960478086, + "learning_rate": 9.010885983348094e-08, + "loss": 0.2621, + "step": 32453 + }, + { + "epoch": 0.94, + "grad_norm": 1.6201319567342798, + "learning_rate": 9.002010953638508e-08, + "loss": 0.2587, + "step": 32454 + }, + { + "epoch": 0.94, + "grad_norm": 1.3131171135418747, + "learning_rate": 8.99314025699155e-08, + "loss": 0.2642, + "step": 32455 + }, + { + "epoch": 0.94, + "grad_norm": 1.4830264088672818, + "learning_rate": 8.984273893485607e-08, + "loss": 0.2493, + "step": 32456 + }, + { + "epoch": 0.94, + "grad_norm": 1.3689801550924614, + "learning_rate": 8.975411863198836e-08, + "loss": 0.2781, + "step": 32457 + }, + { + "epoch": 0.94, + "grad_norm": 1.2705745776978963, + "learning_rate": 8.966554166209562e-08, + "loss": 0.247, + "step": 32458 + }, + { + "epoch": 0.94, + "grad_norm": 1.4380704158343272, + "learning_rate": 8.957700802595892e-08, + "loss": 0.2777, + "step": 32459 + }, + { + "epoch": 0.94, + "grad_norm": 1.3065368411004321, + "learning_rate": 8.948851772435929e-08, + "loss": 0.2773, + "step": 32460 + }, + { + "epoch": 0.94, + "grad_norm": 1.3076802794819444, + "learning_rate": 8.940007075807777e-08, + "loss": 0.2554, + "step": 32461 + }, + { + "epoch": 0.94, + "grad_norm": 1.4728934231624704, + "learning_rate": 8.931166712789596e-08, + "loss": 0.2622, + "step": 32462 + }, + { + "epoch": 0.94, + "grad_norm": 1.2712435755969027, + "learning_rate": 8.922330683459268e-08, + "loss": 0.2679, + "step": 32463 + }, + { + "epoch": 0.94, + "grad_norm": 1.3038007726799923, + "learning_rate": 8.913498987894786e-08, + "loss": 0.269, + "step": 32464 + }, + { + "epoch": 0.94, + "grad_norm": 1.424589239070061, + "learning_rate": 8.904671626174144e-08, + "loss": 0.2903, + "step": 32465 + }, + { + "epoch": 0.94, + "grad_norm": 1.3460444507388536, + "learning_rate": 8.895848598375222e-08, + "loss": 0.2889, + "step": 32466 + }, + { + "epoch": 0.94, + "grad_norm": 1.4679968875909453, + "learning_rate": 8.887029904575906e-08, + "loss": 0.285, + "step": 32467 + }, + { + "epoch": 0.94, + "grad_norm": 1.3640342478442102, + "learning_rate": 8.878215544854018e-08, + "loss": 0.2871, + "step": 32468 + }, + { + "epoch": 0.94, + "grad_norm": 1.7815394237686684, + "learning_rate": 8.869405519287333e-08, + "loss": 0.2592, + "step": 32469 + }, + { + "epoch": 0.94, + "grad_norm": 1.3058917827197742, + "learning_rate": 8.860599827953509e-08, + "loss": 0.2558, + "step": 32470 + }, + { + "epoch": 0.94, + "grad_norm": 1.4682638536107169, + "learning_rate": 8.851798470930373e-08, + "loss": 0.2651, + "step": 32471 + }, + { + "epoch": 0.94, + "grad_norm": 1.369104037276674, + "learning_rate": 8.843001448295585e-08, + "loss": 0.2688, + "step": 32472 + }, + { + "epoch": 0.94, + "grad_norm": 1.3470226541543437, + "learning_rate": 8.834208760126805e-08, + "loss": 0.2634, + "step": 32473 + }, + { + "epoch": 0.94, + "grad_norm": 1.422577440171511, + "learning_rate": 8.825420406501529e-08, + "loss": 0.2791, + "step": 32474 + }, + { + "epoch": 0.94, + "grad_norm": 1.4008319009135084, + "learning_rate": 8.816636387497358e-08, + "loss": 0.253, + "step": 32475 + }, + { + "epoch": 0.94, + "grad_norm": 1.39370654472164, + "learning_rate": 8.807856703191786e-08, + "loss": 0.275, + "step": 32476 + }, + { + "epoch": 0.94, + "grad_norm": 2.5569670923235392, + "learning_rate": 8.799081353662365e-08, + "loss": 0.252, + "step": 32477 + }, + { + "epoch": 0.94, + "grad_norm": 1.3804033049117568, + "learning_rate": 8.790310338986529e-08, + "loss": 0.2793, + "step": 32478 + }, + { + "epoch": 0.94, + "grad_norm": 1.2398228810105143, + "learning_rate": 8.781543659241609e-08, + "loss": 0.2526, + "step": 32479 + }, + { + "epoch": 0.94, + "grad_norm": 1.5058485697514825, + "learning_rate": 8.772781314505041e-08, + "loss": 0.2697, + "step": 32480 + }, + { + "epoch": 0.94, + "grad_norm": 1.473426383026551, + "learning_rate": 8.764023304854152e-08, + "loss": 0.266, + "step": 32481 + }, + { + "epoch": 0.94, + "grad_norm": 1.3011748739841102, + "learning_rate": 8.755269630366159e-08, + "loss": 0.2833, + "step": 32482 + }, + { + "epoch": 0.94, + "grad_norm": 1.4778523968431296, + "learning_rate": 8.746520291118388e-08, + "loss": 0.2567, + "step": 32483 + }, + { + "epoch": 0.94, + "grad_norm": 1.3617910061384073, + "learning_rate": 8.737775287188e-08, + "loss": 0.2585, + "step": 32484 + }, + { + "epoch": 0.94, + "grad_norm": 1.3681330389089081, + "learning_rate": 8.72903461865221e-08, + "loss": 0.2711, + "step": 32485 + }, + { + "epoch": 0.94, + "grad_norm": 1.2792628377782551, + "learning_rate": 8.720298285588124e-08, + "loss": 0.2787, + "step": 32486 + }, + { + "epoch": 0.94, + "grad_norm": 1.2991628771901305, + "learning_rate": 8.711566288072848e-08, + "loss": 0.2468, + "step": 32487 + }, + { + "epoch": 0.94, + "grad_norm": 1.405695284736921, + "learning_rate": 8.702838626183485e-08, + "loss": 0.2879, + "step": 32488 + }, + { + "epoch": 0.94, + "grad_norm": 1.3790430286639033, + "learning_rate": 8.69411529999703e-08, + "loss": 0.2568, + "step": 32489 + }, + { + "epoch": 0.94, + "grad_norm": 1.321705033974215, + "learning_rate": 8.685396309590477e-08, + "loss": 0.2628, + "step": 32490 + }, + { + "epoch": 0.94, + "grad_norm": 1.3566528339891422, + "learning_rate": 8.676681655040653e-08, + "loss": 0.2852, + "step": 32491 + }, + { + "epoch": 0.94, + "grad_norm": 1.5216121677604972, + "learning_rate": 8.667971336424664e-08, + "loss": 0.2691, + "step": 32492 + }, + { + "epoch": 0.94, + "grad_norm": 1.451652041880686, + "learning_rate": 8.659265353819224e-08, + "loss": 0.2844, + "step": 32493 + }, + { + "epoch": 0.94, + "grad_norm": 1.263538701037858, + "learning_rate": 8.650563707301219e-08, + "loss": 0.2622, + "step": 32494 + }, + { + "epoch": 0.94, + "grad_norm": 1.3664260016943752, + "learning_rate": 8.641866396947418e-08, + "loss": 0.2788, + "step": 32495 + }, + { + "epoch": 0.94, + "grad_norm": 1.590395213577803, + "learning_rate": 8.633173422834707e-08, + "loss": 0.3004, + "step": 32496 + }, + { + "epoch": 0.94, + "grad_norm": 2.0039178261843067, + "learning_rate": 8.624484785039578e-08, + "loss": 0.2779, + "step": 32497 + }, + { + "epoch": 0.94, + "grad_norm": 1.4190848036692287, + "learning_rate": 8.615800483638859e-08, + "loss": 0.2702, + "step": 32498 + }, + { + "epoch": 0.94, + "grad_norm": 1.6212224973473195, + "learning_rate": 8.607120518709156e-08, + "loss": 0.2682, + "step": 32499 + }, + { + "epoch": 0.94, + "grad_norm": 1.2788656810019179, + "learning_rate": 8.598444890327073e-08, + "loss": 0.2714, + "step": 32500 + }, + { + "epoch": 0.94, + "grad_norm": 1.2941380008182426, + "learning_rate": 8.589773598569162e-08, + "loss": 0.2602, + "step": 32501 + }, + { + "epoch": 0.94, + "grad_norm": 1.2729304760715967, + "learning_rate": 8.581106643511916e-08, + "loss": 0.2393, + "step": 32502 + }, + { + "epoch": 0.94, + "grad_norm": 1.3423789085502134, + "learning_rate": 8.572444025231885e-08, + "loss": 0.2605, + "step": 32503 + }, + { + "epoch": 0.94, + "grad_norm": 1.3504084938865537, + "learning_rate": 8.563785743805508e-08, + "loss": 0.3174, + "step": 32504 + }, + { + "epoch": 0.94, + "grad_norm": 1.2860248538443158, + "learning_rate": 8.555131799309113e-08, + "loss": 0.2732, + "step": 32505 + }, + { + "epoch": 0.94, + "grad_norm": 1.5643210043197762, + "learning_rate": 8.546482191819194e-08, + "loss": 0.2823, + "step": 32506 + }, + { + "epoch": 0.94, + "grad_norm": 1.4383243446261849, + "learning_rate": 8.537836921411969e-08, + "loss": 0.2671, + "step": 32507 + }, + { + "epoch": 0.94, + "grad_norm": 1.411404136623222, + "learning_rate": 8.529195988163819e-08, + "loss": 0.2865, + "step": 32508 + }, + { + "epoch": 0.94, + "grad_norm": 1.4487243901390212, + "learning_rate": 8.520559392150962e-08, + "loss": 0.3117, + "step": 32509 + }, + { + "epoch": 0.94, + "grad_norm": 1.402281298839661, + "learning_rate": 8.511927133449616e-08, + "loss": 0.25, + "step": 32510 + }, + { + "epoch": 0.94, + "grad_norm": 1.3525468569500154, + "learning_rate": 8.503299212135996e-08, + "loss": 0.2752, + "step": 32511 + }, + { + "epoch": 0.94, + "grad_norm": 1.7259880681497792, + "learning_rate": 8.494675628286153e-08, + "loss": 0.2948, + "step": 32512 + }, + { + "epoch": 0.94, + "grad_norm": 1.4854445713987823, + "learning_rate": 8.486056381976304e-08, + "loss": 0.2989, + "step": 32513 + }, + { + "epoch": 0.94, + "grad_norm": 1.3390888623353452, + "learning_rate": 8.477441473282389e-08, + "loss": 0.2677, + "step": 32514 + }, + { + "epoch": 0.94, + "grad_norm": 5.633309918182481, + "learning_rate": 8.468830902280567e-08, + "loss": 0.256, + "step": 32515 + }, + { + "epoch": 0.94, + "grad_norm": 1.940133964126603, + "learning_rate": 8.46022466904678e-08, + "loss": 0.2485, + "step": 32516 + }, + { + "epoch": 0.94, + "grad_norm": 1.3546523201918006, + "learning_rate": 8.451622773656965e-08, + "loss": 0.2583, + "step": 32517 + }, + { + "epoch": 0.94, + "grad_norm": 1.3772312756638403, + "learning_rate": 8.44302521618695e-08, + "loss": 0.2482, + "step": 32518 + }, + { + "epoch": 0.94, + "grad_norm": 1.2310062216451523, + "learning_rate": 8.434431996712733e-08, + "loss": 0.2499, + "step": 32519 + }, + { + "epoch": 0.94, + "grad_norm": 0.9806617944550033, + "learning_rate": 8.425843115310139e-08, + "loss": 0.5647, + "step": 32520 + }, + { + "epoch": 0.94, + "grad_norm": 2.191423314847545, + "learning_rate": 8.417258572054888e-08, + "loss": 0.2891, + "step": 32521 + }, + { + "epoch": 0.94, + "grad_norm": 1.2667871302838936, + "learning_rate": 8.408678367022804e-08, + "loss": 0.2777, + "step": 32522 + }, + { + "epoch": 0.94, + "grad_norm": 1.332742115622648, + "learning_rate": 8.400102500289608e-08, + "loss": 0.266, + "step": 32523 + }, + { + "epoch": 0.94, + "grad_norm": 1.3686183226757915, + "learning_rate": 8.391530971930961e-08, + "loss": 0.2741, + "step": 32524 + }, + { + "epoch": 0.94, + "grad_norm": 1.6356253896544337, + "learning_rate": 8.382963782022468e-08, + "loss": 0.2725, + "step": 32525 + }, + { + "epoch": 0.94, + "grad_norm": 1.972755754759178, + "learning_rate": 8.374400930639792e-08, + "loss": 0.2553, + "step": 32526 + }, + { + "epoch": 0.94, + "grad_norm": 0.9455651563229877, + "learning_rate": 8.365842417858538e-08, + "loss": 0.5616, + "step": 32527 + }, + { + "epoch": 0.94, + "grad_norm": 1.356932350876237, + "learning_rate": 8.357288243754092e-08, + "loss": 0.2784, + "step": 32528 + }, + { + "epoch": 0.94, + "grad_norm": 1.3456444515790495, + "learning_rate": 8.348738408402057e-08, + "loss": 0.2793, + "step": 32529 + }, + { + "epoch": 0.94, + "grad_norm": 1.2104391744819896, + "learning_rate": 8.340192911877876e-08, + "loss": 0.2702, + "step": 32530 + }, + { + "epoch": 0.94, + "grad_norm": 1.764499591410516, + "learning_rate": 8.331651754256931e-08, + "loss": 0.285, + "step": 32531 + }, + { + "epoch": 0.94, + "grad_norm": 1.5589791334953829, + "learning_rate": 8.323114935614606e-08, + "loss": 0.2903, + "step": 32532 + }, + { + "epoch": 0.94, + "grad_norm": 1.2462531941084982, + "learning_rate": 8.314582456026233e-08, + "loss": 0.2641, + "step": 32533 + }, + { + "epoch": 0.94, + "grad_norm": 1.4086347233635093, + "learning_rate": 8.306054315567136e-08, + "loss": 0.252, + "step": 32534 + }, + { + "epoch": 0.94, + "grad_norm": 1.3689187748597147, + "learning_rate": 8.29753051431259e-08, + "loss": 0.2716, + "step": 32535 + }, + { + "epoch": 0.94, + "grad_norm": 1.3109282132698796, + "learning_rate": 8.289011052337814e-08, + "loss": 0.262, + "step": 32536 + }, + { + "epoch": 0.94, + "grad_norm": 1.300336594106533, + "learning_rate": 8.280495929717913e-08, + "loss": 0.2563, + "step": 32537 + }, + { + "epoch": 0.94, + "grad_norm": 0.9599081864078237, + "learning_rate": 8.271985146528105e-08, + "loss": 0.594, + "step": 32538 + }, + { + "epoch": 0.94, + "grad_norm": 1.4229986308606069, + "learning_rate": 8.263478702843497e-08, + "loss": 0.2843, + "step": 32539 + }, + { + "epoch": 0.94, + "grad_norm": 1.3144884102707155, + "learning_rate": 8.254976598739139e-08, + "loss": 0.2621, + "step": 32540 + }, + { + "epoch": 0.94, + "grad_norm": 1.4205837315965513, + "learning_rate": 8.246478834290083e-08, + "loss": 0.2626, + "step": 32541 + }, + { + "epoch": 0.94, + "grad_norm": 1.6384408631667193, + "learning_rate": 8.23798540957127e-08, + "loss": 0.2809, + "step": 32542 + }, + { + "epoch": 0.94, + "grad_norm": 1.3963556198498328, + "learning_rate": 8.229496324657694e-08, + "loss": 0.2848, + "step": 32543 + }, + { + "epoch": 0.94, + "grad_norm": 1.2083277700189983, + "learning_rate": 8.221011579624295e-08, + "loss": 0.2494, + "step": 32544 + }, + { + "epoch": 0.94, + "grad_norm": 1.507101118871604, + "learning_rate": 8.212531174545957e-08, + "loss": 0.2647, + "step": 32545 + }, + { + "epoch": 0.94, + "grad_norm": 1.3149381149983401, + "learning_rate": 8.204055109497455e-08, + "loss": 0.269, + "step": 32546 + }, + { + "epoch": 0.94, + "grad_norm": 1.513927448349492, + "learning_rate": 8.195583384553563e-08, + "loss": 0.2579, + "step": 32547 + }, + { + "epoch": 0.94, + "grad_norm": 1.4122603626284402, + "learning_rate": 8.187115999789163e-08, + "loss": 0.2778, + "step": 32548 + }, + { + "epoch": 0.94, + "grad_norm": 2.0349267594146996, + "learning_rate": 8.17865295527892e-08, + "loss": 0.2562, + "step": 32549 + }, + { + "epoch": 0.94, + "grad_norm": 1.33220947493332, + "learning_rate": 8.170194251097497e-08, + "loss": 0.2766, + "step": 32550 + }, + { + "epoch": 0.94, + "grad_norm": 1.4373894317685059, + "learning_rate": 8.161739887319553e-08, + "loss": 0.2516, + "step": 32551 + }, + { + "epoch": 0.94, + "grad_norm": 1.4457024312978397, + "learning_rate": 8.153289864019808e-08, + "loss": 0.2621, + "step": 32552 + }, + { + "epoch": 0.94, + "grad_norm": 1.336850913100536, + "learning_rate": 8.144844181272649e-08, + "loss": 0.2693, + "step": 32553 + }, + { + "epoch": 0.94, + "grad_norm": 1.3446194305796293, + "learning_rate": 8.13640283915268e-08, + "loss": 0.2749, + "step": 32554 + }, + { + "epoch": 0.94, + "grad_norm": 1.3019107605478781, + "learning_rate": 8.127965837734508e-08, + "loss": 0.2683, + "step": 32555 + }, + { + "epoch": 0.94, + "grad_norm": 1.0134026394298923, + "learning_rate": 8.119533177092409e-08, + "loss": 0.6207, + "step": 32556 + }, + { + "epoch": 0.94, + "grad_norm": 1.333463523524382, + "learning_rate": 8.111104857300934e-08, + "loss": 0.2581, + "step": 32557 + }, + { + "epoch": 0.94, + "grad_norm": 1.3066095323681908, + "learning_rate": 8.102680878434354e-08, + "loss": 0.261, + "step": 32558 + }, + { + "epoch": 0.94, + "grad_norm": 1.319730493065619, + "learning_rate": 8.094261240567114e-08, + "loss": 0.2601, + "step": 32559 + }, + { + "epoch": 0.94, + "grad_norm": 1.3129134722473979, + "learning_rate": 8.08584594377343e-08, + "loss": 0.2644, + "step": 32560 + }, + { + "epoch": 0.94, + "grad_norm": 1.2976902925380887, + "learning_rate": 8.077434988127686e-08, + "loss": 0.2798, + "step": 32561 + }, + { + "epoch": 0.94, + "grad_norm": 1.375670918305107, + "learning_rate": 8.06902837370399e-08, + "loss": 0.2705, + "step": 32562 + }, + { + "epoch": 0.94, + "grad_norm": 1.5167600343491034, + "learning_rate": 8.060626100576618e-08, + "loss": 0.2748, + "step": 32563 + }, + { + "epoch": 0.94, + "grad_norm": 1.2492178547928041, + "learning_rate": 8.05222816881962e-08, + "loss": 0.2882, + "step": 32564 + }, + { + "epoch": 0.94, + "grad_norm": 1.456223620220701, + "learning_rate": 8.043834578507215e-08, + "loss": 0.2955, + "step": 32565 + }, + { + "epoch": 0.94, + "grad_norm": 1.2646715222827574, + "learning_rate": 8.035445329713398e-08, + "loss": 0.263, + "step": 32566 + }, + { + "epoch": 0.94, + "grad_norm": 1.3560247768022715, + "learning_rate": 8.027060422512223e-08, + "loss": 0.2666, + "step": 32567 + }, + { + "epoch": 0.94, + "grad_norm": 2.0102572337571916, + "learning_rate": 8.018679856977741e-08, + "loss": 0.3017, + "step": 32568 + }, + { + "epoch": 0.94, + "grad_norm": 1.3262021467629832, + "learning_rate": 8.010303633183836e-08, + "loss": 0.2809, + "step": 32569 + }, + { + "epoch": 0.94, + "grad_norm": 1.3555418756424424, + "learning_rate": 8.00193175120445e-08, + "loss": 0.2712, + "step": 32570 + }, + { + "epoch": 0.94, + "grad_norm": 1.9043192212963844, + "learning_rate": 7.993564211113469e-08, + "loss": 0.3137, + "step": 32571 + }, + { + "epoch": 0.94, + "grad_norm": 1.404339730781162, + "learning_rate": 7.985201012984778e-08, + "loss": 0.2951, + "step": 32572 + }, + { + "epoch": 0.94, + "grad_norm": 1.2529600665602467, + "learning_rate": 7.97684215689215e-08, + "loss": 0.266, + "step": 32573 + }, + { + "epoch": 0.94, + "grad_norm": 1.295361168370017, + "learning_rate": 7.968487642909306e-08, + "loss": 0.2674, + "step": 32574 + }, + { + "epoch": 0.94, + "grad_norm": 1.3098288697862366, + "learning_rate": 7.960137471110019e-08, + "loss": 0.262, + "step": 32575 + }, + { + "epoch": 0.94, + "grad_norm": 3.279220869885311, + "learning_rate": 7.951791641568008e-08, + "loss": 0.2755, + "step": 32576 + }, + { + "epoch": 0.94, + "grad_norm": 1.3178712517916091, + "learning_rate": 7.94345015435688e-08, + "loss": 0.2743, + "step": 32577 + }, + { + "epoch": 0.94, + "grad_norm": 1.510067737645172, + "learning_rate": 7.935113009550244e-08, + "loss": 0.2657, + "step": 32578 + }, + { + "epoch": 0.94, + "grad_norm": 1.5054284216793945, + "learning_rate": 7.926780207221706e-08, + "loss": 0.2827, + "step": 32579 + }, + { + "epoch": 0.94, + "grad_norm": 1.2760072451680469, + "learning_rate": 7.918451747444822e-08, + "loss": 0.2659, + "step": 32580 + }, + { + "epoch": 0.95, + "grad_norm": 1.259412184356605, + "learning_rate": 7.910127630293085e-08, + "loss": 0.2535, + "step": 32581 + }, + { + "epoch": 0.95, + "grad_norm": 1.517223314631809, + "learning_rate": 7.901807855839883e-08, + "loss": 0.2683, + "step": 32582 + }, + { + "epoch": 0.95, + "grad_norm": 1.336729911787385, + "learning_rate": 7.893492424158711e-08, + "loss": 0.2797, + "step": 32583 + }, + { + "epoch": 0.95, + "grad_norm": 1.1499003955876355, + "learning_rate": 7.8851813353229e-08, + "loss": 0.2461, + "step": 32584 + }, + { + "epoch": 0.95, + "grad_norm": 1.5595073275209193, + "learning_rate": 7.876874589405836e-08, + "loss": 0.2431, + "step": 32585 + }, + { + "epoch": 0.95, + "grad_norm": 1.5786209924892134, + "learning_rate": 7.868572186480793e-08, + "loss": 0.2819, + "step": 32586 + }, + { + "epoch": 0.95, + "grad_norm": 1.209959890850478, + "learning_rate": 7.860274126621048e-08, + "loss": 0.2473, + "step": 32587 + }, + { + "epoch": 0.95, + "grad_norm": 1.279962568743984, + "learning_rate": 7.851980409899873e-08, + "loss": 0.295, + "step": 32588 + }, + { + "epoch": 0.95, + "grad_norm": 1.2922247885548748, + "learning_rate": 7.843691036390433e-08, + "loss": 0.2806, + "step": 32589 + }, + { + "epoch": 0.95, + "grad_norm": 1.295580131992971, + "learning_rate": 7.835406006165835e-08, + "loss": 0.2644, + "step": 32590 + }, + { + "epoch": 0.95, + "grad_norm": 1.4486530851888961, + "learning_rate": 7.827125319299301e-08, + "loss": 0.2832, + "step": 32591 + }, + { + "epoch": 0.95, + "grad_norm": 1.2659908721442306, + "learning_rate": 7.818848975863824e-08, + "loss": 0.2636, + "step": 32592 + }, + { + "epoch": 0.95, + "grad_norm": 1.5496759756401166, + "learning_rate": 7.81057697593246e-08, + "loss": 0.3057, + "step": 32593 + }, + { + "epoch": 0.95, + "grad_norm": 1.4800127667898142, + "learning_rate": 7.802309319578205e-08, + "loss": 0.2728, + "step": 32594 + }, + { + "epoch": 0.95, + "grad_norm": 1.8976314793757587, + "learning_rate": 7.794046006874056e-08, + "loss": 0.2955, + "step": 32595 + }, + { + "epoch": 0.95, + "grad_norm": 1.4342136311606284, + "learning_rate": 7.7857870378929e-08, + "loss": 0.2836, + "step": 32596 + }, + { + "epoch": 0.95, + "grad_norm": 1.609304250322669, + "learning_rate": 7.777532412707623e-08, + "loss": 0.2934, + "step": 32597 + }, + { + "epoch": 0.95, + "grad_norm": 1.357111065385712, + "learning_rate": 7.769282131391053e-08, + "loss": 0.2811, + "step": 32598 + }, + { + "epoch": 0.95, + "grad_norm": 1.3529986901789526, + "learning_rate": 7.761036194016137e-08, + "loss": 0.261, + "step": 32599 + }, + { + "epoch": 0.95, + "grad_norm": 1.2648070436770227, + "learning_rate": 7.752794600655422e-08, + "loss": 0.2484, + "step": 32600 + }, + { + "epoch": 0.95, + "grad_norm": 1.5889872256414996, + "learning_rate": 7.744557351381798e-08, + "loss": 0.2918, + "step": 32601 + }, + { + "epoch": 0.95, + "grad_norm": 1.3684673745357376, + "learning_rate": 7.73632444626793e-08, + "loss": 0.2644, + "step": 32602 + }, + { + "epoch": 0.95, + "grad_norm": 1.488158781903369, + "learning_rate": 7.728095885386422e-08, + "loss": 0.2867, + "step": 32603 + }, + { + "epoch": 0.95, + "grad_norm": 1.2432780378598263, + "learning_rate": 7.719871668809942e-08, + "loss": 0.2752, + "step": 32604 + }, + { + "epoch": 0.95, + "grad_norm": 1.3624075213717468, + "learning_rate": 7.711651796611097e-08, + "loss": 0.2773, + "step": 32605 + }, + { + "epoch": 0.95, + "grad_norm": 1.3496394803705771, + "learning_rate": 7.70343626886233e-08, + "loss": 0.2668, + "step": 32606 + }, + { + "epoch": 0.95, + "grad_norm": 1.45305064905244, + "learning_rate": 7.695225085636249e-08, + "loss": 0.2646, + "step": 32607 + }, + { + "epoch": 0.95, + "grad_norm": 1.5767874833017501, + "learning_rate": 7.68701824700524e-08, + "loss": 0.2702, + "step": 32608 + }, + { + "epoch": 0.95, + "grad_norm": 1.6169078710174347, + "learning_rate": 7.678815753041746e-08, + "loss": 0.2788, + "step": 32609 + }, + { + "epoch": 0.95, + "grad_norm": 1.3254618836780032, + "learning_rate": 7.670617603818265e-08, + "loss": 0.2795, + "step": 32610 + }, + { + "epoch": 0.95, + "grad_norm": 1.3773231743521308, + "learning_rate": 7.662423799406904e-08, + "loss": 0.2787, + "step": 32611 + }, + { + "epoch": 0.95, + "grad_norm": 3.1663515286475015, + "learning_rate": 7.654234339880163e-08, + "loss": 0.2576, + "step": 32612 + }, + { + "epoch": 0.95, + "grad_norm": 1.349836701673861, + "learning_rate": 7.646049225310314e-08, + "loss": 0.2553, + "step": 32613 + }, + { + "epoch": 0.95, + "grad_norm": 1.4534133960269604, + "learning_rate": 7.63786845576947e-08, + "loss": 0.2885, + "step": 32614 + }, + { + "epoch": 0.95, + "grad_norm": 1.3180807319913816, + "learning_rate": 7.629692031329905e-08, + "loss": 0.2689, + "step": 32615 + }, + { + "epoch": 0.95, + "grad_norm": 1.545043018001366, + "learning_rate": 7.621519952063839e-08, + "loss": 0.272, + "step": 32616 + }, + { + "epoch": 0.95, + "grad_norm": 1.4442708212575313, + "learning_rate": 7.613352218043268e-08, + "loss": 0.2767, + "step": 32617 + }, + { + "epoch": 0.95, + "grad_norm": 1.3616380060911935, + "learning_rate": 7.60518882934036e-08, + "loss": 0.2534, + "step": 32618 + }, + { + "epoch": 0.95, + "grad_norm": 1.2116529732554626, + "learning_rate": 7.597029786027055e-08, + "loss": 0.261, + "step": 32619 + }, + { + "epoch": 0.95, + "grad_norm": 1.6369253988074857, + "learning_rate": 7.588875088175462e-08, + "loss": 0.2406, + "step": 32620 + }, + { + "epoch": 0.95, + "grad_norm": 1.3160110151022382, + "learning_rate": 7.580724735857526e-08, + "loss": 0.2851, + "step": 32621 + }, + { + "epoch": 0.95, + "grad_norm": 1.6834047141712138, + "learning_rate": 7.57257872914513e-08, + "loss": 0.2769, + "step": 32622 + }, + { + "epoch": 0.95, + "grad_norm": 1.2993856795453893, + "learning_rate": 7.56443706811022e-08, + "loss": 0.2608, + "step": 32623 + }, + { + "epoch": 0.95, + "grad_norm": 1.5650232636264476, + "learning_rate": 7.55629975282457e-08, + "loss": 0.2543, + "step": 32624 + }, + { + "epoch": 0.95, + "grad_norm": 1.2723286925607422, + "learning_rate": 7.548166783360067e-08, + "loss": 0.2455, + "step": 32625 + }, + { + "epoch": 0.95, + "grad_norm": 1.4454296073432387, + "learning_rate": 7.540038159788487e-08, + "loss": 0.2825, + "step": 32626 + }, + { + "epoch": 0.95, + "grad_norm": 1.5557014600836718, + "learning_rate": 7.531913882181496e-08, + "loss": 0.2691, + "step": 32627 + }, + { + "epoch": 0.95, + "grad_norm": 1.5092518871168, + "learning_rate": 7.523793950610869e-08, + "loss": 0.2696, + "step": 32628 + }, + { + "epoch": 0.95, + "grad_norm": 1.384663309965905, + "learning_rate": 7.51567836514816e-08, + "loss": 0.2846, + "step": 32629 + }, + { + "epoch": 0.95, + "grad_norm": 1.3075386475070638, + "learning_rate": 7.50756712586509e-08, + "loss": 0.2888, + "step": 32630 + }, + { + "epoch": 0.95, + "grad_norm": 1.3125098878725296, + "learning_rate": 7.499460232833212e-08, + "loss": 0.2606, + "step": 32631 + }, + { + "epoch": 0.95, + "grad_norm": 1.2598628148860833, + "learning_rate": 7.491357686124023e-08, + "loss": 0.2661, + "step": 32632 + }, + { + "epoch": 0.95, + "grad_norm": 1.6090493101187373, + "learning_rate": 7.48325948580908e-08, + "loss": 0.2643, + "step": 32633 + }, + { + "epoch": 0.95, + "grad_norm": 0.9543203874924722, + "learning_rate": 7.475165631959824e-08, + "loss": 0.5513, + "step": 32634 + }, + { + "epoch": 0.95, + "grad_norm": 1.5537395604460653, + "learning_rate": 7.467076124647754e-08, + "loss": 0.2782, + "step": 32635 + }, + { + "epoch": 0.95, + "grad_norm": 1.2482454526877986, + "learning_rate": 7.458990963944146e-08, + "loss": 0.2667, + "step": 32636 + }, + { + "epoch": 0.95, + "grad_norm": 1.402454131473433, + "learning_rate": 7.450910149920499e-08, + "loss": 0.2493, + "step": 32637 + }, + { + "epoch": 0.95, + "grad_norm": 1.3294086553324531, + "learning_rate": 7.442833682647921e-08, + "loss": 0.2561, + "step": 32638 + }, + { + "epoch": 0.95, + "grad_norm": 1.202224557832427, + "learning_rate": 7.434761562197856e-08, + "loss": 0.2643, + "step": 32639 + }, + { + "epoch": 0.95, + "grad_norm": 1.3574857969573595, + "learning_rate": 7.426693788641415e-08, + "loss": 0.2928, + "step": 32640 + }, + { + "epoch": 0.95, + "grad_norm": 1.4312393453693144, + "learning_rate": 7.418630362049928e-08, + "loss": 0.2821, + "step": 32641 + }, + { + "epoch": 0.95, + "grad_norm": 1.3538970180515815, + "learning_rate": 7.410571282494506e-08, + "loss": 0.2909, + "step": 32642 + }, + { + "epoch": 0.95, + "grad_norm": 1.32886475712546, + "learning_rate": 7.402516550046201e-08, + "loss": 0.2755, + "step": 32643 + }, + { + "epoch": 0.95, + "grad_norm": 1.321802281007448, + "learning_rate": 7.394466164776182e-08, + "loss": 0.2477, + "step": 32644 + }, + { + "epoch": 0.95, + "grad_norm": 1.5546334837523317, + "learning_rate": 7.386420126755445e-08, + "loss": 0.2743, + "step": 32645 + }, + { + "epoch": 0.95, + "grad_norm": 1.2499491920502532, + "learning_rate": 7.3783784360551e-08, + "loss": 0.2491, + "step": 32646 + }, + { + "epoch": 0.95, + "grad_norm": 1.4632096473431448, + "learning_rate": 7.370341092745925e-08, + "loss": 0.2746, + "step": 32647 + }, + { + "epoch": 0.95, + "grad_norm": 1.36696059769269, + "learning_rate": 7.362308096898974e-08, + "loss": 0.2709, + "step": 32648 + }, + { + "epoch": 0.95, + "grad_norm": 1.3944973660767093, + "learning_rate": 7.354279448585133e-08, + "loss": 0.2647, + "step": 32649 + }, + { + "epoch": 0.95, + "grad_norm": 1.8636869436552053, + "learning_rate": 7.346255147875292e-08, + "loss": 0.2771, + "step": 32650 + }, + { + "epoch": 0.95, + "grad_norm": 1.6008819440730926, + "learning_rate": 7.338235194840116e-08, + "loss": 0.2671, + "step": 32651 + }, + { + "epoch": 0.95, + "grad_norm": 1.4732830070296288, + "learning_rate": 7.330219589550546e-08, + "loss": 0.2787, + "step": 32652 + }, + { + "epoch": 0.95, + "grad_norm": 1.2484559358474734, + "learning_rate": 7.322208332077252e-08, + "loss": 0.2538, + "step": 32653 + }, + { + "epoch": 0.95, + "grad_norm": 1.3366573552501544, + "learning_rate": 7.314201422490951e-08, + "loss": 0.2884, + "step": 32654 + }, + { + "epoch": 0.95, + "grad_norm": 2.0217799728666357, + "learning_rate": 7.306198860862257e-08, + "loss": 0.2764, + "step": 32655 + }, + { + "epoch": 0.95, + "grad_norm": 1.3339267528039573, + "learning_rate": 7.298200647261833e-08, + "loss": 0.2797, + "step": 32656 + }, + { + "epoch": 0.95, + "grad_norm": 1.4072015139558642, + "learning_rate": 7.290206781760234e-08, + "loss": 0.2606, + "step": 32657 + }, + { + "epoch": 0.95, + "grad_norm": 1.5715512123046305, + "learning_rate": 7.282217264428016e-08, + "loss": 0.2869, + "step": 32658 + }, + { + "epoch": 0.95, + "grad_norm": 1.4981866943693982, + "learning_rate": 7.274232095335731e-08, + "loss": 0.2643, + "step": 32659 + }, + { + "epoch": 0.95, + "grad_norm": 1.3348055136058177, + "learning_rate": 7.266251274553771e-08, + "loss": 0.2813, + "step": 32660 + }, + { + "epoch": 0.95, + "grad_norm": 1.2854648468956584, + "learning_rate": 7.258274802152687e-08, + "loss": 0.272, + "step": 32661 + }, + { + "epoch": 0.95, + "grad_norm": 1.4718727637691607, + "learning_rate": 7.250302678202703e-08, + "loss": 0.2757, + "step": 32662 + }, + { + "epoch": 0.95, + "grad_norm": 1.3105327717449349, + "learning_rate": 7.242334902774373e-08, + "loss": 0.2819, + "step": 32663 + }, + { + "epoch": 0.95, + "grad_norm": 1.3090513837501365, + "learning_rate": 7.234371475937807e-08, + "loss": 0.2688, + "step": 32664 + }, + { + "epoch": 0.95, + "grad_norm": 1.721541018127466, + "learning_rate": 7.22641239776345e-08, + "loss": 0.2695, + "step": 32665 + }, + { + "epoch": 0.95, + "grad_norm": 1.3307253747503685, + "learning_rate": 7.218457668321466e-08, + "loss": 0.2794, + "step": 32666 + }, + { + "epoch": 0.95, + "grad_norm": 1.3710471226846337, + "learning_rate": 7.210507287682023e-08, + "loss": 0.2741, + "step": 32667 + }, + { + "epoch": 0.95, + "grad_norm": 1.5801868309871396, + "learning_rate": 7.202561255915286e-08, + "loss": 0.2765, + "step": 32668 + }, + { + "epoch": 0.95, + "grad_norm": 1.6330628013146335, + "learning_rate": 7.194619573091477e-08, + "loss": 0.2684, + "step": 32669 + }, + { + "epoch": 0.95, + "grad_norm": 1.4288922333505787, + "learning_rate": 7.186682239280541e-08, + "loss": 0.2873, + "step": 32670 + }, + { + "epoch": 0.95, + "grad_norm": 1.2895456460878565, + "learning_rate": 7.178749254552641e-08, + "loss": 0.272, + "step": 32671 + }, + { + "epoch": 0.95, + "grad_norm": 1.2264077029781677, + "learning_rate": 7.170820618977781e-08, + "loss": 0.2642, + "step": 32672 + }, + { + "epoch": 0.95, + "grad_norm": 1.3352096315318693, + "learning_rate": 7.162896332625901e-08, + "loss": 0.2586, + "step": 32673 + }, + { + "epoch": 0.95, + "grad_norm": 1.7238148971534577, + "learning_rate": 7.154976395566892e-08, + "loss": 0.2743, + "step": 32674 + }, + { + "epoch": 0.95, + "grad_norm": 1.4409663813369546, + "learning_rate": 7.147060807870698e-08, + "loss": 0.2654, + "step": 32675 + }, + { + "epoch": 0.95, + "grad_norm": 0.9527000173334867, + "learning_rate": 7.139149569607151e-08, + "loss": 0.5452, + "step": 32676 + }, + { + "epoch": 0.95, + "grad_norm": 1.5801800401906339, + "learning_rate": 7.131242680846085e-08, + "loss": 0.2833, + "step": 32677 + }, + { + "epoch": 0.95, + "grad_norm": 1.3152210608470936, + "learning_rate": 7.12334014165722e-08, + "loss": 0.2595, + "step": 32678 + }, + { + "epoch": 0.95, + "grad_norm": 1.5320718767674641, + "learning_rate": 7.115441952110391e-08, + "loss": 0.2865, + "step": 32679 + }, + { + "epoch": 0.95, + "grad_norm": 1.2789675475037678, + "learning_rate": 7.107548112275264e-08, + "loss": 0.2607, + "step": 32680 + }, + { + "epoch": 0.95, + "grad_norm": 1.2344386697112226, + "learning_rate": 7.09965862222145e-08, + "loss": 0.26, + "step": 32681 + }, + { + "epoch": 0.95, + "grad_norm": 1.294071977008282, + "learning_rate": 7.091773482018672e-08, + "loss": 0.2812, + "step": 32682 + }, + { + "epoch": 0.95, + "grad_norm": 1.390327281939862, + "learning_rate": 7.083892691736428e-08, + "loss": 0.2725, + "step": 32683 + }, + { + "epoch": 0.95, + "grad_norm": 1.47616649857729, + "learning_rate": 7.076016251444329e-08, + "loss": 0.2708, + "step": 32684 + }, + { + "epoch": 0.95, + "grad_norm": 1.3961779060100263, + "learning_rate": 7.06814416121182e-08, + "loss": 0.2878, + "step": 32685 + }, + { + "epoch": 0.95, + "grad_norm": 1.4949550953942519, + "learning_rate": 7.060276421108403e-08, + "loss": 0.2783, + "step": 32686 + }, + { + "epoch": 0.95, + "grad_norm": 1.322302294946879, + "learning_rate": 7.05241303120352e-08, + "loss": 0.2679, + "step": 32687 + }, + { + "epoch": 0.95, + "grad_norm": 1.2054915056428788, + "learning_rate": 7.044553991566616e-08, + "loss": 0.2449, + "step": 32688 + }, + { + "epoch": 0.95, + "grad_norm": 1.6849241232126153, + "learning_rate": 7.036699302266914e-08, + "loss": 0.3246, + "step": 32689 + }, + { + "epoch": 0.95, + "grad_norm": 2.659373541721589, + "learning_rate": 7.028848963373858e-08, + "loss": 0.275, + "step": 32690 + }, + { + "epoch": 0.95, + "grad_norm": 1.379956916844427, + "learning_rate": 7.02100297495667e-08, + "loss": 0.2747, + "step": 32691 + }, + { + "epoch": 0.95, + "grad_norm": 1.6093347443052821, + "learning_rate": 7.013161337084628e-08, + "loss": 0.2616, + "step": 32692 + }, + { + "epoch": 0.95, + "grad_norm": 1.436922903467507, + "learning_rate": 7.0053240498269e-08, + "loss": 0.2713, + "step": 32693 + }, + { + "epoch": 0.95, + "grad_norm": 1.2985149361476307, + "learning_rate": 6.997491113252597e-08, + "loss": 0.2742, + "step": 32694 + }, + { + "epoch": 0.95, + "grad_norm": 1.2642829443916395, + "learning_rate": 6.98966252743094e-08, + "loss": 0.2693, + "step": 32695 + }, + { + "epoch": 0.95, + "grad_norm": 1.2649807495294654, + "learning_rate": 6.981838292430987e-08, + "loss": 0.2632, + "step": 32696 + }, + { + "epoch": 0.95, + "grad_norm": 1.4034974461364238, + "learning_rate": 6.974018408321737e-08, + "loss": 0.2754, + "step": 32697 + }, + { + "epoch": 0.95, + "grad_norm": 1.3243832457453435, + "learning_rate": 6.966202875172301e-08, + "loss": 0.2594, + "step": 32698 + }, + { + "epoch": 0.95, + "grad_norm": 1.285683905478748, + "learning_rate": 6.958391693051569e-08, + "loss": 0.2705, + "step": 32699 + }, + { + "epoch": 0.95, + "grad_norm": 1.543352767904254, + "learning_rate": 6.950584862028487e-08, + "loss": 0.2812, + "step": 32700 + }, + { + "epoch": 0.95, + "grad_norm": 1.4925018919112554, + "learning_rate": 6.942782382171997e-08, + "loss": 0.2801, + "step": 32701 + }, + { + "epoch": 0.95, + "grad_norm": 1.3229159600255431, + "learning_rate": 6.93498425355088e-08, + "loss": 0.2684, + "step": 32702 + }, + { + "epoch": 0.95, + "grad_norm": 1.3375634226587627, + "learning_rate": 6.927190476234025e-08, + "loss": 0.2479, + "step": 32703 + }, + { + "epoch": 0.95, + "grad_norm": 1.4543211027836889, + "learning_rate": 6.919401050290153e-08, + "loss": 0.2591, + "step": 32704 + }, + { + "epoch": 0.95, + "grad_norm": 1.6988575426277006, + "learning_rate": 6.9116159757881e-08, + "loss": 0.2804, + "step": 32705 + }, + { + "epoch": 0.95, + "grad_norm": 1.3809460920073644, + "learning_rate": 6.90383525279642e-08, + "loss": 0.2641, + "step": 32706 + }, + { + "epoch": 0.95, + "grad_norm": 1.2524250983933871, + "learning_rate": 6.896058881383949e-08, + "loss": 0.2497, + "step": 32707 + }, + { + "epoch": 0.95, + "grad_norm": 1.381569283209156, + "learning_rate": 6.888286861619187e-08, + "loss": 0.2605, + "step": 32708 + }, + { + "epoch": 0.95, + "grad_norm": 1.3899929212696636, + "learning_rate": 6.880519193570745e-08, + "loss": 0.2748, + "step": 32709 + }, + { + "epoch": 0.95, + "grad_norm": 1.3988561739803733, + "learning_rate": 6.872755877307235e-08, + "loss": 0.2668, + "step": 32710 + }, + { + "epoch": 0.95, + "grad_norm": 1.7549709019473416, + "learning_rate": 6.864996912897104e-08, + "loss": 0.2665, + "step": 32711 + }, + { + "epoch": 0.95, + "grad_norm": 1.651448837100554, + "learning_rate": 6.857242300408851e-08, + "loss": 0.278, + "step": 32712 + }, + { + "epoch": 0.95, + "grad_norm": 1.4289231994625502, + "learning_rate": 6.849492039910866e-08, + "loss": 0.2574, + "step": 32713 + }, + { + "epoch": 0.95, + "grad_norm": 1.318235847548285, + "learning_rate": 6.84174613147165e-08, + "loss": 0.2941, + "step": 32714 + }, + { + "epoch": 0.95, + "grad_norm": 1.2882420096190528, + "learning_rate": 6.834004575159426e-08, + "loss": 0.2632, + "step": 32715 + }, + { + "epoch": 0.95, + "grad_norm": 1.327966868055251, + "learning_rate": 6.826267371042638e-08, + "loss": 0.2816, + "step": 32716 + }, + { + "epoch": 0.95, + "grad_norm": 2.0732814742828105, + "learning_rate": 6.818534519189457e-08, + "loss": 0.2719, + "step": 32717 + }, + { + "epoch": 0.95, + "grad_norm": 1.4870170875384656, + "learning_rate": 6.81080601966827e-08, + "loss": 0.2738, + "step": 32718 + }, + { + "epoch": 0.95, + "grad_norm": 1.290212351508383, + "learning_rate": 6.803081872547134e-08, + "loss": 0.282, + "step": 32719 + }, + { + "epoch": 0.95, + "grad_norm": 1.4734505987021458, + "learning_rate": 6.795362077894275e-08, + "loss": 0.2642, + "step": 32720 + }, + { + "epoch": 0.95, + "grad_norm": 2.6752687529800494, + "learning_rate": 6.787646635777855e-08, + "loss": 0.2896, + "step": 32721 + }, + { + "epoch": 0.95, + "grad_norm": 1.478709119137972, + "learning_rate": 6.779935546265881e-08, + "loss": 0.2705, + "step": 32722 + }, + { + "epoch": 0.95, + "grad_norm": 1.5899542331165477, + "learning_rate": 6.772228809426406e-08, + "loss": 0.2714, + "step": 32723 + }, + { + "epoch": 0.95, + "grad_norm": 1.6577717689169864, + "learning_rate": 6.764526425327545e-08, + "loss": 0.295, + "step": 32724 + }, + { + "epoch": 0.95, + "grad_norm": 2.049938760640875, + "learning_rate": 6.756828394037184e-08, + "loss": 0.2606, + "step": 32725 + }, + { + "epoch": 0.95, + "grad_norm": 1.3427761210604265, + "learning_rate": 6.749134715623329e-08, + "loss": 0.2704, + "step": 32726 + }, + { + "epoch": 0.95, + "grad_norm": 1.6359415914573054, + "learning_rate": 6.741445390153756e-08, + "loss": 0.271, + "step": 32727 + }, + { + "epoch": 0.95, + "grad_norm": 1.3613212226938407, + "learning_rate": 6.733760417696466e-08, + "loss": 0.2563, + "step": 32728 + }, + { + "epoch": 0.95, + "grad_norm": 1.318830361497871, + "learning_rate": 6.726079798319185e-08, + "loss": 0.2598, + "step": 32729 + }, + { + "epoch": 0.95, + "grad_norm": 1.4414601340374378, + "learning_rate": 6.718403532089746e-08, + "loss": 0.2767, + "step": 32730 + }, + { + "epoch": 0.95, + "grad_norm": 1.5550237586522402, + "learning_rate": 6.710731619075816e-08, + "loss": 0.2783, + "step": 32731 + }, + { + "epoch": 0.95, + "grad_norm": 1.9524723328375009, + "learning_rate": 6.703064059345176e-08, + "loss": 0.248, + "step": 32732 + }, + { + "epoch": 0.95, + "grad_norm": 1.434257478097167, + "learning_rate": 6.695400852965495e-08, + "loss": 0.2634, + "step": 32733 + }, + { + "epoch": 0.95, + "grad_norm": 1.2412236477486605, + "learning_rate": 6.687742000004326e-08, + "loss": 0.2556, + "step": 32734 + }, + { + "epoch": 0.95, + "grad_norm": 1.5203433959918606, + "learning_rate": 6.680087500529341e-08, + "loss": 0.2748, + "step": 32735 + }, + { + "epoch": 0.95, + "grad_norm": 1.2908666091565797, + "learning_rate": 6.67243735460804e-08, + "loss": 0.2682, + "step": 32736 + }, + { + "epoch": 0.95, + "grad_norm": 1.5913479726848878, + "learning_rate": 6.664791562308037e-08, + "loss": 0.2891, + "step": 32737 + }, + { + "epoch": 0.95, + "grad_norm": 1.2782721351532196, + "learning_rate": 6.65715012369661e-08, + "loss": 0.254, + "step": 32738 + }, + { + "epoch": 0.95, + "grad_norm": 1.3844604653751043, + "learning_rate": 6.649513038841315e-08, + "loss": 0.251, + "step": 32739 + }, + { + "epoch": 0.95, + "grad_norm": 1.7635466151866401, + "learning_rate": 6.641880307809601e-08, + "loss": 0.3273, + "step": 32740 + }, + { + "epoch": 0.95, + "grad_norm": 1.7578153832036865, + "learning_rate": 6.634251930668689e-08, + "loss": 0.2526, + "step": 32741 + }, + { + "epoch": 0.95, + "grad_norm": 1.9058748318074044, + "learning_rate": 6.626627907486027e-08, + "loss": 0.2573, + "step": 32742 + }, + { + "epoch": 0.95, + "grad_norm": 1.36756307676446, + "learning_rate": 6.61900823832884e-08, + "loss": 0.2582, + "step": 32743 + }, + { + "epoch": 0.95, + "grad_norm": 1.311096676114014, + "learning_rate": 6.611392923264403e-08, + "loss": 0.2635, + "step": 32744 + }, + { + "epoch": 0.95, + "grad_norm": 1.3854120710461322, + "learning_rate": 6.60378196235989e-08, + "loss": 0.2738, + "step": 32745 + }, + { + "epoch": 0.95, + "grad_norm": 1.2899850473331251, + "learning_rate": 6.596175355682466e-08, + "loss": 0.2708, + "step": 32746 + }, + { + "epoch": 0.95, + "grad_norm": 1.2666241107522678, + "learning_rate": 6.588573103299245e-08, + "loss": 0.271, + "step": 32747 + }, + { + "epoch": 0.95, + "grad_norm": 1.3520276415134722, + "learning_rate": 6.58097520527734e-08, + "loss": 0.2711, + "step": 32748 + }, + { + "epoch": 0.95, + "grad_norm": 1.4185312330955988, + "learning_rate": 6.573381661683865e-08, + "loss": 0.2869, + "step": 32749 + }, + { + "epoch": 0.95, + "grad_norm": 1.2061065750963331, + "learning_rate": 6.565792472585708e-08, + "loss": 0.2599, + "step": 32750 + }, + { + "epoch": 0.95, + "grad_norm": 1.3480612444712154, + "learning_rate": 6.558207638049929e-08, + "loss": 0.2558, + "step": 32751 + }, + { + "epoch": 0.95, + "grad_norm": 1.435482885488647, + "learning_rate": 6.55062715814342e-08, + "loss": 0.3061, + "step": 32752 + }, + { + "epoch": 0.95, + "grad_norm": 1.3452082546220485, + "learning_rate": 6.543051032933124e-08, + "loss": 0.2578, + "step": 32753 + }, + { + "epoch": 0.95, + "grad_norm": 1.2657319728652876, + "learning_rate": 6.535479262485822e-08, + "loss": 0.2685, + "step": 32754 + }, + { + "epoch": 0.95, + "grad_norm": 1.3969821420852095, + "learning_rate": 6.527911846868463e-08, + "loss": 0.2675, + "step": 32755 + }, + { + "epoch": 0.95, + "grad_norm": 1.5583960746595693, + "learning_rate": 6.520348786147768e-08, + "loss": 0.2872, + "step": 32756 + }, + { + "epoch": 0.95, + "grad_norm": 1.4649000194314532, + "learning_rate": 6.512790080390407e-08, + "loss": 0.2829, + "step": 32757 + }, + { + "epoch": 0.95, + "grad_norm": 1.5477710383539882, + "learning_rate": 6.505235729663162e-08, + "loss": 0.2816, + "step": 32758 + }, + { + "epoch": 0.95, + "grad_norm": 1.3141401033186288, + "learning_rate": 6.4976857340327e-08, + "loss": 0.2751, + "step": 32759 + }, + { + "epoch": 0.95, + "grad_norm": 1.4062781354187175, + "learning_rate": 6.490140093565634e-08, + "loss": 0.2807, + "step": 32760 + }, + { + "epoch": 0.95, + "grad_norm": 1.2693552076399317, + "learning_rate": 6.482598808328521e-08, + "loss": 0.2716, + "step": 32761 + }, + { + "epoch": 0.95, + "grad_norm": 2.5989604927780325, + "learning_rate": 6.475061878387978e-08, + "loss": 0.2571, + "step": 32762 + }, + { + "epoch": 0.95, + "grad_norm": 1.3957676363041969, + "learning_rate": 6.467529303810505e-08, + "loss": 0.2616, + "step": 32763 + }, + { + "epoch": 0.95, + "grad_norm": 3.0075886562953924, + "learning_rate": 6.460001084662548e-08, + "loss": 0.2797, + "step": 32764 + }, + { + "epoch": 0.95, + "grad_norm": 1.3815859462515454, + "learning_rate": 6.452477221010555e-08, + "loss": 0.2799, + "step": 32765 + }, + { + "epoch": 0.95, + "grad_norm": 1.4801991723071886, + "learning_rate": 6.444957712920918e-08, + "loss": 0.2851, + "step": 32766 + }, + { + "epoch": 0.95, + "grad_norm": 1.3195463219913164, + "learning_rate": 6.437442560460027e-08, + "loss": 0.2806, + "step": 32767 + }, + { + "epoch": 0.95, + "grad_norm": 1.2724987939233472, + "learning_rate": 6.429931763694164e-08, + "loss": 0.2761, + "step": 32768 + }, + { + "epoch": 0.95, + "grad_norm": 1.6097010808861434, + "learning_rate": 6.422425322689608e-08, + "loss": 0.2853, + "step": 32769 + }, + { + "epoch": 0.95, + "grad_norm": 1.710723329372176, + "learning_rate": 6.414923237512638e-08, + "loss": 0.2645, + "step": 32770 + }, + { + "epoch": 0.95, + "grad_norm": 1.4504526869239107, + "learning_rate": 6.407425508229482e-08, + "loss": 0.2726, + "step": 32771 + }, + { + "epoch": 0.95, + "grad_norm": 1.5089427910440751, + "learning_rate": 6.399932134906195e-08, + "loss": 0.3304, + "step": 32772 + }, + { + "epoch": 0.95, + "grad_norm": 1.3898901577465148, + "learning_rate": 6.392443117609059e-08, + "loss": 0.2386, + "step": 32773 + }, + { + "epoch": 0.95, + "grad_norm": 1.3575591356000563, + "learning_rate": 6.384958456404078e-08, + "loss": 0.2634, + "step": 32774 + }, + { + "epoch": 0.95, + "grad_norm": 1.3733461295597906, + "learning_rate": 6.377478151357308e-08, + "loss": 0.2605, + "step": 32775 + }, + { + "epoch": 0.95, + "grad_norm": 0.9658875098280827, + "learning_rate": 6.370002202534754e-08, + "loss": 0.5494, + "step": 32776 + }, + { + "epoch": 0.95, + "grad_norm": 1.4869690580380293, + "learning_rate": 6.362530610002415e-08, + "loss": 0.2636, + "step": 32777 + }, + { + "epoch": 0.95, + "grad_norm": 1.303372851064566, + "learning_rate": 6.355063373826242e-08, + "loss": 0.2721, + "step": 32778 + }, + { + "epoch": 0.95, + "grad_norm": 1.5251895666854725, + "learning_rate": 6.34760049407207e-08, + "loss": 0.2744, + "step": 32779 + }, + { + "epoch": 0.95, + "grad_norm": 1.3758425595596877, + "learning_rate": 6.340141970805791e-08, + "loss": 0.2876, + "step": 32780 + }, + { + "epoch": 0.95, + "grad_norm": 1.4153019865332257, + "learning_rate": 6.332687804093296e-08, + "loss": 0.2875, + "step": 32781 + }, + { + "epoch": 0.95, + "grad_norm": 1.4124929415576135, + "learning_rate": 6.325237994000256e-08, + "loss": 0.3057, + "step": 32782 + }, + { + "epoch": 0.95, + "grad_norm": 1.2911500701464553, + "learning_rate": 6.317792540592505e-08, + "loss": 0.2482, + "step": 32783 + }, + { + "epoch": 0.95, + "grad_norm": 1.3229368786596065, + "learning_rate": 6.310351443935659e-08, + "loss": 0.2807, + "step": 32784 + }, + { + "epoch": 0.95, + "grad_norm": 1.3992775926574343, + "learning_rate": 6.302914704095497e-08, + "loss": 0.3041, + "step": 32785 + }, + { + "epoch": 0.95, + "grad_norm": 1.5199176982817935, + "learning_rate": 6.295482321137525e-08, + "loss": 0.271, + "step": 32786 + }, + { + "epoch": 0.95, + "grad_norm": 1.4008229989977043, + "learning_rate": 6.288054295127466e-08, + "loss": 0.2529, + "step": 32787 + }, + { + "epoch": 0.95, + "grad_norm": 1.7089881709636452, + "learning_rate": 6.280630626130768e-08, + "loss": 0.2747, + "step": 32788 + }, + { + "epoch": 0.95, + "grad_norm": 1.37151299808487, + "learning_rate": 6.273211314212934e-08, + "loss": 0.2841, + "step": 32789 + }, + { + "epoch": 0.95, + "grad_norm": 1.4600190345682493, + "learning_rate": 6.26579635943958e-08, + "loss": 0.2449, + "step": 32790 + }, + { + "epoch": 0.95, + "grad_norm": 1.2944473913011736, + "learning_rate": 6.258385761875984e-08, + "loss": 0.2781, + "step": 32791 + }, + { + "epoch": 0.95, + "grad_norm": 1.629154228742529, + "learning_rate": 6.250979521587652e-08, + "loss": 0.2617, + "step": 32792 + }, + { + "epoch": 0.95, + "grad_norm": 1.4074052690861245, + "learning_rate": 6.24357763863992e-08, + "loss": 0.2448, + "step": 32793 + }, + { + "epoch": 0.95, + "grad_norm": 1.335785718383112, + "learning_rate": 6.236180113098011e-08, + "loss": 0.2531, + "step": 32794 + }, + { + "epoch": 0.95, + "grad_norm": 1.3147357502913155, + "learning_rate": 6.228786945027376e-08, + "loss": 0.255, + "step": 32795 + }, + { + "epoch": 0.95, + "grad_norm": 1.9442655113889553, + "learning_rate": 6.221398134493129e-08, + "loss": 0.2748, + "step": 32796 + }, + { + "epoch": 0.95, + "grad_norm": 1.3316681286075425, + "learning_rate": 6.214013681560493e-08, + "loss": 0.3109, + "step": 32797 + }, + { + "epoch": 0.95, + "grad_norm": 1.62601379680558, + "learning_rate": 6.206633586294697e-08, + "loss": 0.2772, + "step": 32798 + }, + { + "epoch": 0.95, + "grad_norm": 1.8426655024360485, + "learning_rate": 6.199257848760854e-08, + "loss": 0.2842, + "step": 32799 + }, + { + "epoch": 0.95, + "grad_norm": 2.6594159665938624, + "learning_rate": 6.191886469024078e-08, + "loss": 0.2771, + "step": 32800 + }, + { + "epoch": 0.95, + "grad_norm": 1.2571330493126716, + "learning_rate": 6.184519447149317e-08, + "loss": 0.2617, + "step": 32801 + }, + { + "epoch": 0.95, + "grad_norm": 1.6643165344895585, + "learning_rate": 6.177156783201632e-08, + "loss": 0.2814, + "step": 32802 + }, + { + "epoch": 0.95, + "grad_norm": 1.3055494647197434, + "learning_rate": 6.169798477246081e-08, + "loss": 0.266, + "step": 32803 + }, + { + "epoch": 0.95, + "grad_norm": 1.391182256265952, + "learning_rate": 6.1624445293475e-08, + "loss": 0.2758, + "step": 32804 + }, + { + "epoch": 0.95, + "grad_norm": 1.337709114585917, + "learning_rate": 6.155094939570838e-08, + "loss": 0.2633, + "step": 32805 + }, + { + "epoch": 0.95, + "grad_norm": 1.715095968115233, + "learning_rate": 6.147749707980933e-08, + "loss": 0.2633, + "step": 32806 + }, + { + "epoch": 0.95, + "grad_norm": 1.4615026899587207, + "learning_rate": 6.140408834642619e-08, + "loss": 0.2563, + "step": 32807 + }, + { + "epoch": 0.95, + "grad_norm": 1.3778020887634972, + "learning_rate": 6.133072319620626e-08, + "loss": 0.2747, + "step": 32808 + }, + { + "epoch": 0.95, + "grad_norm": 1.4972652302951175, + "learning_rate": 6.125740162979788e-08, + "loss": 0.2677, + "step": 32809 + }, + { + "epoch": 0.95, + "grad_norm": 1.4203105618126777, + "learning_rate": 6.118412364784776e-08, + "loss": 0.2705, + "step": 32810 + }, + { + "epoch": 0.95, + "grad_norm": 3.627224799128115, + "learning_rate": 6.111088925100317e-08, + "loss": 0.2779, + "step": 32811 + }, + { + "epoch": 0.95, + "grad_norm": 1.416892220563851, + "learning_rate": 6.103769843990858e-08, + "loss": 0.2642, + "step": 32812 + }, + { + "epoch": 0.95, + "grad_norm": 1.4909123477223754, + "learning_rate": 6.096455121521183e-08, + "loss": 0.2761, + "step": 32813 + }, + { + "epoch": 0.95, + "grad_norm": 1.516635279789135, + "learning_rate": 6.089144757755739e-08, + "loss": 0.2316, + "step": 32814 + }, + { + "epoch": 0.95, + "grad_norm": 1.4197082949456905, + "learning_rate": 6.081838752759084e-08, + "loss": 0.2416, + "step": 32815 + }, + { + "epoch": 0.95, + "grad_norm": 1.267734803374103, + "learning_rate": 6.074537106595668e-08, + "loss": 0.2665, + "step": 32816 + }, + { + "epoch": 0.95, + "grad_norm": 1.3107340026802217, + "learning_rate": 6.067239819329995e-08, + "loss": 0.2554, + "step": 32817 + }, + { + "epoch": 0.95, + "grad_norm": 1.2495729517828205, + "learning_rate": 6.059946891026347e-08, + "loss": 0.2625, + "step": 32818 + }, + { + "epoch": 0.95, + "grad_norm": 1.3826890351426282, + "learning_rate": 6.052658321749172e-08, + "loss": 0.264, + "step": 32819 + }, + { + "epoch": 0.95, + "grad_norm": 1.444542278528307, + "learning_rate": 6.045374111562808e-08, + "loss": 0.2758, + "step": 32820 + }, + { + "epoch": 0.95, + "grad_norm": 1.4426842133430067, + "learning_rate": 6.038094260531425e-08, + "loss": 0.2651, + "step": 32821 + }, + { + "epoch": 0.95, + "grad_norm": 1.3034602976927052, + "learning_rate": 6.03081876871936e-08, + "loss": 0.2828, + "step": 32822 + }, + { + "epoch": 0.95, + "grad_norm": 1.3027329547741802, + "learning_rate": 6.023547636190841e-08, + "loss": 0.2812, + "step": 32823 + }, + { + "epoch": 0.95, + "grad_norm": 1.4296619011985507, + "learning_rate": 6.016280863009927e-08, + "loss": 0.2789, + "step": 32824 + }, + { + "epoch": 0.95, + "grad_norm": 1.515650932739604, + "learning_rate": 6.009018449240844e-08, + "loss": 0.2576, + "step": 32825 + }, + { + "epoch": 0.95, + "grad_norm": 1.7368811743064907, + "learning_rate": 6.001760394947653e-08, + "loss": 0.2588, + "step": 32826 + }, + { + "epoch": 0.95, + "grad_norm": 1.278515737544399, + "learning_rate": 5.994506700194413e-08, + "loss": 0.2756, + "step": 32827 + }, + { + "epoch": 0.95, + "grad_norm": 1.4038349357054796, + "learning_rate": 5.987257365045184e-08, + "loss": 0.2787, + "step": 32828 + }, + { + "epoch": 0.95, + "grad_norm": 1.2559918512129007, + "learning_rate": 5.980012389563806e-08, + "loss": 0.2591, + "step": 32829 + }, + { + "epoch": 0.95, + "grad_norm": 1.2977005501753502, + "learning_rate": 5.972771773814334e-08, + "loss": 0.2555, + "step": 32830 + }, + { + "epoch": 0.95, + "grad_norm": 1.3717729924075863, + "learning_rate": 5.96553551786061e-08, + "loss": 0.272, + "step": 32831 + }, + { + "epoch": 0.95, + "grad_norm": 1.4172739065276245, + "learning_rate": 5.9583036217665256e-08, + "loss": 0.2481, + "step": 32832 + }, + { + "epoch": 0.95, + "grad_norm": 1.364569076920858, + "learning_rate": 5.951076085595919e-08, + "loss": 0.273, + "step": 32833 + }, + { + "epoch": 0.95, + "grad_norm": 1.417373235634706, + "learning_rate": 5.943852909412517e-08, + "loss": 0.2678, + "step": 32834 + }, + { + "epoch": 0.95, + "grad_norm": 1.3281452560652516, + "learning_rate": 5.936634093280103e-08, + "loss": 0.2688, + "step": 32835 + }, + { + "epoch": 0.95, + "grad_norm": 1.2731773722013142, + "learning_rate": 5.929419637262401e-08, + "loss": 0.2665, + "step": 32836 + }, + { + "epoch": 0.95, + "grad_norm": 1.5017968688783168, + "learning_rate": 5.92220954142303e-08, + "loss": 0.265, + "step": 32837 + }, + { + "epoch": 0.95, + "grad_norm": 1.2865783259504564, + "learning_rate": 5.9150038058256586e-08, + "loss": 0.2624, + "step": 32838 + }, + { + "epoch": 0.95, + "grad_norm": 1.453239310013675, + "learning_rate": 5.9078024305338486e-08, + "loss": 0.2956, + "step": 32839 + }, + { + "epoch": 0.95, + "grad_norm": 1.2761389966166539, + "learning_rate": 5.900605415611105e-08, + "loss": 0.2586, + "step": 32840 + }, + { + "epoch": 0.95, + "grad_norm": 0.9262255708227606, + "learning_rate": 5.8934127611210426e-08, + "loss": 0.5655, + "step": 32841 + }, + { + "epoch": 0.95, + "grad_norm": 1.2455821223632793, + "learning_rate": 5.886224467127111e-08, + "loss": 0.2592, + "step": 32842 + }, + { + "epoch": 0.95, + "grad_norm": 1.3905616179039535, + "learning_rate": 5.8790405336927034e-08, + "loss": 0.2657, + "step": 32843 + }, + { + "epoch": 0.95, + "grad_norm": 2.3128251914193685, + "learning_rate": 5.87186096088127e-08, + "loss": 0.2575, + "step": 32844 + }, + { + "epoch": 0.95, + "grad_norm": 1.6444324970183644, + "learning_rate": 5.864685748756094e-08, + "loss": 0.2924, + "step": 32845 + }, + { + "epoch": 0.95, + "grad_norm": 1.431677555233424, + "learning_rate": 5.8575148973806225e-08, + "loss": 0.2883, + "step": 32846 + }, + { + "epoch": 0.95, + "grad_norm": 1.305278933062135, + "learning_rate": 5.8503484068180296e-08, + "loss": 0.2583, + "step": 32847 + }, + { + "epoch": 0.95, + "grad_norm": 1.2916043213611434, + "learning_rate": 5.843186277131596e-08, + "loss": 0.2611, + "step": 32848 + }, + { + "epoch": 0.95, + "grad_norm": 1.3750749828508577, + "learning_rate": 5.8360285083845504e-08, + "loss": 0.272, + "step": 32849 + }, + { + "epoch": 0.95, + "grad_norm": 2.109597094157675, + "learning_rate": 5.828875100639952e-08, + "loss": 0.2536, + "step": 32850 + }, + { + "epoch": 0.95, + "grad_norm": 1.3590298664906593, + "learning_rate": 5.821726053961086e-08, + "loss": 0.2863, + "step": 32851 + }, + { + "epoch": 0.95, + "grad_norm": 1.3317548867187758, + "learning_rate": 5.814581368410954e-08, + "loss": 0.2603, + "step": 32852 + }, + { + "epoch": 0.95, + "grad_norm": 1.817079584350329, + "learning_rate": 5.80744104405262e-08, + "loss": 0.2678, + "step": 32853 + }, + { + "epoch": 0.95, + "grad_norm": 1.2795267176320901, + "learning_rate": 5.800305080949087e-08, + "loss": 0.2618, + "step": 32854 + }, + { + "epoch": 0.95, + "grad_norm": 1.2320043732373025, + "learning_rate": 5.793173479163417e-08, + "loss": 0.2571, + "step": 32855 + }, + { + "epoch": 0.95, + "grad_norm": 1.410425588113753, + "learning_rate": 5.786046238758392e-08, + "loss": 0.2797, + "step": 32856 + }, + { + "epoch": 0.95, + "grad_norm": 1.010582128593081, + "learning_rate": 5.7789233597970175e-08, + "loss": 0.655, + "step": 32857 + }, + { + "epoch": 0.95, + "grad_norm": 1.3459915888965561, + "learning_rate": 5.7718048423421325e-08, + "loss": 0.2754, + "step": 32858 + }, + { + "epoch": 0.95, + "grad_norm": 1.3522359673470967, + "learning_rate": 5.76469068645652e-08, + "loss": 0.2595, + "step": 32859 + }, + { + "epoch": 0.95, + "grad_norm": 1.642389795904602, + "learning_rate": 5.757580892203019e-08, + "loss": 0.2628, + "step": 32860 + }, + { + "epoch": 0.95, + "grad_norm": 1.2469516896075188, + "learning_rate": 5.7504754596443556e-08, + "loss": 0.2751, + "step": 32861 + }, + { + "epoch": 0.95, + "grad_norm": 1.2994714368117288, + "learning_rate": 5.743374388843259e-08, + "loss": 0.2565, + "step": 32862 + }, + { + "epoch": 0.95, + "grad_norm": 1.3843589591934646, + "learning_rate": 5.73627767986229e-08, + "loss": 0.2625, + "step": 32863 + }, + { + "epoch": 0.95, + "grad_norm": 1.428158927694273, + "learning_rate": 5.729185332764176e-08, + "loss": 0.2631, + "step": 32864 + }, + { + "epoch": 0.95, + "grad_norm": 1.803918201582286, + "learning_rate": 5.722097347611533e-08, + "loss": 0.2752, + "step": 32865 + }, + { + "epoch": 0.95, + "grad_norm": 1.3985800070410799, + "learning_rate": 5.715013724466867e-08, + "loss": 0.2711, + "step": 32866 + }, + { + "epoch": 0.95, + "grad_norm": 1.79675602459159, + "learning_rate": 5.707934463392628e-08, + "loss": 0.27, + "step": 32867 + }, + { + "epoch": 0.95, + "grad_norm": 1.2771916191159534, + "learning_rate": 5.700859564451377e-08, + "loss": 0.2582, + "step": 32868 + }, + { + "epoch": 0.95, + "grad_norm": 1.4510043845958627, + "learning_rate": 5.6937890277055095e-08, + "loss": 0.2939, + "step": 32869 + }, + { + "epoch": 0.95, + "grad_norm": 1.3211569107898955, + "learning_rate": 5.686722853217474e-08, + "loss": 0.2637, + "step": 32870 + }, + { + "epoch": 0.95, + "grad_norm": 1.2926787338882761, + "learning_rate": 5.6796610410495556e-08, + "loss": 0.2485, + "step": 32871 + }, + { + "epoch": 0.95, + "grad_norm": 1.430875230818443, + "learning_rate": 5.6726035912641475e-08, + "loss": 0.2972, + "step": 32872 + }, + { + "epoch": 0.95, + "grad_norm": 1.3544961240939448, + "learning_rate": 5.6655505039234783e-08, + "loss": 0.2668, + "step": 32873 + }, + { + "epoch": 0.95, + "grad_norm": 1.3743622825171666, + "learning_rate": 5.658501779089831e-08, + "loss": 0.2864, + "step": 32874 + }, + { + "epoch": 0.95, + "grad_norm": 1.423802368615065, + "learning_rate": 5.651457416825323e-08, + "loss": 0.2671, + "step": 32875 + }, + { + "epoch": 0.95, + "grad_norm": 1.481430826311472, + "learning_rate": 5.6444174171922385e-08, + "loss": 0.2688, + "step": 32876 + }, + { + "epoch": 0.95, + "grad_norm": 1.2149090540935148, + "learning_rate": 5.637381780252638e-08, + "loss": 0.2504, + "step": 32877 + }, + { + "epoch": 0.95, + "grad_norm": 1.4125776946754784, + "learning_rate": 5.630350506068638e-08, + "loss": 0.2719, + "step": 32878 + }, + { + "epoch": 0.95, + "grad_norm": 1.6701549893166476, + "learning_rate": 5.623323594702301e-08, + "loss": 0.2927, + "step": 32879 + }, + { + "epoch": 0.95, + "grad_norm": 1.3135408688649572, + "learning_rate": 5.6163010462155775e-08, + "loss": 0.2598, + "step": 32880 + }, + { + "epoch": 0.95, + "grad_norm": 1.2391605154532785, + "learning_rate": 5.609282860670473e-08, + "loss": 0.2574, + "step": 32881 + }, + { + "epoch": 0.95, + "grad_norm": 1.3440508644855342, + "learning_rate": 5.6022690381289934e-08, + "loss": 0.2656, + "step": 32882 + }, + { + "epoch": 0.95, + "grad_norm": 1.324085183292711, + "learning_rate": 5.595259578652923e-08, + "loss": 0.2548, + "step": 32883 + }, + { + "epoch": 0.95, + "grad_norm": 1.7954987347298383, + "learning_rate": 5.588254482304212e-08, + "loss": 0.2582, + "step": 32884 + }, + { + "epoch": 0.95, + "grad_norm": 1.3559918652748613, + "learning_rate": 5.581253749144588e-08, + "loss": 0.282, + "step": 32885 + }, + { + "epoch": 0.95, + "grad_norm": 0.957950891440137, + "learning_rate": 5.5742573792359476e-08, + "loss": 0.5396, + "step": 32886 + }, + { + "epoch": 0.95, + "grad_norm": 1.3019709026725612, + "learning_rate": 5.5672653726399074e-08, + "loss": 0.2565, + "step": 32887 + }, + { + "epoch": 0.95, + "grad_norm": 1.3743903390002525, + "learning_rate": 5.560277729418251e-08, + "loss": 0.2754, + "step": 32888 + }, + { + "epoch": 0.95, + "grad_norm": 1.5527819047140223, + "learning_rate": 5.553294449632651e-08, + "loss": 0.273, + "step": 32889 + }, + { + "epoch": 0.95, + "grad_norm": 1.6279081226050875, + "learning_rate": 5.546315533344726e-08, + "loss": 0.2483, + "step": 32890 + }, + { + "epoch": 0.95, + "grad_norm": 1.2722056207262256, + "learning_rate": 5.539340980616037e-08, + "loss": 0.2569, + "step": 32891 + }, + { + "epoch": 0.95, + "grad_norm": 1.320403492990033, + "learning_rate": 5.53237079150809e-08, + "loss": 0.2641, + "step": 32892 + }, + { + "epoch": 0.95, + "grad_norm": 1.305597982773517, + "learning_rate": 5.525404966082559e-08, + "loss": 0.2647, + "step": 32893 + }, + { + "epoch": 0.95, + "grad_norm": 1.2891415804471713, + "learning_rate": 5.518443504400783e-08, + "loss": 0.2615, + "step": 32894 + }, + { + "epoch": 0.95, + "grad_norm": 1.436897954145508, + "learning_rate": 5.511486406524158e-08, + "loss": 0.2632, + "step": 32895 + }, + { + "epoch": 0.95, + "grad_norm": 1.2115415333161517, + "learning_rate": 5.50453367251419e-08, + "loss": 0.2577, + "step": 32896 + }, + { + "epoch": 0.95, + "grad_norm": 1.3468082625900557, + "learning_rate": 5.497585302432218e-08, + "loss": 0.2433, + "step": 32897 + }, + { + "epoch": 0.95, + "grad_norm": 1.3609611180741576, + "learning_rate": 5.490641296339527e-08, + "loss": 0.2548, + "step": 32898 + }, + { + "epoch": 0.95, + "grad_norm": 2.2957296570927754, + "learning_rate": 5.483701654297402e-08, + "loss": 0.2552, + "step": 32899 + }, + { + "epoch": 0.95, + "grad_norm": 1.8391052881588699, + "learning_rate": 5.4767663763671266e-08, + "loss": 0.2569, + "step": 32900 + }, + { + "epoch": 0.95, + "grad_norm": 1.2315628509971488, + "learning_rate": 5.4698354626098735e-08, + "loss": 0.2521, + "step": 32901 + }, + { + "epoch": 0.95, + "grad_norm": 1.6812563866218968, + "learning_rate": 5.4629089130868176e-08, + "loss": 0.2829, + "step": 32902 + }, + { + "epoch": 0.95, + "grad_norm": 1.0207912581999876, + "learning_rate": 5.4559867278590196e-08, + "loss": 0.622, + "step": 32903 + }, + { + "epoch": 0.95, + "grad_norm": 1.3324633709696712, + "learning_rate": 5.449068906987654e-08, + "loss": 0.2636, + "step": 32904 + }, + { + "epoch": 0.95, + "grad_norm": 0.9199959361527489, + "learning_rate": 5.442155450533726e-08, + "loss": 0.5898, + "step": 32905 + }, + { + "epoch": 0.95, + "grad_norm": 1.286753069925469, + "learning_rate": 5.4352463585582994e-08, + "loss": 0.2415, + "step": 32906 + }, + { + "epoch": 0.95, + "grad_norm": 0.9523161852246375, + "learning_rate": 5.428341631122269e-08, + "loss": 0.5549, + "step": 32907 + }, + { + "epoch": 0.95, + "grad_norm": 7.76257454552149, + "learning_rate": 5.4214412682866425e-08, + "loss": 0.2521, + "step": 32908 + }, + { + "epoch": 0.95, + "grad_norm": 1.2702205104535282, + "learning_rate": 5.414545270112259e-08, + "loss": 0.2481, + "step": 32909 + }, + { + "epoch": 0.95, + "grad_norm": 1.2543633775250869, + "learning_rate": 5.407653636660071e-08, + "loss": 0.2654, + "step": 32910 + }, + { + "epoch": 0.95, + "grad_norm": 1.2991618085743295, + "learning_rate": 5.400766367990751e-08, + "loss": 0.2801, + "step": 32911 + }, + { + "epoch": 0.95, + "grad_norm": 1.2250186078874696, + "learning_rate": 5.3938834641651946e-08, + "loss": 0.279, + "step": 32912 + }, + { + "epoch": 0.95, + "grad_norm": 1.3061184216731758, + "learning_rate": 5.387004925244077e-08, + "loss": 0.2795, + "step": 32913 + }, + { + "epoch": 0.95, + "grad_norm": 1.384532142338974, + "learning_rate": 5.3801307512881264e-08, + "loss": 0.274, + "step": 32914 + }, + { + "epoch": 0.95, + "grad_norm": 1.231237575798653, + "learning_rate": 5.373260942358016e-08, + "loss": 0.2569, + "step": 32915 + }, + { + "epoch": 0.95, + "grad_norm": 1.3092282885641497, + "learning_rate": 5.366395498514365e-08, + "loss": 0.3069, + "step": 32916 + }, + { + "epoch": 0.95, + "grad_norm": 1.5618279107420463, + "learning_rate": 5.3595344198177356e-08, + "loss": 0.297, + "step": 32917 + }, + { + "epoch": 0.95, + "grad_norm": 1.2049485416802408, + "learning_rate": 5.352677706328746e-08, + "loss": 0.258, + "step": 32918 + }, + { + "epoch": 0.95, + "grad_norm": 1.420322989873817, + "learning_rate": 5.345825358107848e-08, + "loss": 0.2841, + "step": 32919 + }, + { + "epoch": 0.95, + "grad_norm": 1.3076851705542436, + "learning_rate": 5.3389773752155484e-08, + "loss": 0.2482, + "step": 32920 + }, + { + "epoch": 0.95, + "grad_norm": 1.4882761669030313, + "learning_rate": 5.3321337577122434e-08, + "loss": 0.2787, + "step": 32921 + }, + { + "epoch": 0.95, + "grad_norm": 1.6227253035376221, + "learning_rate": 5.3252945056583296e-08, + "loss": 0.2642, + "step": 32922 + }, + { + "epoch": 0.95, + "grad_norm": 1.3394142876002069, + "learning_rate": 5.3184596191142024e-08, + "loss": 0.2686, + "step": 32923 + }, + { + "epoch": 0.95, + "grad_norm": 1.72539944383242, + "learning_rate": 5.311629098140092e-08, + "loss": 0.2596, + "step": 32924 + }, + { + "epoch": 0.95, + "grad_norm": 1.669391554239256, + "learning_rate": 5.3048029427963945e-08, + "loss": 0.2849, + "step": 32925 + }, + { + "epoch": 0.96, + "grad_norm": 1.3746567011778108, + "learning_rate": 5.297981153143339e-08, + "loss": 0.2849, + "step": 32926 + }, + { + "epoch": 0.96, + "grad_norm": 2.2571165830570394, + "learning_rate": 5.291163729241044e-08, + "loss": 0.2834, + "step": 32927 + }, + { + "epoch": 0.96, + "grad_norm": 1.4190695591562343, + "learning_rate": 5.284350671149685e-08, + "loss": 0.2681, + "step": 32928 + }, + { + "epoch": 0.96, + "grad_norm": 1.5560929835891593, + "learning_rate": 5.27754197892949e-08, + "loss": 0.2729, + "step": 32929 + }, + { + "epoch": 0.96, + "grad_norm": 1.4918475130650244, + "learning_rate": 5.270737652640412e-08, + "loss": 0.2748, + "step": 32930 + }, + { + "epoch": 0.96, + "grad_norm": 2.3768412075905263, + "learning_rate": 5.2639376923426246e-08, + "loss": 0.2585, + "step": 32931 + }, + { + "epoch": 0.96, + "grad_norm": 1.3404193163751708, + "learning_rate": 5.257142098096024e-08, + "loss": 0.2808, + "step": 32932 + }, + { + "epoch": 0.96, + "grad_norm": 1.4290690611861296, + "learning_rate": 5.2503508699606744e-08, + "loss": 0.265, + "step": 32933 + }, + { + "epoch": 0.96, + "grad_norm": 0.9219670993302623, + "learning_rate": 5.243564007996416e-08, + "loss": 0.5572, + "step": 32934 + }, + { + "epoch": 0.96, + "grad_norm": 1.3184644040179903, + "learning_rate": 5.236781512263201e-08, + "loss": 0.2505, + "step": 32935 + }, + { + "epoch": 0.96, + "grad_norm": 1.9103636662000836, + "learning_rate": 5.2300033828209254e-08, + "loss": 0.2521, + "step": 32936 + }, + { + "epoch": 0.96, + "grad_norm": 1.2496344582793388, + "learning_rate": 5.22322961972932e-08, + "loss": 0.2652, + "step": 32937 + }, + { + "epoch": 0.96, + "grad_norm": 1.8112470348680008, + "learning_rate": 5.2164602230482254e-08, + "loss": 0.2483, + "step": 32938 + }, + { + "epoch": 0.96, + "grad_norm": 1.8977405843164217, + "learning_rate": 5.209695192837372e-08, + "loss": 0.2591, + "step": 32939 + }, + { + "epoch": 0.96, + "grad_norm": 1.3056120604034904, + "learning_rate": 5.2029345291563785e-08, + "loss": 0.2958, + "step": 32940 + }, + { + "epoch": 0.96, + "grad_norm": 1.4646422958138356, + "learning_rate": 5.19617823206503e-08, + "loss": 0.2745, + "step": 32941 + }, + { + "epoch": 0.96, + "grad_norm": 0.9563975458922994, + "learning_rate": 5.189426301622891e-08, + "loss": 0.5143, + "step": 32942 + }, + { + "epoch": 0.96, + "grad_norm": 1.4557155878686914, + "learning_rate": 5.182678737889524e-08, + "loss": 0.2897, + "step": 32943 + }, + { + "epoch": 0.96, + "grad_norm": 1.7348304094363585, + "learning_rate": 5.175935540924548e-08, + "loss": 0.2752, + "step": 32944 + }, + { + "epoch": 0.96, + "grad_norm": 1.2912400553899, + "learning_rate": 5.169196710787416e-08, + "loss": 0.2765, + "step": 32945 + }, + { + "epoch": 0.96, + "grad_norm": 1.4400953677331743, + "learning_rate": 5.162462247537636e-08, + "loss": 0.2651, + "step": 32946 + }, + { + "epoch": 0.96, + "grad_norm": 1.3452947920829166, + "learning_rate": 5.155732151234549e-08, + "loss": 0.2776, + "step": 32947 + }, + { + "epoch": 0.96, + "grad_norm": 1.352892032560645, + "learning_rate": 5.1490064219377186e-08, + "loss": 0.2813, + "step": 32948 + }, + { + "epoch": 0.96, + "grad_norm": 1.2805437006908125, + "learning_rate": 5.14228505970632e-08, + "loss": 0.2404, + "step": 32949 + }, + { + "epoch": 0.96, + "grad_norm": 1.8658154330575165, + "learning_rate": 5.1355680645997496e-08, + "loss": 0.2786, + "step": 32950 + }, + { + "epoch": 0.96, + "grad_norm": 1.743666823839135, + "learning_rate": 5.1288554366772936e-08, + "loss": 0.2553, + "step": 32951 + }, + { + "epoch": 0.96, + "grad_norm": 1.2560936732720842, + "learning_rate": 5.122147175998182e-08, + "loss": 0.2581, + "step": 32952 + }, + { + "epoch": 0.96, + "grad_norm": 1.3173654190457944, + "learning_rate": 5.11544328262159e-08, + "loss": 0.2566, + "step": 32953 + }, + { + "epoch": 0.96, + "grad_norm": 2.3274637991493887, + "learning_rate": 5.1087437566066934e-08, + "loss": 0.2904, + "step": 32954 + }, + { + "epoch": 0.96, + "grad_norm": 1.436250128371267, + "learning_rate": 5.1020485980126654e-08, + "loss": 0.269, + "step": 32955 + }, + { + "epoch": 0.96, + "grad_norm": 1.30317346453397, + "learning_rate": 5.095357806898571e-08, + "loss": 0.2978, + "step": 32956 + }, + { + "epoch": 0.96, + "grad_norm": 1.3009666465101737, + "learning_rate": 5.0886713833233625e-08, + "loss": 0.2652, + "step": 32957 + }, + { + "epoch": 0.96, + "grad_norm": 1.5459722337063149, + "learning_rate": 5.081989327346159e-08, + "loss": 0.27, + "step": 32958 + }, + { + "epoch": 0.96, + "grad_norm": 1.2600277082965983, + "learning_rate": 5.0753116390258594e-08, + "loss": 0.2613, + "step": 32959 + }, + { + "epoch": 0.96, + "grad_norm": 1.3021154002041493, + "learning_rate": 5.0686383184214705e-08, + "loss": 0.2765, + "step": 32960 + }, + { + "epoch": 0.96, + "grad_norm": 1.196793264414384, + "learning_rate": 5.061969365591779e-08, + "loss": 0.2469, + "step": 32961 + }, + { + "epoch": 0.96, + "grad_norm": 1.3476945714260393, + "learning_rate": 5.0553047805957376e-08, + "loss": 0.3004, + "step": 32962 + }, + { + "epoch": 0.96, + "grad_norm": 1.468948501045609, + "learning_rate": 5.048644563492133e-08, + "loss": 0.2575, + "step": 32963 + }, + { + "epoch": 0.96, + "grad_norm": 1.3750030350998508, + "learning_rate": 5.041988714339696e-08, + "loss": 0.2688, + "step": 32964 + }, + { + "epoch": 0.96, + "grad_norm": 2.8496822756202937, + "learning_rate": 5.035337233197268e-08, + "loss": 0.2808, + "step": 32965 + }, + { + "epoch": 0.96, + "grad_norm": 1.2967251987153814, + "learning_rate": 5.028690120123414e-08, + "loss": 0.2952, + "step": 32966 + }, + { + "epoch": 0.96, + "grad_norm": 1.3463945079812394, + "learning_rate": 5.022047375176864e-08, + "loss": 0.2952, + "step": 32967 + }, + { + "epoch": 0.96, + "grad_norm": 1.3726893967816922, + "learning_rate": 5.015408998416238e-08, + "loss": 0.2902, + "step": 32968 + }, + { + "epoch": 0.96, + "grad_norm": 1.4540160816922827, + "learning_rate": 5.0087749899001556e-08, + "loss": 0.2565, + "step": 32969 + }, + { + "epoch": 0.96, + "grad_norm": 1.2844179375900238, + "learning_rate": 5.002145349687071e-08, + "loss": 0.2662, + "step": 32970 + }, + { + "epoch": 0.96, + "grad_norm": 1.6612205906803548, + "learning_rate": 4.995520077835603e-08, + "loss": 0.2696, + "step": 32971 + }, + { + "epoch": 0.96, + "grad_norm": 1.4641190996270892, + "learning_rate": 4.988899174404094e-08, + "loss": 0.2812, + "step": 32972 + }, + { + "epoch": 0.96, + "grad_norm": 1.2659206348614054, + "learning_rate": 4.982282639451108e-08, + "loss": 0.2771, + "step": 32973 + }, + { + "epoch": 0.96, + "grad_norm": 1.4026613359681022, + "learning_rate": 4.9756704730349326e-08, + "loss": 0.2793, + "step": 32974 + }, + { + "epoch": 0.96, + "grad_norm": 1.3312946645548374, + "learning_rate": 4.96906267521402e-08, + "loss": 0.2667, + "step": 32975 + }, + { + "epoch": 0.96, + "grad_norm": 1.4145073294685921, + "learning_rate": 4.962459246046547e-08, + "loss": 0.253, + "step": 32976 + }, + { + "epoch": 0.96, + "grad_norm": 1.826978965629948, + "learning_rate": 4.955860185590911e-08, + "loss": 0.268, + "step": 32977 + }, + { + "epoch": 0.96, + "grad_norm": 1.4984176862490464, + "learning_rate": 4.949265493905231e-08, + "loss": 0.2649, + "step": 32978 + }, + { + "epoch": 0.96, + "grad_norm": 1.7264811576025325, + "learning_rate": 4.9426751710478504e-08, + "loss": 0.2579, + "step": 32979 + }, + { + "epoch": 0.96, + "grad_norm": 1.2985163582332375, + "learning_rate": 4.9360892170767784e-08, + "loss": 0.2797, + "step": 32980 + }, + { + "epoch": 0.96, + "grad_norm": 1.622398631230221, + "learning_rate": 4.9295076320502454e-08, + "loss": 0.2674, + "step": 32981 + }, + { + "epoch": 0.96, + "grad_norm": 1.9928520172802628, + "learning_rate": 4.922930416026317e-08, + "loss": 0.2647, + "step": 32982 + }, + { + "epoch": 0.96, + "grad_norm": 1.5953311704330648, + "learning_rate": 4.916357569063002e-08, + "loss": 0.2796, + "step": 32983 + }, + { + "epoch": 0.96, + "grad_norm": 1.4202028490218175, + "learning_rate": 4.909789091218364e-08, + "loss": 0.2851, + "step": 32984 + }, + { + "epoch": 0.96, + "grad_norm": 1.3663435735330811, + "learning_rate": 4.903224982550303e-08, + "loss": 0.2906, + "step": 32985 + }, + { + "epoch": 0.96, + "grad_norm": 1.8444649884035977, + "learning_rate": 4.8966652431167715e-08, + "loss": 0.2649, + "step": 32986 + }, + { + "epoch": 0.96, + "grad_norm": 1.8041480887343164, + "learning_rate": 4.890109872975613e-08, + "loss": 0.2597, + "step": 32987 + }, + { + "epoch": 0.96, + "grad_norm": 1.2665838338716102, + "learning_rate": 4.883558872184779e-08, + "loss": 0.2576, + "step": 32988 + }, + { + "epoch": 0.96, + "grad_norm": 1.3564314026994506, + "learning_rate": 4.877012240802004e-08, + "loss": 0.2557, + "step": 32989 + }, + { + "epoch": 0.96, + "grad_norm": 1.349211809111268, + "learning_rate": 4.8704699788851285e-08, + "loss": 0.2428, + "step": 32990 + }, + { + "epoch": 0.96, + "grad_norm": 1.5493684136660113, + "learning_rate": 4.863932086491829e-08, + "loss": 0.2883, + "step": 32991 + }, + { + "epoch": 0.96, + "grad_norm": 2.1209466198737164, + "learning_rate": 4.8573985636798384e-08, + "loss": 0.2788, + "step": 32992 + }, + { + "epoch": 0.96, + "grad_norm": 0.9551542320147762, + "learning_rate": 4.8508694105067756e-08, + "loss": 0.5907, + "step": 32993 + }, + { + "epoch": 0.96, + "grad_norm": 1.257336289613562, + "learning_rate": 4.844344627030262e-08, + "loss": 0.2401, + "step": 32994 + }, + { + "epoch": 0.96, + "grad_norm": 1.3702007288467837, + "learning_rate": 4.8378242133079175e-08, + "loss": 0.2609, + "step": 32995 + }, + { + "epoch": 0.96, + "grad_norm": 1.4705815211000162, + "learning_rate": 4.831308169397309e-08, + "loss": 0.2848, + "step": 32996 + }, + { + "epoch": 0.96, + "grad_norm": 1.6792152992705633, + "learning_rate": 4.8247964953558323e-08, + "loss": 0.2786, + "step": 32997 + }, + { + "epoch": 0.96, + "grad_norm": 1.2942634325497326, + "learning_rate": 4.818289191241055e-08, + "loss": 0.2617, + "step": 32998 + }, + { + "epoch": 0.96, + "grad_norm": 1.2718214619027588, + "learning_rate": 4.8117862571103736e-08, + "loss": 0.251, + "step": 32999 + }, + { + "epoch": 0.96, + "grad_norm": 1.383136977824036, + "learning_rate": 4.8052876930211325e-08, + "loss": 0.2812, + "step": 33000 + }, + { + "epoch": 0.96, + "grad_norm": 1.2930541209032176, + "learning_rate": 4.798793499030785e-08, + "loss": 0.2711, + "step": 33001 + }, + { + "epoch": 0.96, + "grad_norm": 1.4656530998376893, + "learning_rate": 4.792303675196563e-08, + "loss": 0.248, + "step": 33002 + }, + { + "epoch": 0.96, + "grad_norm": 1.6313236570622465, + "learning_rate": 4.785818221575755e-08, + "loss": 0.2817, + "step": 33003 + }, + { + "epoch": 0.96, + "grad_norm": 1.6658975171879147, + "learning_rate": 4.779337138225593e-08, + "loss": 0.2637, + "step": 33004 + }, + { + "epoch": 0.96, + "grad_norm": 1.4231115999379942, + "learning_rate": 4.772860425203252e-08, + "loss": 0.2569, + "step": 33005 + }, + { + "epoch": 0.96, + "grad_norm": 1.397265899381401, + "learning_rate": 4.7663880825659094e-08, + "loss": 0.2813, + "step": 33006 + }, + { + "epoch": 0.96, + "grad_norm": 1.3155379752265144, + "learning_rate": 4.759920110370686e-08, + "loss": 0.2645, + "step": 33007 + }, + { + "epoch": 0.96, + "grad_norm": 1.355647761264109, + "learning_rate": 4.7534565086746476e-08, + "loss": 0.2912, + "step": 33008 + }, + { + "epoch": 0.96, + "grad_norm": 1.3195542759647614, + "learning_rate": 4.7469972775348594e-08, + "loss": 0.2867, + "step": 33009 + }, + { + "epoch": 0.96, + "grad_norm": 1.7075377837305263, + "learning_rate": 4.740542417008331e-08, + "loss": 0.2532, + "step": 33010 + }, + { + "epoch": 0.96, + "grad_norm": 1.4796479963269107, + "learning_rate": 4.7340919271520737e-08, + "loss": 0.2837, + "step": 33011 + }, + { + "epoch": 0.96, + "grad_norm": 1.3148567169169156, + "learning_rate": 4.727645808022874e-08, + "loss": 0.2667, + "step": 33012 + }, + { + "epoch": 0.96, + "grad_norm": 1.374061238390214, + "learning_rate": 4.7212040596776863e-08, + "loss": 0.2653, + "step": 33013 + }, + { + "epoch": 0.96, + "grad_norm": 1.35574359229527, + "learning_rate": 4.714766682173355e-08, + "loss": 0.269, + "step": 33014 + }, + { + "epoch": 0.96, + "grad_norm": 1.4629315922463046, + "learning_rate": 4.708333675566723e-08, + "loss": 0.2582, + "step": 33015 + }, + { + "epoch": 0.96, + "grad_norm": 1.3922862630742099, + "learning_rate": 4.701905039914578e-08, + "loss": 0.2599, + "step": 33016 + }, + { + "epoch": 0.96, + "grad_norm": 1.3566363819707172, + "learning_rate": 4.6954807752735423e-08, + "loss": 0.2704, + "step": 33017 + }, + { + "epoch": 0.96, + "grad_norm": 1.2925071347964527, + "learning_rate": 4.689060881700458e-08, + "loss": 0.2703, + "step": 33018 + }, + { + "epoch": 0.96, + "grad_norm": 0.958881398268842, + "learning_rate": 4.682645359251836e-08, + "loss": 0.5716, + "step": 33019 + }, + { + "epoch": 0.96, + "grad_norm": 1.2971479199888432, + "learning_rate": 4.676234207984465e-08, + "loss": 0.2631, + "step": 33020 + }, + { + "epoch": 0.96, + "grad_norm": 2.6946557195812937, + "learning_rate": 4.669827427954743e-08, + "loss": 0.2603, + "step": 33021 + }, + { + "epoch": 0.96, + "grad_norm": 1.4606343635052645, + "learning_rate": 4.663425019219292e-08, + "loss": 0.2645, + "step": 33022 + }, + { + "epoch": 0.96, + "grad_norm": 1.357827377017353, + "learning_rate": 4.657026981834623e-08, + "loss": 0.2442, + "step": 33023 + }, + { + "epoch": 0.96, + "grad_norm": 1.268217926220686, + "learning_rate": 4.650633315857189e-08, + "loss": 0.2463, + "step": 33024 + }, + { + "epoch": 0.96, + "grad_norm": 1.274085898772469, + "learning_rate": 4.6442440213433914e-08, + "loss": 0.2556, + "step": 33025 + }, + { + "epoch": 0.96, + "grad_norm": 1.2932360902674953, + "learning_rate": 4.637859098349684e-08, + "loss": 0.2701, + "step": 33026 + }, + { + "epoch": 0.96, + "grad_norm": 1.4009841469811195, + "learning_rate": 4.6314785469323e-08, + "loss": 0.2658, + "step": 33027 + }, + { + "epoch": 0.96, + "grad_norm": 1.2423303539123258, + "learning_rate": 4.625102367147694e-08, + "loss": 0.2763, + "step": 33028 + }, + { + "epoch": 0.96, + "grad_norm": 1.4341407885241828, + "learning_rate": 4.618730559051987e-08, + "loss": 0.2822, + "step": 33029 + }, + { + "epoch": 0.96, + "grad_norm": 1.2618059939448143, + "learning_rate": 4.612363122701524e-08, + "loss": 0.2754, + "step": 33030 + }, + { + "epoch": 0.96, + "grad_norm": 1.4132943535116351, + "learning_rate": 4.6060000581524266e-08, + "loss": 0.2867, + "step": 33031 + }, + { + "epoch": 0.96, + "grad_norm": 1.1837982014815756, + "learning_rate": 4.599641365460872e-08, + "loss": 0.2706, + "step": 33032 + }, + { + "epoch": 0.96, + "grad_norm": 0.9350887844005151, + "learning_rate": 4.5932870446829815e-08, + "loss": 0.5801, + "step": 33033 + }, + { + "epoch": 0.96, + "grad_norm": 1.9050022803493287, + "learning_rate": 4.586937095874821e-08, + "loss": 0.2877, + "step": 33034 + }, + { + "epoch": 0.96, + "grad_norm": 1.3351387343581254, + "learning_rate": 4.580591519092459e-08, + "loss": 0.2635, + "step": 33035 + }, + { + "epoch": 0.96, + "grad_norm": 1.3621780245045838, + "learning_rate": 4.574250314391848e-08, + "loss": 0.2482, + "step": 33036 + }, + { + "epoch": 0.96, + "grad_norm": 1.2673152817357511, + "learning_rate": 4.567913481828945e-08, + "loss": 0.2671, + "step": 33037 + }, + { + "epoch": 0.96, + "grad_norm": 1.4067784042650298, + "learning_rate": 4.56158102145976e-08, + "loss": 0.2587, + "step": 33038 + }, + { + "epoch": 0.96, + "grad_norm": 1.6566390317652127, + "learning_rate": 4.5552529333401376e-08, + "loss": 0.2717, + "step": 33039 + }, + { + "epoch": 0.96, + "grad_norm": 1.2866345730634967, + "learning_rate": 4.548929217525866e-08, + "loss": 0.27, + "step": 33040 + }, + { + "epoch": 0.96, + "grad_norm": 1.4044161550590588, + "learning_rate": 4.5426098740727345e-08, + "loss": 0.2795, + "step": 33041 + }, + { + "epoch": 0.96, + "grad_norm": 1.3466141773246563, + "learning_rate": 4.536294903036642e-08, + "loss": 0.2637, + "step": 33042 + }, + { + "epoch": 0.96, + "grad_norm": 1.3054974343852226, + "learning_rate": 4.5299843044732116e-08, + "loss": 0.2666, + "step": 33043 + }, + { + "epoch": 0.96, + "grad_norm": 1.4076438028645653, + "learning_rate": 4.5236780784381764e-08, + "loss": 0.2828, + "step": 33044 + }, + { + "epoch": 0.96, + "grad_norm": 1.3087326062668672, + "learning_rate": 4.517376224987158e-08, + "loss": 0.2658, + "step": 33045 + }, + { + "epoch": 0.96, + "grad_norm": 1.2769290827529363, + "learning_rate": 4.511078744175834e-08, + "loss": 0.2763, + "step": 33046 + }, + { + "epoch": 0.96, + "grad_norm": 1.2286661779862678, + "learning_rate": 4.504785636059717e-08, + "loss": 0.2413, + "step": 33047 + }, + { + "epoch": 0.96, + "grad_norm": 1.7754283997088072, + "learning_rate": 4.4984969006943714e-08, + "loss": 0.2634, + "step": 33048 + }, + { + "epoch": 0.96, + "grad_norm": 1.3630112184766336, + "learning_rate": 4.49221253813531e-08, + "loss": 0.2774, + "step": 33049 + }, + { + "epoch": 0.96, + "grad_norm": 1.3555351121091945, + "learning_rate": 4.4859325484379324e-08, + "loss": 0.2678, + "step": 33050 + }, + { + "epoch": 0.96, + "grad_norm": 1.3712008486821596, + "learning_rate": 4.479656931657694e-08, + "loss": 0.2783, + "step": 33051 + }, + { + "epoch": 0.96, + "grad_norm": 1.26842445004355, + "learning_rate": 4.47338568784994e-08, + "loss": 0.2629, + "step": 33052 + }, + { + "epoch": 0.96, + "grad_norm": 1.3471196223457378, + "learning_rate": 4.467118817070126e-08, + "loss": 0.2422, + "step": 33053 + }, + { + "epoch": 0.96, + "grad_norm": 1.3019581348597717, + "learning_rate": 4.4608563193734854e-08, + "loss": 0.276, + "step": 33054 + }, + { + "epoch": 0.96, + "grad_norm": 1.2645290297124692, + "learning_rate": 4.454598194815252e-08, + "loss": 0.2776, + "step": 33055 + }, + { + "epoch": 0.96, + "grad_norm": 1.3256915290503217, + "learning_rate": 4.448344443450714e-08, + "loss": 0.2569, + "step": 33056 + }, + { + "epoch": 0.96, + "grad_norm": 1.4186698968730693, + "learning_rate": 4.4420950653349945e-08, + "loss": 0.2832, + "step": 33057 + }, + { + "epoch": 0.96, + "grad_norm": 1.2718083461629586, + "learning_rate": 4.435850060523328e-08, + "loss": 0.2652, + "step": 33058 + }, + { + "epoch": 0.96, + "grad_norm": 1.4282371823490427, + "learning_rate": 4.42960942907078e-08, + "loss": 0.264, + "step": 33059 + }, + { + "epoch": 0.96, + "grad_norm": 0.9431517466167716, + "learning_rate": 4.423373171032419e-08, + "loss": 0.5702, + "step": 33060 + }, + { + "epoch": 0.96, + "grad_norm": 1.2475533974478568, + "learning_rate": 4.4171412864632556e-08, + "loss": 0.2478, + "step": 33061 + }, + { + "epoch": 0.96, + "grad_norm": 0.9400348814237824, + "learning_rate": 4.410913775418302e-08, + "loss": 0.5634, + "step": 33062 + }, + { + "epoch": 0.96, + "grad_norm": 1.2920729313860546, + "learning_rate": 4.404690637952569e-08, + "loss": 0.2875, + "step": 33063 + }, + { + "epoch": 0.96, + "grad_norm": 1.529714008481616, + "learning_rate": 4.398471874120958e-08, + "loss": 0.2915, + "step": 33064 + }, + { + "epoch": 0.96, + "grad_norm": 1.0551133023214678, + "learning_rate": 4.3922574839783125e-08, + "loss": 0.5479, + "step": 33065 + }, + { + "epoch": 0.96, + "grad_norm": 1.2936715013757871, + "learning_rate": 4.38604746757948e-08, + "loss": 0.2593, + "step": 33066 + }, + { + "epoch": 0.96, + "grad_norm": 1.3962498099553513, + "learning_rate": 4.379841824979303e-08, + "loss": 0.2686, + "step": 33067 + }, + { + "epoch": 0.96, + "grad_norm": 1.9413419939862977, + "learning_rate": 4.373640556232517e-08, + "loss": 0.2697, + "step": 33068 + }, + { + "epoch": 0.96, + "grad_norm": 1.310159364826064, + "learning_rate": 4.3674436613938e-08, + "loss": 0.276, + "step": 33069 + }, + { + "epoch": 0.96, + "grad_norm": 1.0039477748996728, + "learning_rate": 4.3612511405179416e-08, + "loss": 0.5728, + "step": 33070 + }, + { + "epoch": 0.96, + "grad_norm": 1.5822447510100293, + "learning_rate": 4.355062993659509e-08, + "loss": 0.2849, + "step": 33071 + }, + { + "epoch": 0.96, + "grad_norm": 1.859358183222589, + "learning_rate": 4.3488792208731814e-08, + "loss": 0.2849, + "step": 33072 + }, + { + "epoch": 0.96, + "grad_norm": 1.3762100512609303, + "learning_rate": 4.3426998222134697e-08, + "loss": 0.2763, + "step": 33073 + }, + { + "epoch": 0.96, + "grad_norm": 1.3795408256811448, + "learning_rate": 4.3365247977349424e-08, + "loss": 0.283, + "step": 33074 + }, + { + "epoch": 0.96, + "grad_norm": 1.3260435620381796, + "learning_rate": 4.330354147492111e-08, + "loss": 0.2581, + "step": 33075 + }, + { + "epoch": 0.96, + "grad_norm": 1.4212408487847679, + "learning_rate": 4.3241878715393206e-08, + "loss": 0.2462, + "step": 33076 + }, + { + "epoch": 0.96, + "grad_norm": 1.5640046987190799, + "learning_rate": 4.318025969931139e-08, + "loss": 0.2913, + "step": 33077 + }, + { + "epoch": 0.96, + "grad_norm": 1.3440141404465793, + "learning_rate": 4.311868442721856e-08, + "loss": 0.2665, + "step": 33078 + }, + { + "epoch": 0.96, + "grad_norm": 1.2736829935925065, + "learning_rate": 4.305715289965817e-08, + "loss": 0.2707, + "step": 33079 + }, + { + "epoch": 0.96, + "grad_norm": 1.4770992794332027, + "learning_rate": 4.299566511717368e-08, + "loss": 0.2739, + "step": 33080 + }, + { + "epoch": 0.96, + "grad_norm": 1.352281264183495, + "learning_rate": 4.293422108030687e-08, + "loss": 0.2528, + "step": 33081 + }, + { + "epoch": 0.96, + "grad_norm": 0.8937184771820939, + "learning_rate": 4.28728207896012e-08, + "loss": 0.5538, + "step": 33082 + }, + { + "epoch": 0.96, + "grad_norm": 1.3443469571749105, + "learning_rate": 4.28114642455979e-08, + "loss": 0.2735, + "step": 33083 + }, + { + "epoch": 0.96, + "grad_norm": 1.458121303321487, + "learning_rate": 4.275015144883821e-08, + "loss": 0.2537, + "step": 33084 + }, + { + "epoch": 0.96, + "grad_norm": 1.4797629238097447, + "learning_rate": 4.268888239986335e-08, + "loss": 0.2869, + "step": 33085 + }, + { + "epoch": 0.96, + "grad_norm": 1.3605917231125626, + "learning_rate": 4.262765709921457e-08, + "loss": 0.2606, + "step": 33086 + }, + { + "epoch": 0.96, + "grad_norm": 1.5312891137637268, + "learning_rate": 4.256647554743088e-08, + "loss": 0.2634, + "step": 33087 + }, + { + "epoch": 0.96, + "grad_norm": 1.3205337455373793, + "learning_rate": 4.2505337745053496e-08, + "loss": 0.2898, + "step": 33088 + }, + { + "epoch": 0.96, + "grad_norm": 1.2731044974726138, + "learning_rate": 4.2444243692621456e-08, + "loss": 0.2682, + "step": 33089 + }, + { + "epoch": 0.96, + "grad_norm": 1.323920750626334, + "learning_rate": 4.238319339067376e-08, + "loss": 0.2756, + "step": 33090 + }, + { + "epoch": 0.96, + "grad_norm": 1.3281533172122344, + "learning_rate": 4.232218683974998e-08, + "loss": 0.2855, + "step": 33091 + }, + { + "epoch": 0.96, + "grad_norm": 1.3362288917251723, + "learning_rate": 4.226122404038746e-08, + "loss": 0.2809, + "step": 33092 + }, + { + "epoch": 0.96, + "grad_norm": 1.6389785565118884, + "learning_rate": 4.220030499312466e-08, + "loss": 0.2797, + "step": 33093 + }, + { + "epoch": 0.96, + "grad_norm": 1.2761193165876683, + "learning_rate": 4.2139429698498936e-08, + "loss": 0.2583, + "step": 33094 + }, + { + "epoch": 0.96, + "grad_norm": 1.0067658791746135, + "learning_rate": 4.207859815704818e-08, + "loss": 0.5484, + "step": 33095 + }, + { + "epoch": 0.96, + "grad_norm": 1.3890428983254681, + "learning_rate": 4.201781036930863e-08, + "loss": 0.2797, + "step": 33096 + }, + { + "epoch": 0.96, + "grad_norm": 1.4613421799748727, + "learning_rate": 4.195706633581709e-08, + "loss": 0.2704, + "step": 33097 + }, + { + "epoch": 0.96, + "grad_norm": 1.273999072007282, + "learning_rate": 4.189636605710867e-08, + "loss": 0.2474, + "step": 33098 + }, + { + "epoch": 0.96, + "grad_norm": 0.92329122194919, + "learning_rate": 4.1835709533720736e-08, + "loss": 0.5494, + "step": 33099 + }, + { + "epoch": 0.96, + "grad_norm": 1.319709529512635, + "learning_rate": 4.17750967661873e-08, + "loss": 0.2636, + "step": 33100 + }, + { + "epoch": 0.96, + "grad_norm": 1.6597147638493808, + "learning_rate": 4.171452775504348e-08, + "loss": 0.2858, + "step": 33101 + }, + { + "epoch": 0.96, + "grad_norm": 1.3914978143942365, + "learning_rate": 4.165400250082441e-08, + "loss": 0.282, + "step": 33102 + }, + { + "epoch": 0.96, + "grad_norm": 1.3597661739008509, + "learning_rate": 4.159352100406355e-08, + "loss": 0.2733, + "step": 33103 + }, + { + "epoch": 0.96, + "grad_norm": 1.3408028173808926, + "learning_rate": 4.1533083265294374e-08, + "loss": 0.3012, + "step": 33104 + }, + { + "epoch": 0.96, + "grad_norm": 1.3131123234758713, + "learning_rate": 4.1472689285051436e-08, + "loss": 0.2777, + "step": 33105 + }, + { + "epoch": 0.96, + "grad_norm": 0.9279298751607952, + "learning_rate": 4.141233906386655e-08, + "loss": 0.5378, + "step": 33106 + }, + { + "epoch": 0.96, + "grad_norm": 1.8390842096744957, + "learning_rate": 4.1352032602273164e-08, + "loss": 0.2543, + "step": 33107 + }, + { + "epoch": 0.96, + "grad_norm": 1.3786047536857333, + "learning_rate": 4.129176990080308e-08, + "loss": 0.2912, + "step": 33108 + }, + { + "epoch": 0.96, + "grad_norm": 1.2877530082915474, + "learning_rate": 4.123155095998754e-08, + "loss": 0.2675, + "step": 33109 + }, + { + "epoch": 0.96, + "grad_norm": 1.4576090244128566, + "learning_rate": 4.117137578035946e-08, + "loss": 0.2587, + "step": 33110 + }, + { + "epoch": 0.96, + "grad_norm": 1.4463888246214298, + "learning_rate": 4.11112443624484e-08, + "loss": 0.2557, + "step": 33111 + }, + { + "epoch": 0.96, + "grad_norm": 1.27213499618191, + "learning_rate": 4.105115670678617e-08, + "loss": 0.2643, + "step": 33112 + }, + { + "epoch": 0.96, + "grad_norm": 1.4641211482487606, + "learning_rate": 4.099111281390178e-08, + "loss": 0.2933, + "step": 33113 + }, + { + "epoch": 0.96, + "grad_norm": 1.373035485561062, + "learning_rate": 4.0931112684325926e-08, + "loss": 0.2709, + "step": 33114 + }, + { + "epoch": 0.96, + "grad_norm": 1.549346047842831, + "learning_rate": 4.087115631858873e-08, + "loss": 0.2597, + "step": 33115 + }, + { + "epoch": 0.96, + "grad_norm": 1.6194921222465049, + "learning_rate": 4.0811243717218116e-08, + "loss": 0.2719, + "step": 33116 + }, + { + "epoch": 0.96, + "grad_norm": 1.4171160681071129, + "learning_rate": 4.07513748807431e-08, + "loss": 0.278, + "step": 33117 + }, + { + "epoch": 0.96, + "grad_norm": 1.8684830713206675, + "learning_rate": 4.06915498096927e-08, + "loss": 0.3187, + "step": 33118 + }, + { + "epoch": 0.96, + "grad_norm": 1.2484658165099667, + "learning_rate": 4.0631768504593715e-08, + "loss": 0.2577, + "step": 33119 + }, + { + "epoch": 0.96, + "grad_norm": 1.4085255247524453, + "learning_rate": 4.057203096597462e-08, + "loss": 0.2498, + "step": 33120 + }, + { + "epoch": 0.96, + "grad_norm": 1.3737454009094108, + "learning_rate": 4.051233719436276e-08, + "loss": 0.266, + "step": 33121 + }, + { + "epoch": 0.96, + "grad_norm": 1.2434175081078707, + "learning_rate": 4.045268719028439e-08, + "loss": 0.2766, + "step": 33122 + }, + { + "epoch": 0.96, + "grad_norm": 1.3814700559956574, + "learning_rate": 4.039308095426631e-08, + "loss": 0.2905, + "step": 33123 + }, + { + "epoch": 0.96, + "grad_norm": 1.4333550111886897, + "learning_rate": 4.033351848683364e-08, + "loss": 0.2761, + "step": 33124 + }, + { + "epoch": 0.96, + "grad_norm": 2.2803789184978736, + "learning_rate": 4.0273999788513205e-08, + "loss": 0.3063, + "step": 33125 + }, + { + "epoch": 0.96, + "grad_norm": 1.0184052154022916, + "learning_rate": 4.021452485982957e-08, + "loss": 0.6068, + "step": 33126 + }, + { + "epoch": 0.96, + "grad_norm": 1.2477899320292145, + "learning_rate": 4.0155093701307326e-08, + "loss": 0.2721, + "step": 33127 + }, + { + "epoch": 0.96, + "grad_norm": 1.4270566296010836, + "learning_rate": 4.0095706313472146e-08, + "loss": 0.2699, + "step": 33128 + }, + { + "epoch": 0.96, + "grad_norm": 1.3534070240466358, + "learning_rate": 4.003636269684696e-08, + "loss": 0.2585, + "step": 33129 + }, + { + "epoch": 0.96, + "grad_norm": 1.2486774073732565, + "learning_rate": 3.997706285195635e-08, + "loss": 0.2655, + "step": 33130 + }, + { + "epoch": 0.96, + "grad_norm": 1.3690767025735602, + "learning_rate": 3.991780677932267e-08, + "loss": 0.3166, + "step": 33131 + }, + { + "epoch": 0.96, + "grad_norm": 1.3949700163441279, + "learning_rate": 3.985859447946938e-08, + "loss": 0.2626, + "step": 33132 + }, + { + "epoch": 0.96, + "grad_norm": 1.5462637799812382, + "learning_rate": 3.9799425952919415e-08, + "loss": 0.2691, + "step": 33133 + }, + { + "epoch": 0.96, + "grad_norm": 1.3135006032853271, + "learning_rate": 3.974030120019401e-08, + "loss": 0.2561, + "step": 33134 + }, + { + "epoch": 0.96, + "grad_norm": 2.2501476979015425, + "learning_rate": 3.9681220221816085e-08, + "loss": 0.26, + "step": 33135 + }, + { + "epoch": 0.96, + "grad_norm": 1.3947256809306607, + "learning_rate": 3.962218301830578e-08, + "loss": 0.2717, + "step": 33136 + }, + { + "epoch": 0.96, + "grad_norm": 1.3535015959376433, + "learning_rate": 3.95631895901849e-08, + "loss": 0.2728, + "step": 33137 + }, + { + "epoch": 0.96, + "grad_norm": 1.2699623122248163, + "learning_rate": 3.95042399379747e-08, + "loss": 0.2724, + "step": 33138 + }, + { + "epoch": 0.96, + "grad_norm": 1.4463658821169838, + "learning_rate": 3.944533406219364e-08, + "loss": 0.2742, + "step": 33139 + }, + { + "epoch": 0.96, + "grad_norm": 1.28129260716649, + "learning_rate": 3.9386471963362984e-08, + "loss": 0.2746, + "step": 33140 + }, + { + "epoch": 0.96, + "grad_norm": 1.3927837080467653, + "learning_rate": 3.932765364200175e-08, + "loss": 0.2821, + "step": 33141 + }, + { + "epoch": 0.96, + "grad_norm": 1.3476141067879348, + "learning_rate": 3.9268879098628977e-08, + "loss": 0.2724, + "step": 33142 + }, + { + "epoch": 0.96, + "grad_norm": 1.2763081330389885, + "learning_rate": 3.9210148333763135e-08, + "loss": 0.2592, + "step": 33143 + }, + { + "epoch": 0.96, + "grad_norm": 1.2809537622148641, + "learning_rate": 3.9151461347923246e-08, + "loss": 0.2628, + "step": 33144 + }, + { + "epoch": 0.96, + "grad_norm": 1.3964626065279075, + "learning_rate": 3.909281814162613e-08, + "loss": 0.2502, + "step": 33145 + }, + { + "epoch": 0.96, + "grad_norm": 1.5914680046754819, + "learning_rate": 3.903421871539026e-08, + "loss": 0.2946, + "step": 33146 + }, + { + "epoch": 0.96, + "grad_norm": 1.2713362395791448, + "learning_rate": 3.897566306973244e-08, + "loss": 0.3006, + "step": 33147 + }, + { + "epoch": 0.96, + "grad_norm": 1.853134446885077, + "learning_rate": 3.891715120516948e-08, + "loss": 0.2511, + "step": 33148 + }, + { + "epoch": 0.96, + "grad_norm": 1.277988856980012, + "learning_rate": 3.8858683122217635e-08, + "loss": 0.2621, + "step": 33149 + }, + { + "epoch": 0.96, + "grad_norm": 1.2866633400895755, + "learning_rate": 3.88002588213926e-08, + "loss": 0.2598, + "step": 33150 + }, + { + "epoch": 0.96, + "grad_norm": 1.2005739261201733, + "learning_rate": 3.8741878303210635e-08, + "loss": 0.2707, + "step": 33151 + }, + { + "epoch": 0.96, + "grad_norm": 1.4104960557584192, + "learning_rate": 3.8683541568186875e-08, + "loss": 0.2546, + "step": 33152 + }, + { + "epoch": 0.96, + "grad_norm": 1.2684424991995327, + "learning_rate": 3.862524861683536e-08, + "loss": 0.2708, + "step": 33153 + }, + { + "epoch": 0.96, + "grad_norm": 1.4801406485885735, + "learning_rate": 3.856699944967179e-08, + "loss": 0.2838, + "step": 33154 + }, + { + "epoch": 0.96, + "grad_norm": 1.270442442191336, + "learning_rate": 3.8508794067208535e-08, + "loss": 0.2529, + "step": 33155 + }, + { + "epoch": 0.96, + "grad_norm": 1.5661717362431793, + "learning_rate": 3.8450632469960726e-08, + "loss": 0.2722, + "step": 33156 + }, + { + "epoch": 0.96, + "grad_norm": 1.93031294530721, + "learning_rate": 3.839251465844185e-08, + "loss": 0.2667, + "step": 33157 + }, + { + "epoch": 0.96, + "grad_norm": 1.420746096883508, + "learning_rate": 3.833444063316316e-08, + "loss": 0.2608, + "step": 33158 + }, + { + "epoch": 0.96, + "grad_norm": 1.3470925940882337, + "learning_rate": 3.827641039463814e-08, + "loss": 0.2632, + "step": 33159 + }, + { + "epoch": 0.96, + "grad_norm": 1.4118692915061362, + "learning_rate": 3.821842394337916e-08, + "loss": 0.2505, + "step": 33160 + }, + { + "epoch": 0.96, + "grad_norm": 1.398267390561797, + "learning_rate": 3.816048127989747e-08, + "loss": 0.2739, + "step": 33161 + }, + { + "epoch": 0.96, + "grad_norm": 1.3224243184197733, + "learning_rate": 3.810258240470432e-08, + "loss": 0.3115, + "step": 33162 + }, + { + "epoch": 0.96, + "grad_norm": 1.2938762035325646, + "learning_rate": 3.8044727318310994e-08, + "loss": 0.2579, + "step": 33163 + }, + { + "epoch": 0.96, + "grad_norm": 1.3237553975458354, + "learning_rate": 3.798691602122817e-08, + "loss": 0.2662, + "step": 33164 + }, + { + "epoch": 0.96, + "grad_norm": 1.2554005988443493, + "learning_rate": 3.792914851396601e-08, + "loss": 0.2376, + "step": 33165 + }, + { + "epoch": 0.96, + "grad_norm": 1.6861148706782916, + "learning_rate": 3.787142479703409e-08, + "loss": 0.2601, + "step": 33166 + }, + { + "epoch": 0.96, + "grad_norm": 1.3751627054430775, + "learning_rate": 3.781374487094147e-08, + "loss": 0.2656, + "step": 33167 + }, + { + "epoch": 0.96, + "grad_norm": 1.205353888904531, + "learning_rate": 3.775610873619828e-08, + "loss": 0.2598, + "step": 33168 + }, + { + "epoch": 0.96, + "grad_norm": 1.472350977147475, + "learning_rate": 3.7698516393311904e-08, + "loss": 0.2619, + "step": 33169 + }, + { + "epoch": 0.96, + "grad_norm": 1.3491173795324038, + "learning_rate": 3.764096784279136e-08, + "loss": 0.2641, + "step": 33170 + }, + { + "epoch": 0.96, + "grad_norm": 1.2774426488770272, + "learning_rate": 3.75834630851446e-08, + "loss": 0.2597, + "step": 33171 + }, + { + "epoch": 0.96, + "grad_norm": 1.4536789347659425, + "learning_rate": 3.7526002120878426e-08, + "loss": 0.2937, + "step": 33172 + }, + { + "epoch": 0.96, + "grad_norm": 1.3572917185538609, + "learning_rate": 3.7468584950500764e-08, + "loss": 0.2643, + "step": 33173 + }, + { + "epoch": 0.96, + "grad_norm": 1.295670021020532, + "learning_rate": 3.741121157451732e-08, + "loss": 0.2745, + "step": 33174 + }, + { + "epoch": 0.96, + "grad_norm": 1.2623714355166153, + "learning_rate": 3.735388199343548e-08, + "loss": 0.2629, + "step": 33175 + }, + { + "epoch": 0.96, + "grad_norm": 1.2919668665792354, + "learning_rate": 3.7296596207760384e-08, + "loss": 0.2711, + "step": 33176 + }, + { + "epoch": 0.96, + "grad_norm": 1.449573120951523, + "learning_rate": 3.723935421799829e-08, + "loss": 0.2633, + "step": 33177 + }, + { + "epoch": 0.96, + "grad_norm": 1.598051281235673, + "learning_rate": 3.7182156024653806e-08, + "loss": 0.2647, + "step": 33178 + }, + { + "epoch": 0.96, + "grad_norm": 1.28320098635701, + "learning_rate": 3.712500162823152e-08, + "loss": 0.2645, + "step": 33179 + }, + { + "epoch": 0.96, + "grad_norm": 1.2869338591404922, + "learning_rate": 3.7067891029236024e-08, + "loss": 0.2643, + "step": 33180 + }, + { + "epoch": 0.96, + "grad_norm": 1.580678968378389, + "learning_rate": 3.7010824228171925e-08, + "loss": 0.2914, + "step": 33181 + }, + { + "epoch": 0.96, + "grad_norm": 1.3225187183282572, + "learning_rate": 3.695380122554215e-08, + "loss": 0.2694, + "step": 33182 + }, + { + "epoch": 0.96, + "grad_norm": 1.453029351202423, + "learning_rate": 3.6896822021850187e-08, + "loss": 0.3014, + "step": 33183 + }, + { + "epoch": 0.96, + "grad_norm": 1.3734576177230169, + "learning_rate": 3.683988661759841e-08, + "loss": 0.2748, + "step": 33184 + }, + { + "epoch": 0.96, + "grad_norm": 1.3326197367033257, + "learning_rate": 3.67829950132903e-08, + "loss": 0.2853, + "step": 33185 + }, + { + "epoch": 0.96, + "grad_norm": 1.5311746359613168, + "learning_rate": 3.6726147209427134e-08, + "loss": 0.2709, + "step": 33186 + }, + { + "epoch": 0.96, + "grad_norm": 1.2900489776143862, + "learning_rate": 3.6669343206510725e-08, + "loss": 0.2544, + "step": 33187 + }, + { + "epoch": 0.96, + "grad_norm": 1.2739621491105875, + "learning_rate": 3.6612583005042335e-08, + "loss": 0.2534, + "step": 33188 + }, + { + "epoch": 0.96, + "grad_norm": 1.7513887143614888, + "learning_rate": 3.655586660552324e-08, + "loss": 0.2584, + "step": 33189 + }, + { + "epoch": 0.96, + "grad_norm": 1.3261701154051706, + "learning_rate": 3.6499194008453584e-08, + "loss": 0.265, + "step": 33190 + }, + { + "epoch": 0.96, + "grad_norm": 1.4810455954980737, + "learning_rate": 3.6442565214333536e-08, + "loss": 0.2644, + "step": 33191 + }, + { + "epoch": 0.96, + "grad_norm": 1.5817395249263149, + "learning_rate": 3.638598022366269e-08, + "loss": 0.2838, + "step": 33192 + }, + { + "epoch": 0.96, + "grad_norm": 0.9392446930776004, + "learning_rate": 3.6329439036941196e-08, + "loss": 0.5911, + "step": 33193 + }, + { + "epoch": 0.96, + "grad_norm": 1.3126609546471568, + "learning_rate": 3.6272941654666995e-08, + "loss": 0.2668, + "step": 33194 + }, + { + "epoch": 0.96, + "grad_norm": 2.4694357026434206, + "learning_rate": 3.621648807733913e-08, + "loss": 0.2737, + "step": 33195 + }, + { + "epoch": 0.96, + "grad_norm": 1.2855572376793836, + "learning_rate": 3.6160078305456094e-08, + "loss": 0.2696, + "step": 33196 + }, + { + "epoch": 0.96, + "grad_norm": 1.2699232545290366, + "learning_rate": 3.610371233951526e-08, + "loss": 0.2533, + "step": 33197 + }, + { + "epoch": 0.96, + "grad_norm": 1.3147519127054839, + "learning_rate": 3.6047390180014576e-08, + "loss": 0.255, + "step": 33198 + }, + { + "epoch": 0.96, + "grad_norm": 1.539647845818378, + "learning_rate": 3.59911118274503e-08, + "loss": 0.2676, + "step": 33199 + }, + { + "epoch": 0.96, + "grad_norm": 1.3188926676733275, + "learning_rate": 3.5934877282319815e-08, + "loss": 0.2731, + "step": 33200 + }, + { + "epoch": 0.96, + "grad_norm": 4.960966255316051, + "learning_rate": 3.587868654511883e-08, + "loss": 0.2664, + "step": 33201 + }, + { + "epoch": 0.96, + "grad_norm": 1.3576741223382716, + "learning_rate": 3.5822539616343634e-08, + "loss": 0.2463, + "step": 33202 + }, + { + "epoch": 0.96, + "grad_norm": 1.35114598538717, + "learning_rate": 3.5766436496489365e-08, + "loss": 0.2787, + "step": 33203 + }, + { + "epoch": 0.96, + "grad_norm": 1.7247849075661397, + "learning_rate": 3.57103771860523e-08, + "loss": 0.282, + "step": 33204 + }, + { + "epoch": 0.96, + "grad_norm": 1.5810671118683695, + "learning_rate": 3.5654361685525385e-08, + "loss": 0.2592, + "step": 33205 + }, + { + "epoch": 0.96, + "grad_norm": 1.3741484917449271, + "learning_rate": 3.559838999540377e-08, + "loss": 0.2516, + "step": 33206 + }, + { + "epoch": 0.96, + "grad_norm": 1.800311485851184, + "learning_rate": 3.554246211618151e-08, + "loss": 0.26, + "step": 33207 + }, + { + "epoch": 0.96, + "grad_norm": 1.2973352594092007, + "learning_rate": 3.548657804835209e-08, + "loss": 0.2718, + "step": 33208 + }, + { + "epoch": 0.96, + "grad_norm": 1.4905619307189755, + "learning_rate": 3.543073779240902e-08, + "loss": 0.2842, + "step": 33209 + }, + { + "epoch": 0.96, + "grad_norm": 1.3725850371472768, + "learning_rate": 3.537494134884467e-08, + "loss": 0.2943, + "step": 33210 + }, + { + "epoch": 0.96, + "grad_norm": 1.256856489032732, + "learning_rate": 3.5319188718150876e-08, + "loss": 0.2718, + "step": 33211 + }, + { + "epoch": 0.96, + "grad_norm": 1.4737202220008019, + "learning_rate": 3.526347990082113e-08, + "loss": 0.2725, + "step": 33212 + }, + { + "epoch": 0.96, + "grad_norm": 1.3347839340753944, + "learning_rate": 3.520781489734615e-08, + "loss": 0.2659, + "step": 33213 + }, + { + "epoch": 0.96, + "grad_norm": 1.2986679730801596, + "learning_rate": 3.5152193708217206e-08, + "loss": 0.2794, + "step": 33214 + }, + { + "epoch": 0.96, + "grad_norm": 1.4306261837640528, + "learning_rate": 3.509661633392503e-08, + "loss": 0.2785, + "step": 33215 + }, + { + "epoch": 0.96, + "grad_norm": 1.533026333747617, + "learning_rate": 3.504108277496032e-08, + "loss": 0.2618, + "step": 33216 + }, + { + "epoch": 0.96, + "grad_norm": 1.3071236583283703, + "learning_rate": 3.498559303181326e-08, + "loss": 0.2523, + "step": 33217 + }, + { + "epoch": 0.96, + "grad_norm": 1.2952849205726111, + "learning_rate": 3.493014710497289e-08, + "loss": 0.2878, + "step": 33218 + }, + { + "epoch": 0.96, + "grad_norm": 1.8513880687671775, + "learning_rate": 3.487474499492993e-08, + "loss": 0.2724, + "step": 33219 + }, + { + "epoch": 0.96, + "grad_norm": 5.7159581344551285, + "learning_rate": 3.481938670217178e-08, + "loss": 0.2731, + "step": 33220 + }, + { + "epoch": 0.96, + "grad_norm": 1.3587228949172072, + "learning_rate": 3.476407222718803e-08, + "loss": 0.2636, + "step": 33221 + }, + { + "epoch": 0.96, + "grad_norm": 1.7773066729318283, + "learning_rate": 3.470880157046608e-08, + "loss": 0.259, + "step": 33222 + }, + { + "epoch": 0.96, + "grad_norm": 1.2773786795705249, + "learning_rate": 3.465357473249442e-08, + "loss": 0.2779, + "step": 33223 + }, + { + "epoch": 0.96, + "grad_norm": 1.3601926798080448, + "learning_rate": 3.4598391713759895e-08, + "loss": 0.2634, + "step": 33224 + }, + { + "epoch": 0.96, + "grad_norm": 1.5412582771847656, + "learning_rate": 3.454325251474933e-08, + "loss": 0.3166, + "step": 33225 + }, + { + "epoch": 0.96, + "grad_norm": 1.3930010461331388, + "learning_rate": 3.448815713595011e-08, + "loss": 0.2488, + "step": 33226 + }, + { + "epoch": 0.96, + "grad_norm": 1.7437844271274567, + "learning_rate": 3.443310557784796e-08, + "loss": 0.2656, + "step": 33227 + }, + { + "epoch": 0.96, + "grad_norm": 1.283710592740447, + "learning_rate": 3.4378097840928605e-08, + "loss": 0.2599, + "step": 33228 + }, + { + "epoch": 0.96, + "grad_norm": 1.4083118660370046, + "learning_rate": 3.432313392567721e-08, + "loss": 0.2655, + "step": 33229 + }, + { + "epoch": 0.96, + "grad_norm": 1.2219165568539698, + "learning_rate": 3.42682138325795e-08, + "loss": 0.2629, + "step": 33230 + }, + { + "epoch": 0.96, + "grad_norm": 1.3061505099941944, + "learning_rate": 3.421333756212064e-08, + "loss": 0.2757, + "step": 33231 + }, + { + "epoch": 0.96, + "grad_norm": 1.52989055106288, + "learning_rate": 3.415850511478358e-08, + "loss": 0.2622, + "step": 33232 + }, + { + "epoch": 0.96, + "grad_norm": 1.3833308883180506, + "learning_rate": 3.410371649105238e-08, + "loss": 0.278, + "step": 33233 + }, + { + "epoch": 0.96, + "grad_norm": 1.4135307999256952, + "learning_rate": 3.404897169141164e-08, + "loss": 0.2815, + "step": 33234 + }, + { + "epoch": 0.96, + "grad_norm": 1.2813237661197379, + "learning_rate": 3.39942707163432e-08, + "loss": 0.2585, + "step": 33235 + }, + { + "epoch": 0.96, + "grad_norm": 1.4639433286053454, + "learning_rate": 3.3939613566330573e-08, + "loss": 0.3047, + "step": 33236 + }, + { + "epoch": 0.96, + "grad_norm": 1.4270684161897436, + "learning_rate": 3.3885000241856145e-08, + "loss": 0.269, + "step": 33237 + }, + { + "epoch": 0.96, + "grad_norm": 1.7549004079950763, + "learning_rate": 3.383043074340175e-08, + "loss": 0.2702, + "step": 33238 + }, + { + "epoch": 0.96, + "grad_norm": 1.2069721649097997, + "learning_rate": 3.3775905071448675e-08, + "loss": 0.2412, + "step": 33239 + }, + { + "epoch": 0.96, + "grad_norm": 1.4044884416026355, + "learning_rate": 3.372142322647876e-08, + "loss": 0.2726, + "step": 33240 + }, + { + "epoch": 0.96, + "grad_norm": 1.3504760564357656, + "learning_rate": 3.366698520897216e-08, + "loss": 0.271, + "step": 33241 + }, + { + "epoch": 0.96, + "grad_norm": 1.4594461357538506, + "learning_rate": 3.3612591019409055e-08, + "loss": 0.2508, + "step": 33242 + }, + { + "epoch": 0.96, + "grad_norm": 1.30795878219363, + "learning_rate": 3.355824065827018e-08, + "loss": 0.255, + "step": 33243 + }, + { + "epoch": 0.96, + "grad_norm": 1.35529610455722, + "learning_rate": 3.350393412603514e-08, + "loss": 0.2446, + "step": 33244 + }, + { + "epoch": 0.96, + "grad_norm": 1.500950306901506, + "learning_rate": 3.3449671423183003e-08, + "loss": 0.2926, + "step": 33245 + }, + { + "epoch": 0.96, + "grad_norm": 1.25080570273461, + "learning_rate": 3.3395452550192277e-08, + "loss": 0.2797, + "step": 33246 + }, + { + "epoch": 0.96, + "grad_norm": 1.1989141910247603, + "learning_rate": 3.3341277507542016e-08, + "loss": 0.2452, + "step": 33247 + }, + { + "epoch": 0.96, + "grad_norm": 1.4413070820985077, + "learning_rate": 3.328714629571017e-08, + "loss": 0.2797, + "step": 33248 + }, + { + "epoch": 0.96, + "grad_norm": 1.2886849736345645, + "learning_rate": 3.323305891517414e-08, + "loss": 0.2782, + "step": 33249 + }, + { + "epoch": 0.96, + "grad_norm": 1.2760113414031389, + "learning_rate": 3.317901536641133e-08, + "loss": 0.2775, + "step": 33250 + }, + { + "epoch": 0.96, + "grad_norm": 1.418778496662258, + "learning_rate": 3.312501564989912e-08, + "loss": 0.263, + "step": 33251 + }, + { + "epoch": 0.96, + "grad_norm": 1.4248977891942791, + "learning_rate": 3.30710597661138e-08, + "loss": 0.2544, + "step": 33252 + }, + { + "epoch": 0.96, + "grad_norm": 1.4176653026210477, + "learning_rate": 3.30171477155311e-08, + "loss": 0.2555, + "step": 33253 + }, + { + "epoch": 0.96, + "grad_norm": 1.3777729289325336, + "learning_rate": 3.296327949862732e-08, + "loss": 0.2676, + "step": 33254 + }, + { + "epoch": 0.96, + "grad_norm": 3.2056026691652098, + "learning_rate": 3.2909455115878173e-08, + "loss": 0.2645, + "step": 33255 + }, + { + "epoch": 0.96, + "grad_norm": 1.4260081977236698, + "learning_rate": 3.2855674567757736e-08, + "loss": 0.2805, + "step": 33256 + }, + { + "epoch": 0.96, + "grad_norm": 1.4229741620704166, + "learning_rate": 3.2801937854741173e-08, + "loss": 0.2555, + "step": 33257 + }, + { + "epoch": 0.96, + "grad_norm": 1.3846773061185778, + "learning_rate": 3.274824497730256e-08, + "loss": 0.2676, + "step": 33258 + }, + { + "epoch": 0.96, + "grad_norm": 1.2969802264779864, + "learning_rate": 3.269459593591595e-08, + "loss": 0.287, + "step": 33259 + }, + { + "epoch": 0.96, + "grad_norm": 1.305832317046418, + "learning_rate": 3.264099073105431e-08, + "loss": 0.2469, + "step": 33260 + }, + { + "epoch": 0.96, + "grad_norm": 1.5554201268596193, + "learning_rate": 3.2587429363191145e-08, + "loss": 0.2457, + "step": 33261 + }, + { + "epoch": 0.96, + "grad_norm": 3.156683709525343, + "learning_rate": 3.253391183279941e-08, + "loss": 0.2951, + "step": 33262 + }, + { + "epoch": 0.96, + "grad_norm": 1.4482079610057947, + "learning_rate": 3.24804381403504e-08, + "loss": 0.2569, + "step": 33263 + }, + { + "epoch": 0.96, + "grad_norm": 1.8016375771598734, + "learning_rate": 3.242700828631706e-08, + "loss": 0.2895, + "step": 33264 + }, + { + "epoch": 0.96, + "grad_norm": 1.4655878266530398, + "learning_rate": 3.2373622271170135e-08, + "loss": 0.2856, + "step": 33265 + }, + { + "epoch": 0.96, + "grad_norm": 1.269827242363895, + "learning_rate": 3.232028009538146e-08, + "loss": 0.2615, + "step": 33266 + }, + { + "epoch": 0.96, + "grad_norm": 0.9322051348173166, + "learning_rate": 3.2266981759421224e-08, + "loss": 0.6067, + "step": 33267 + }, + { + "epoch": 0.96, + "grad_norm": 1.4163146501780517, + "learning_rate": 3.221372726376015e-08, + "loss": 0.2749, + "step": 33268 + }, + { + "epoch": 0.96, + "grad_norm": 1.2892385528060186, + "learning_rate": 3.216051660886787e-08, + "loss": 0.2629, + "step": 33269 + }, + { + "epoch": 0.96, + "grad_norm": 1.3296958763702755, + "learning_rate": 3.210734979521402e-08, + "loss": 0.2772, + "step": 33270 + }, + { + "epoch": 0.97, + "grad_norm": 1.5662066263964791, + "learning_rate": 3.20542268232682e-08, + "loss": 0.2685, + "step": 33271 + }, + { + "epoch": 0.97, + "grad_norm": 1.469973938587402, + "learning_rate": 3.200114769349838e-08, + "loss": 0.2632, + "step": 33272 + }, + { + "epoch": 0.97, + "grad_norm": 1.3227301837811931, + "learning_rate": 3.1948112406374186e-08, + "loss": 0.2742, + "step": 33273 + }, + { + "epoch": 0.97, + "grad_norm": 1.3836340995335599, + "learning_rate": 3.189512096236302e-08, + "loss": 0.2882, + "step": 33274 + }, + { + "epoch": 0.97, + "grad_norm": 1.3726667376039214, + "learning_rate": 3.1842173361932295e-08, + "loss": 0.2693, + "step": 33275 + }, + { + "epoch": 0.97, + "grad_norm": 1.4140509627114088, + "learning_rate": 3.17892696055494e-08, + "loss": 0.27, + "step": 33276 + }, + { + "epoch": 0.97, + "grad_norm": 1.4154086462805227, + "learning_rate": 3.173640969368175e-08, + "loss": 0.2818, + "step": 33277 + }, + { + "epoch": 0.97, + "grad_norm": 1.4690009018205703, + "learning_rate": 3.1683593626794515e-08, + "loss": 0.2592, + "step": 33278 + }, + { + "epoch": 0.97, + "grad_norm": 1.349472085006678, + "learning_rate": 3.163082140535567e-08, + "loss": 0.2829, + "step": 33279 + }, + { + "epoch": 0.97, + "grad_norm": 1.281412517582784, + "learning_rate": 3.1578093029829284e-08, + "loss": 0.2635, + "step": 33280 + }, + { + "epoch": 0.97, + "grad_norm": 1.4071723865229822, + "learning_rate": 3.152540850068164e-08, + "loss": 0.2749, + "step": 33281 + }, + { + "epoch": 0.97, + "grad_norm": 1.223564073663936, + "learning_rate": 3.147276781837738e-08, + "loss": 0.2546, + "step": 33282 + }, + { + "epoch": 0.97, + "grad_norm": 1.3035847714966529, + "learning_rate": 3.1420170983381125e-08, + "loss": 0.27, + "step": 33283 + }, + { + "epoch": 0.97, + "grad_norm": 1.3103298963530738, + "learning_rate": 3.136761799615695e-08, + "loss": 0.3009, + "step": 33284 + }, + { + "epoch": 0.97, + "grad_norm": 1.470787315065924, + "learning_rate": 3.131510885716837e-08, + "loss": 0.2572, + "step": 33285 + }, + { + "epoch": 0.97, + "grad_norm": 1.2102514570709257, + "learning_rate": 3.126264356687947e-08, + "loss": 0.2557, + "step": 33286 + }, + { + "epoch": 0.97, + "grad_norm": 1.3678860355175821, + "learning_rate": 3.121022212575264e-08, + "loss": 0.268, + "step": 33287 + }, + { + "epoch": 0.97, + "grad_norm": 1.2386661586734051, + "learning_rate": 3.115784453425086e-08, + "loss": 0.2581, + "step": 33288 + }, + { + "epoch": 0.97, + "grad_norm": 1.2875065877107608, + "learning_rate": 3.110551079283597e-08, + "loss": 0.2589, + "step": 33289 + }, + { + "epoch": 0.97, + "grad_norm": 1.286975958993169, + "learning_rate": 3.1053220901970383e-08, + "loss": 0.2377, + "step": 33290 + }, + { + "epoch": 0.97, + "grad_norm": 1.3461553201322767, + "learning_rate": 3.100097486211484e-08, + "loss": 0.2453, + "step": 33291 + }, + { + "epoch": 0.97, + "grad_norm": 1.3780656462329084, + "learning_rate": 3.09487726737312e-08, + "loss": 0.2826, + "step": 33292 + }, + { + "epoch": 0.97, + "grad_norm": 1.2652237506910786, + "learning_rate": 3.089661433727964e-08, + "loss": 0.2499, + "step": 33293 + }, + { + "epoch": 0.97, + "grad_norm": 1.3783526445989456, + "learning_rate": 3.0844499853220356e-08, + "loss": 0.256, + "step": 33294 + }, + { + "epoch": 0.97, + "grad_norm": 1.2779630735368057, + "learning_rate": 3.079242922201409e-08, + "loss": 0.2654, + "step": 33295 + }, + { + "epoch": 0.97, + "grad_norm": 1.3223006660785843, + "learning_rate": 3.074040244411936e-08, + "loss": 0.267, + "step": 33296 + }, + { + "epoch": 0.97, + "grad_norm": 1.5202561252208477, + "learning_rate": 3.068841951999524e-08, + "loss": 0.2765, + "step": 33297 + }, + { + "epoch": 0.97, + "grad_norm": 1.3300186642140586, + "learning_rate": 3.063648045010137e-08, + "loss": 0.2742, + "step": 33298 + }, + { + "epoch": 0.97, + "grad_norm": 1.29508876852887, + "learning_rate": 3.058458523489571e-08, + "loss": 0.2643, + "step": 33299 + }, + { + "epoch": 0.97, + "grad_norm": 1.3861829812924642, + "learning_rate": 3.053273387483624e-08, + "loss": 0.266, + "step": 33300 + }, + { + "epoch": 0.97, + "grad_norm": 1.6592667117771416, + "learning_rate": 3.0480926370380915e-08, + "loss": 0.2847, + "step": 33301 + }, + { + "epoch": 0.97, + "grad_norm": 1.4531794338971118, + "learning_rate": 3.042916272198604e-08, + "loss": 0.2848, + "step": 33302 + }, + { + "epoch": 0.97, + "grad_norm": 1.7289854644346827, + "learning_rate": 3.037744293010902e-08, + "loss": 0.2512, + "step": 33303 + }, + { + "epoch": 0.97, + "grad_norm": 1.3861025455030807, + "learning_rate": 3.0325766995206176e-08, + "loss": 0.2615, + "step": 33304 + }, + { + "epoch": 0.97, + "grad_norm": 1.3436590923280811, + "learning_rate": 3.027413491773379e-08, + "loss": 0.2574, + "step": 33305 + }, + { + "epoch": 0.97, + "grad_norm": 1.4222574360165974, + "learning_rate": 3.022254669814706e-08, + "loss": 0.2626, + "step": 33306 + }, + { + "epoch": 0.97, + "grad_norm": 1.3053380810668738, + "learning_rate": 3.017100233690118e-08, + "loss": 0.2597, + "step": 33307 + }, + { + "epoch": 0.97, + "grad_norm": 1.4763024632416064, + "learning_rate": 3.011950183445189e-08, + "loss": 0.2489, + "step": 33308 + }, + { + "epoch": 0.97, + "grad_norm": 0.9245818615670826, + "learning_rate": 3.0068045191252727e-08, + "loss": 0.6184, + "step": 33309 + }, + { + "epoch": 0.97, + "grad_norm": 1.3515734075043508, + "learning_rate": 3.0016632407758873e-08, + "loss": 0.2527, + "step": 33310 + }, + { + "epoch": 0.97, + "grad_norm": 1.4124295927874133, + "learning_rate": 2.996526348442275e-08, + "loss": 0.2695, + "step": 33311 + }, + { + "epoch": 0.97, + "grad_norm": 1.4855220716569955, + "learning_rate": 2.991393842169843e-08, + "loss": 0.2748, + "step": 33312 + }, + { + "epoch": 0.97, + "grad_norm": 1.3570944154717022, + "learning_rate": 2.986265722003889e-08, + "loss": 0.2573, + "step": 33313 + }, + { + "epoch": 0.97, + "grad_norm": 1.3761875822781648, + "learning_rate": 2.9811419879896554e-08, + "loss": 0.2619, + "step": 33314 + }, + { + "epoch": 0.97, + "grad_norm": 1.518904183307001, + "learning_rate": 2.9760226401723268e-08, + "loss": 0.2667, + "step": 33315 + }, + { + "epoch": 0.97, + "grad_norm": 1.3160985475654365, + "learning_rate": 2.9709076785971456e-08, + "loss": 0.2541, + "step": 33316 + }, + { + "epoch": 0.97, + "grad_norm": 1.434283226858243, + "learning_rate": 2.965797103309187e-08, + "loss": 0.2639, + "step": 33317 + }, + { + "epoch": 0.97, + "grad_norm": 1.280262547169369, + "learning_rate": 2.9606909143536366e-08, + "loss": 0.2603, + "step": 33318 + }, + { + "epoch": 0.97, + "grad_norm": 1.4873588955832293, + "learning_rate": 2.9555891117754588e-08, + "loss": 0.294, + "step": 33319 + }, + { + "epoch": 0.97, + "grad_norm": 1.521101851680722, + "learning_rate": 2.950491695619673e-08, + "loss": 0.2644, + "step": 33320 + }, + { + "epoch": 0.97, + "grad_norm": 0.9526554233026453, + "learning_rate": 2.9453986659314094e-08, + "loss": 0.6118, + "step": 33321 + }, + { + "epoch": 0.97, + "grad_norm": 1.81726536061149, + "learning_rate": 2.940310022755466e-08, + "loss": 0.2306, + "step": 33322 + }, + { + "epoch": 0.97, + "grad_norm": 1.237361609770053, + "learning_rate": 2.9352257661368066e-08, + "loss": 0.2607, + "step": 33323 + }, + { + "epoch": 0.97, + "grad_norm": 1.7853103456102564, + "learning_rate": 2.9301458961202843e-08, + "loss": 0.2796, + "step": 33324 + }, + { + "epoch": 0.97, + "grad_norm": 4.907006909884214, + "learning_rate": 2.9250704127507522e-08, + "loss": 0.267, + "step": 33325 + }, + { + "epoch": 0.97, + "grad_norm": 1.2485407247728115, + "learning_rate": 2.9199993160730078e-08, + "loss": 0.2562, + "step": 33326 + }, + { + "epoch": 0.97, + "grad_norm": 1.3672337112594561, + "learning_rate": 2.9149326061317373e-08, + "loss": 0.2491, + "step": 33327 + }, + { + "epoch": 0.97, + "grad_norm": 1.3891673608338806, + "learning_rate": 2.9098702829717386e-08, + "loss": 0.274, + "step": 33328 + }, + { + "epoch": 0.97, + "grad_norm": 1.5722148782446277, + "learning_rate": 2.9048123466375867e-08, + "loss": 0.2705, + "step": 33329 + }, + { + "epoch": 0.97, + "grad_norm": 1.3009151139959783, + "learning_rate": 2.8997587971740794e-08, + "loss": 0.2797, + "step": 33330 + }, + { + "epoch": 0.97, + "grad_norm": 1.5403367900013425, + "learning_rate": 2.894709634625681e-08, + "loss": 0.2636, + "step": 33331 + }, + { + "epoch": 0.97, + "grad_norm": 1.7675803791242997, + "learning_rate": 2.889664859036967e-08, + "loss": 0.2834, + "step": 33332 + }, + { + "epoch": 0.97, + "grad_norm": 1.421595164169718, + "learning_rate": 2.8846244704524575e-08, + "loss": 0.2562, + "step": 33333 + }, + { + "epoch": 0.97, + "grad_norm": 1.1633645265517998, + "learning_rate": 2.879588468916672e-08, + "loss": 0.2442, + "step": 33334 + }, + { + "epoch": 0.97, + "grad_norm": 1.350232344906265, + "learning_rate": 2.87455685447402e-08, + "loss": 0.2615, + "step": 33335 + }, + { + "epoch": 0.97, + "grad_norm": 1.3697245086877357, + "learning_rate": 2.86952962716891e-08, + "loss": 0.2997, + "step": 33336 + }, + { + "epoch": 0.97, + "grad_norm": 1.3523899898614173, + "learning_rate": 2.864506787045751e-08, + "loss": 0.2704, + "step": 33337 + }, + { + "epoch": 0.97, + "grad_norm": 1.7812374132337003, + "learning_rate": 2.8594883341487856e-08, + "loss": 0.2723, + "step": 33338 + }, + { + "epoch": 0.97, + "grad_norm": 1.2975766467682124, + "learning_rate": 2.8544742685224226e-08, + "loss": 0.2807, + "step": 33339 + }, + { + "epoch": 0.97, + "grad_norm": 1.4833458925245757, + "learning_rate": 2.849464590210793e-08, + "loss": 0.264, + "step": 33340 + }, + { + "epoch": 0.97, + "grad_norm": 1.5110404333705898, + "learning_rate": 2.8444592992581954e-08, + "loss": 0.2751, + "step": 33341 + }, + { + "epoch": 0.97, + "grad_norm": 3.4479039052886713, + "learning_rate": 2.839458395708705e-08, + "loss": 0.2753, + "step": 33342 + }, + { + "epoch": 0.97, + "grad_norm": 0.9756984381332349, + "learning_rate": 2.834461879606565e-08, + "loss": 0.5527, + "step": 33343 + }, + { + "epoch": 0.97, + "grad_norm": 0.9056893537568311, + "learning_rate": 2.8294697509957393e-08, + "loss": 0.5786, + "step": 33344 + }, + { + "epoch": 0.97, + "grad_norm": 1.2832710991703813, + "learning_rate": 2.824482009920415e-08, + "loss": 0.2992, + "step": 33345 + }, + { + "epoch": 0.97, + "grad_norm": 1.4669910918674036, + "learning_rate": 2.8194986564245576e-08, + "loss": 0.2586, + "step": 33346 + }, + { + "epoch": 0.97, + "grad_norm": 1.4372538942646405, + "learning_rate": 2.8145196905520755e-08, + "loss": 0.274, + "step": 33347 + }, + { + "epoch": 0.97, + "grad_norm": 1.365478436216134, + "learning_rate": 2.8095451123470453e-08, + "loss": 0.2738, + "step": 33348 + }, + { + "epoch": 0.97, + "grad_norm": 1.7651703031102322, + "learning_rate": 2.804574921853265e-08, + "loss": 0.272, + "step": 33349 + }, + { + "epoch": 0.97, + "grad_norm": 1.3408106980746297, + "learning_rate": 2.7996091191146435e-08, + "loss": 0.2526, + "step": 33350 + }, + { + "epoch": 0.97, + "grad_norm": 1.2797637983659584, + "learning_rate": 2.7946477041749242e-08, + "loss": 0.2458, + "step": 33351 + }, + { + "epoch": 0.97, + "grad_norm": 1.4132198657856487, + "learning_rate": 2.7896906770780164e-08, + "loss": 0.2736, + "step": 33352 + }, + { + "epoch": 0.97, + "grad_norm": 1.349233090811401, + "learning_rate": 2.784738037867607e-08, + "loss": 0.2789, + "step": 33353 + }, + { + "epoch": 0.97, + "grad_norm": 1.469364363982193, + "learning_rate": 2.7797897865873834e-08, + "loss": 0.2557, + "step": 33354 + }, + { + "epoch": 0.97, + "grad_norm": 1.4002622245969987, + "learning_rate": 2.7748459232810333e-08, + "loss": 0.2742, + "step": 33355 + }, + { + "epoch": 0.97, + "grad_norm": 1.6323328985647179, + "learning_rate": 2.7699064479921323e-08, + "loss": 0.2646, + "step": 33356 + }, + { + "epoch": 0.97, + "grad_norm": 1.2575637711676613, + "learning_rate": 2.764971360764368e-08, + "loss": 0.2726, + "step": 33357 + }, + { + "epoch": 0.97, + "grad_norm": 1.3999178205708191, + "learning_rate": 2.7600406616413165e-08, + "loss": 0.2647, + "step": 33358 + }, + { + "epoch": 0.97, + "grad_norm": 1.3997359221919277, + "learning_rate": 2.7551143506663324e-08, + "loss": 0.259, + "step": 33359 + }, + { + "epoch": 0.97, + "grad_norm": 1.2534654859700665, + "learning_rate": 2.7501924278829918e-08, + "loss": 0.2696, + "step": 33360 + }, + { + "epoch": 0.97, + "grad_norm": 1.4144886367819032, + "learning_rate": 2.7452748933347597e-08, + "loss": 0.2786, + "step": 33361 + }, + { + "epoch": 0.97, + "grad_norm": 1.4099549865053995, + "learning_rate": 2.7403617470649346e-08, + "loss": 0.284, + "step": 33362 + }, + { + "epoch": 0.97, + "grad_norm": 1.2617746847555797, + "learning_rate": 2.7354529891169824e-08, + "loss": 0.2546, + "step": 33363 + }, + { + "epoch": 0.97, + "grad_norm": 1.3349853772359772, + "learning_rate": 2.7305486195341456e-08, + "loss": 0.2583, + "step": 33364 + }, + { + "epoch": 0.97, + "grad_norm": 1.9490524552373445, + "learning_rate": 2.7256486383597236e-08, + "loss": 0.2523, + "step": 33365 + }, + { + "epoch": 0.97, + "grad_norm": 1.5033655505404357, + "learning_rate": 2.7207530456370146e-08, + "loss": 0.2786, + "step": 33366 + }, + { + "epoch": 0.97, + "grad_norm": 1.4271982113167154, + "learning_rate": 2.7158618414091507e-08, + "loss": 0.2833, + "step": 33367 + }, + { + "epoch": 0.97, + "grad_norm": 1.8663338783168304, + "learning_rate": 2.7109750257193202e-08, + "loss": 0.2845, + "step": 33368 + }, + { + "epoch": 0.97, + "grad_norm": 1.3299836268470921, + "learning_rate": 2.7060925986106545e-08, + "loss": 0.2666, + "step": 33369 + }, + { + "epoch": 0.97, + "grad_norm": 1.5741357032010035, + "learning_rate": 2.7012145601262308e-08, + "loss": 0.2743, + "step": 33370 + }, + { + "epoch": 0.97, + "grad_norm": 1.4882503958586915, + "learning_rate": 2.6963409103091254e-08, + "loss": 0.2657, + "step": 33371 + }, + { + "epoch": 0.97, + "grad_norm": 1.3907166176414707, + "learning_rate": 2.691471649202304e-08, + "loss": 0.2534, + "step": 33372 + }, + { + "epoch": 0.97, + "grad_norm": 1.4517298333139186, + "learning_rate": 2.686606776848788e-08, + "loss": 0.267, + "step": 33373 + }, + { + "epoch": 0.97, + "grad_norm": 1.3580223918201109, + "learning_rate": 2.681746293291432e-08, + "loss": 0.2824, + "step": 33374 + }, + { + "epoch": 0.97, + "grad_norm": 1.2597456553707413, + "learning_rate": 2.6768901985732565e-08, + "loss": 0.2777, + "step": 33375 + }, + { + "epoch": 0.97, + "grad_norm": 1.3683684633695214, + "learning_rate": 2.672038492737006e-08, + "loss": 0.3271, + "step": 33376 + }, + { + "epoch": 0.97, + "grad_norm": 1.3131884063762904, + "learning_rate": 2.667191175825534e-08, + "loss": 0.2785, + "step": 33377 + }, + { + "epoch": 0.97, + "grad_norm": 1.3548245170663324, + "learning_rate": 2.6623482478815854e-08, + "loss": 0.257, + "step": 33378 + }, + { + "epoch": 0.97, + "grad_norm": 1.4867091491305227, + "learning_rate": 2.6575097089479584e-08, + "loss": 0.3049, + "step": 33379 + }, + { + "epoch": 0.97, + "grad_norm": 1.4924289779816962, + "learning_rate": 2.6526755590673414e-08, + "loss": 0.277, + "step": 33380 + }, + { + "epoch": 0.97, + "grad_norm": 1.2415978770291396, + "learning_rate": 2.647845798282367e-08, + "loss": 0.2745, + "step": 33381 + }, + { + "epoch": 0.97, + "grad_norm": 1.24676604028856, + "learning_rate": 2.6430204266356675e-08, + "loss": 0.2634, + "step": 33382 + }, + { + "epoch": 0.97, + "grad_norm": 1.2726239807450015, + "learning_rate": 2.63819944416982e-08, + "loss": 0.3029, + "step": 33383 + }, + { + "epoch": 0.97, + "grad_norm": 1.3018343833457005, + "learning_rate": 2.6333828509274018e-08, + "loss": 0.2647, + "step": 33384 + }, + { + "epoch": 0.97, + "grad_norm": 1.5697931103874276, + "learning_rate": 2.6285706469508788e-08, + "loss": 0.2698, + "step": 33385 + }, + { + "epoch": 0.97, + "grad_norm": 1.362887671613786, + "learning_rate": 2.6237628322827725e-08, + "loss": 0.2984, + "step": 33386 + }, + { + "epoch": 0.97, + "grad_norm": 1.4308131584941912, + "learning_rate": 2.618959406965438e-08, + "loss": 0.2702, + "step": 33387 + }, + { + "epoch": 0.97, + "grad_norm": 1.2595624925227056, + "learning_rate": 2.6141603710412855e-08, + "loss": 0.2573, + "step": 33388 + }, + { + "epoch": 0.97, + "grad_norm": 1.42548861360289, + "learning_rate": 2.609365724552726e-08, + "loss": 0.2475, + "step": 33389 + }, + { + "epoch": 0.97, + "grad_norm": 2.681055368603141, + "learning_rate": 2.604575467542003e-08, + "loss": 0.2908, + "step": 33390 + }, + { + "epoch": 0.97, + "grad_norm": 1.7407386386553823, + "learning_rate": 2.599789600051472e-08, + "loss": 0.2509, + "step": 33391 + }, + { + "epoch": 0.97, + "grad_norm": 1.3913450537798213, + "learning_rate": 2.5950081221232658e-08, + "loss": 0.2783, + "step": 33392 + }, + { + "epoch": 0.97, + "grad_norm": 1.3749640251519448, + "learning_rate": 2.5902310337996284e-08, + "loss": 0.2976, + "step": 33393 + }, + { + "epoch": 0.97, + "grad_norm": 1.5221778721349124, + "learning_rate": 2.5854583351228037e-08, + "loss": 0.2542, + "step": 33394 + }, + { + "epoch": 0.97, + "grad_norm": 1.4993970254996705, + "learning_rate": 2.5806900261347578e-08, + "loss": 0.2917, + "step": 33395 + }, + { + "epoch": 0.97, + "grad_norm": 1.6356968519885249, + "learning_rate": 2.5759261068776243e-08, + "loss": 0.268, + "step": 33396 + }, + { + "epoch": 0.97, + "grad_norm": 1.4343051651951635, + "learning_rate": 2.5711665773934803e-08, + "loss": 0.2759, + "step": 33397 + }, + { + "epoch": 0.97, + "grad_norm": 1.5608598477980673, + "learning_rate": 2.5664114377243478e-08, + "loss": 0.2722, + "step": 33398 + }, + { + "epoch": 0.97, + "grad_norm": 2.390539255914746, + "learning_rate": 2.561660687912082e-08, + "loss": 0.2645, + "step": 33399 + }, + { + "epoch": 0.97, + "grad_norm": 1.4524651477940238, + "learning_rate": 2.5569143279987053e-08, + "loss": 0.2998, + "step": 33400 + }, + { + "epoch": 0.97, + "grad_norm": 1.4978003902088486, + "learning_rate": 2.5521723580260727e-08, + "loss": 0.2871, + "step": 33401 + }, + { + "epoch": 0.97, + "grad_norm": 1.4721264110269665, + "learning_rate": 2.5474347780360954e-08, + "loss": 0.2689, + "step": 33402 + }, + { + "epoch": 0.97, + "grad_norm": 1.484712415126253, + "learning_rate": 2.542701588070462e-08, + "loss": 0.2753, + "step": 33403 + }, + { + "epoch": 0.97, + "grad_norm": 1.5767764093269094, + "learning_rate": 2.537972788171028e-08, + "loss": 0.2719, + "step": 33404 + }, + { + "epoch": 0.97, + "grad_norm": 1.2851352025299783, + "learning_rate": 2.5332483783794827e-08, + "loss": 0.2973, + "step": 33405 + }, + { + "epoch": 0.97, + "grad_norm": 1.5570156234767387, + "learning_rate": 2.5285283587375697e-08, + "loss": 0.2743, + "step": 33406 + }, + { + "epoch": 0.97, + "grad_norm": 1.3697949367047233, + "learning_rate": 2.5238127292868674e-08, + "loss": 0.2662, + "step": 33407 + }, + { + "epoch": 0.97, + "grad_norm": 1.354014270331964, + "learning_rate": 2.5191014900690646e-08, + "loss": 0.2754, + "step": 33408 + }, + { + "epoch": 0.97, + "grad_norm": 1.3151762245250493, + "learning_rate": 2.514394641125739e-08, + "loss": 0.2766, + "step": 33409 + }, + { + "epoch": 0.97, + "grad_norm": 2.587151809385669, + "learning_rate": 2.509692182498358e-08, + "loss": 0.2547, + "step": 33410 + }, + { + "epoch": 0.97, + "grad_norm": 1.3247491548574992, + "learning_rate": 2.5049941142284985e-08, + "loss": 0.2735, + "step": 33411 + }, + { + "epoch": 0.97, + "grad_norm": 1.2547172434104663, + "learning_rate": 2.5003004363575168e-08, + "loss": 0.2679, + "step": 33412 + }, + { + "epoch": 0.97, + "grad_norm": 1.2269785071232597, + "learning_rate": 2.495611148926991e-08, + "loss": 0.2628, + "step": 33413 + }, + { + "epoch": 0.97, + "grad_norm": 1.2738773760573365, + "learning_rate": 2.490926251978165e-08, + "loss": 0.272, + "step": 33414 + }, + { + "epoch": 0.97, + "grad_norm": 1.2792122225293134, + "learning_rate": 2.4862457455524513e-08, + "loss": 0.2781, + "step": 33415 + }, + { + "epoch": 0.97, + "grad_norm": 1.398980297792467, + "learning_rate": 2.481569629691094e-08, + "loss": 0.2732, + "step": 33416 + }, + { + "epoch": 0.97, + "grad_norm": 1.2935856550476457, + "learning_rate": 2.476897904435449e-08, + "loss": 0.2566, + "step": 33417 + }, + { + "epoch": 0.97, + "grad_norm": 1.019830855150363, + "learning_rate": 2.4722305698267058e-08, + "loss": 0.5714, + "step": 33418 + }, + { + "epoch": 0.97, + "grad_norm": 1.4814211616412063, + "learning_rate": 2.4675676259059976e-08, + "loss": 0.2972, + "step": 33419 + }, + { + "epoch": 0.97, + "grad_norm": 1.2963050485430425, + "learning_rate": 2.4629090727145698e-08, + "loss": 0.2469, + "step": 33420 + }, + { + "epoch": 0.97, + "grad_norm": 1.4358536143900924, + "learning_rate": 2.4582549102935005e-08, + "loss": 0.2654, + "step": 33421 + }, + { + "epoch": 0.97, + "grad_norm": 1.348825881228888, + "learning_rate": 2.4536051386838122e-08, + "loss": 0.2715, + "step": 33422 + }, + { + "epoch": 0.97, + "grad_norm": 1.3531811498711541, + "learning_rate": 2.4489597579265835e-08, + "loss": 0.26, + "step": 33423 + }, + { + "epoch": 0.97, + "grad_norm": 1.2637232859439345, + "learning_rate": 2.4443187680628368e-08, + "loss": 0.2619, + "step": 33424 + }, + { + "epoch": 0.97, + "grad_norm": 1.3361854306749312, + "learning_rate": 2.4396821691334837e-08, + "loss": 0.2693, + "step": 33425 + }, + { + "epoch": 0.97, + "grad_norm": 1.1956725297857378, + "learning_rate": 2.4350499611794365e-08, + "loss": 0.2533, + "step": 33426 + }, + { + "epoch": 0.97, + "grad_norm": 2.5556250654515877, + "learning_rate": 2.4304221442416066e-08, + "loss": 0.2946, + "step": 33427 + }, + { + "epoch": 0.97, + "grad_norm": 1.284749916614964, + "learning_rate": 2.4257987183607944e-08, + "loss": 0.2528, + "step": 33428 + }, + { + "epoch": 0.97, + "grad_norm": 1.2529206972258944, + "learning_rate": 2.4211796835778566e-08, + "loss": 0.2558, + "step": 33429 + }, + { + "epoch": 0.97, + "grad_norm": 1.4064303603362878, + "learning_rate": 2.4165650399335384e-08, + "loss": 0.2521, + "step": 33430 + }, + { + "epoch": 0.97, + "grad_norm": 1.3458477535897677, + "learning_rate": 2.411954787468529e-08, + "loss": 0.2708, + "step": 33431 + }, + { + "epoch": 0.97, + "grad_norm": 1.7734586233190333, + "learning_rate": 2.407348926223574e-08, + "loss": 0.256, + "step": 33432 + }, + { + "epoch": 0.97, + "grad_norm": 1.3816632450140227, + "learning_rate": 2.4027474562392518e-08, + "loss": 0.2897, + "step": 33433 + }, + { + "epoch": 0.97, + "grad_norm": 1.8117469615460755, + "learning_rate": 2.3981503775561965e-08, + "loss": 0.2637, + "step": 33434 + }, + { + "epoch": 0.97, + "grad_norm": 1.3071896531017777, + "learning_rate": 2.393557690214987e-08, + "loss": 0.2712, + "step": 33435 + }, + { + "epoch": 0.97, + "grad_norm": 1.3121945709243532, + "learning_rate": 2.3889693942561464e-08, + "loss": 0.2642, + "step": 33436 + }, + { + "epoch": 0.97, + "grad_norm": 2.4883070190972125, + "learning_rate": 2.384385489720198e-08, + "loss": 0.2622, + "step": 33437 + }, + { + "epoch": 0.97, + "grad_norm": 1.2692712887945963, + "learning_rate": 2.3798059766475533e-08, + "loss": 0.2636, + "step": 33438 + }, + { + "epoch": 0.97, + "grad_norm": 1.3171657434913127, + "learning_rate": 2.3752308550786253e-08, + "loss": 0.2715, + "step": 33439 + }, + { + "epoch": 0.97, + "grad_norm": 1.325829507941055, + "learning_rate": 2.370660125053825e-08, + "loss": 0.2595, + "step": 33440 + }, + { + "epoch": 0.97, + "grad_norm": 1.2962269769752566, + "learning_rate": 2.3660937866135104e-08, + "loss": 0.2479, + "step": 33441 + }, + { + "epoch": 0.97, + "grad_norm": 1.3215208712866815, + "learning_rate": 2.3615318397978704e-08, + "loss": 0.2643, + "step": 33442 + }, + { + "epoch": 0.97, + "grad_norm": 1.5142621086118027, + "learning_rate": 2.356974284647262e-08, + "loss": 0.252, + "step": 33443 + }, + { + "epoch": 0.97, + "grad_norm": 1.3404822257022866, + "learning_rate": 2.3524211212018756e-08, + "loss": 0.2785, + "step": 33444 + }, + { + "epoch": 0.97, + "grad_norm": 1.386868404204068, + "learning_rate": 2.347872349501845e-08, + "loss": 0.2675, + "step": 33445 + }, + { + "epoch": 0.97, + "grad_norm": 0.9170990107113246, + "learning_rate": 2.3433279695874166e-08, + "loss": 0.5639, + "step": 33446 + }, + { + "epoch": 0.97, + "grad_norm": 1.434258220479425, + "learning_rate": 2.3387879814986136e-08, + "loss": 0.2678, + "step": 33447 + }, + { + "epoch": 0.97, + "grad_norm": 2.090632918804681, + "learning_rate": 2.3342523852755706e-08, + "loss": 0.2664, + "step": 33448 + }, + { + "epoch": 0.97, + "grad_norm": 1.5302064890921703, + "learning_rate": 2.3297211809582555e-08, + "loss": 0.2792, + "step": 33449 + }, + { + "epoch": 0.97, + "grad_norm": 1.3687939063709293, + "learning_rate": 2.3251943685866364e-08, + "loss": 0.2747, + "step": 33450 + }, + { + "epoch": 0.97, + "grad_norm": 1.5916510055534554, + "learning_rate": 2.3206719482007368e-08, + "loss": 0.2548, + "step": 33451 + }, + { + "epoch": 0.97, + "grad_norm": 1.3008965588365788, + "learning_rate": 2.3161539198404138e-08, + "loss": 0.2706, + "step": 33452 + }, + { + "epoch": 0.97, + "grad_norm": 1.3770294339207907, + "learning_rate": 2.31164028354558e-08, + "loss": 0.2676, + "step": 33453 + }, + { + "epoch": 0.97, + "grad_norm": 1.2573932296684538, + "learning_rate": 2.3071310393559808e-08, + "loss": 0.2599, + "step": 33454 + }, + { + "epoch": 0.97, + "grad_norm": 1.5678116121820957, + "learning_rate": 2.3026261873115296e-08, + "loss": 0.2648, + "step": 33455 + }, + { + "epoch": 0.97, + "grad_norm": 1.4259425994250183, + "learning_rate": 2.2981257274519165e-08, + "loss": 0.2774, + "step": 33456 + }, + { + "epoch": 0.97, + "grad_norm": 1.2829275936252948, + "learning_rate": 2.2936296598168317e-08, + "loss": 0.268, + "step": 33457 + }, + { + "epoch": 0.97, + "grad_norm": 1.343682391443543, + "learning_rate": 2.289137984445966e-08, + "loss": 0.242, + "step": 33458 + }, + { + "epoch": 0.97, + "grad_norm": 1.4689820307588515, + "learning_rate": 2.2846507013790654e-08, + "loss": 0.2851, + "step": 33459 + }, + { + "epoch": 0.97, + "grad_norm": 1.4123633306089916, + "learning_rate": 2.2801678106555982e-08, + "loss": 0.2734, + "step": 33460 + }, + { + "epoch": 0.97, + "grad_norm": 1.3248243019068444, + "learning_rate": 2.2756893123151436e-08, + "loss": 0.2391, + "step": 33461 + }, + { + "epoch": 0.97, + "grad_norm": 1.3913480074751725, + "learning_rate": 2.2712152063972816e-08, + "loss": 0.2528, + "step": 33462 + }, + { + "epoch": 0.97, + "grad_norm": 1.2549791227347953, + "learning_rate": 2.2667454929414246e-08, + "loss": 0.2775, + "step": 33463 + }, + { + "epoch": 0.97, + "grad_norm": 1.599023035474588, + "learning_rate": 2.2622801719870968e-08, + "loss": 0.2567, + "step": 33464 + }, + { + "epoch": 0.97, + "grad_norm": 1.5494591791517727, + "learning_rate": 2.2578192435736555e-08, + "loss": 0.2609, + "step": 33465 + }, + { + "epoch": 0.97, + "grad_norm": 1.3775340592688134, + "learning_rate": 2.2533627077405142e-08, + "loss": 0.2591, + "step": 33466 + }, + { + "epoch": 0.97, + "grad_norm": 1.3894867311750025, + "learning_rate": 2.2489105645269736e-08, + "loss": 0.2512, + "step": 33467 + }, + { + "epoch": 0.97, + "grad_norm": 1.4478675734559139, + "learning_rate": 2.2444628139722813e-08, + "loss": 0.2653, + "step": 33468 + }, + { + "epoch": 0.97, + "grad_norm": 1.3174932453000379, + "learning_rate": 2.2400194561157383e-08, + "loss": 0.2582, + "step": 33469 + }, + { + "epoch": 0.97, + "grad_norm": 1.399930458078274, + "learning_rate": 2.2355804909965363e-08, + "loss": 0.2689, + "step": 33470 + }, + { + "epoch": 0.97, + "grad_norm": 1.368278357917513, + "learning_rate": 2.2311459186538653e-08, + "loss": 0.2827, + "step": 33471 + }, + { + "epoch": 0.97, + "grad_norm": 1.1875763314874053, + "learning_rate": 2.2267157391268613e-08, + "loss": 0.2775, + "step": 33472 + }, + { + "epoch": 0.97, + "grad_norm": 1.9321100643324818, + "learning_rate": 2.2222899524546038e-08, + "loss": 0.2923, + "step": 33473 + }, + { + "epoch": 0.97, + "grad_norm": 1.2845946548865788, + "learning_rate": 2.2178685586761727e-08, + "loss": 0.2638, + "step": 33474 + }, + { + "epoch": 0.97, + "grad_norm": 1.7737251697534304, + "learning_rate": 2.2134515578305372e-08, + "loss": 0.2721, + "step": 33475 + }, + { + "epoch": 0.97, + "grad_norm": 1.2796134063870745, + "learning_rate": 2.2090389499567766e-08, + "loss": 0.2699, + "step": 33476 + }, + { + "epoch": 0.97, + "grad_norm": 1.319770330965331, + "learning_rate": 2.2046307350936936e-08, + "loss": 0.26, + "step": 33477 + }, + { + "epoch": 0.97, + "grad_norm": 1.3546613081837025, + "learning_rate": 2.2002269132803123e-08, + "loss": 0.2852, + "step": 33478 + }, + { + "epoch": 0.97, + "grad_norm": 1.3580109325995482, + "learning_rate": 2.195827484555435e-08, + "loss": 0.2542, + "step": 33479 + }, + { + "epoch": 0.97, + "grad_norm": 2.394168549139939, + "learning_rate": 2.1914324489579754e-08, + "loss": 0.2373, + "step": 33480 + }, + { + "epoch": 0.97, + "grad_norm": 1.4258520929744063, + "learning_rate": 2.1870418065265686e-08, + "loss": 0.2803, + "step": 33481 + }, + { + "epoch": 0.97, + "grad_norm": 1.3423449905169589, + "learning_rate": 2.1826555573000174e-08, + "loss": 0.2851, + "step": 33482 + }, + { + "epoch": 0.97, + "grad_norm": 1.2850270186066821, + "learning_rate": 2.178273701317124e-08, + "loss": 0.2645, + "step": 33483 + }, + { + "epoch": 0.97, + "grad_norm": 1.4984910447753665, + "learning_rate": 2.1738962386164685e-08, + "loss": 0.2745, + "step": 33484 + }, + { + "epoch": 0.97, + "grad_norm": 1.3875353854514094, + "learning_rate": 2.169523169236687e-08, + "loss": 0.2448, + "step": 33485 + }, + { + "epoch": 0.97, + "grad_norm": 1.5200217212392235, + "learning_rate": 2.1651544932163594e-08, + "loss": 0.2856, + "step": 33486 + }, + { + "epoch": 0.97, + "grad_norm": 2.091090473993956, + "learning_rate": 2.1607902105941216e-08, + "loss": 0.2813, + "step": 33487 + }, + { + "epoch": 0.97, + "grad_norm": 1.2288890645433823, + "learning_rate": 2.156430321408387e-08, + "loss": 0.2588, + "step": 33488 + }, + { + "epoch": 0.97, + "grad_norm": 1.3614104790512702, + "learning_rate": 2.152074825697681e-08, + "loss": 0.2919, + "step": 33489 + }, + { + "epoch": 0.97, + "grad_norm": 1.406519338084793, + "learning_rate": 2.1477237235004722e-08, + "loss": 0.2655, + "step": 33490 + }, + { + "epoch": 0.97, + "grad_norm": 1.3152072576401508, + "learning_rate": 2.143377014855119e-08, + "loss": 0.2509, + "step": 33491 + }, + { + "epoch": 0.97, + "grad_norm": 1.4021759489745271, + "learning_rate": 2.1390346997999245e-08, + "loss": 0.2697, + "step": 33492 + }, + { + "epoch": 0.97, + "grad_norm": 1.3916140232339116, + "learning_rate": 2.1346967783733575e-08, + "loss": 0.2716, + "step": 33493 + }, + { + "epoch": 0.97, + "grad_norm": 1.4118874090889064, + "learning_rate": 2.130363250613554e-08, + "loss": 0.2565, + "step": 33494 + }, + { + "epoch": 0.97, + "grad_norm": 1.511881943364107, + "learning_rate": 2.1260341165588727e-08, + "loss": 0.2934, + "step": 33495 + }, + { + "epoch": 0.97, + "grad_norm": 1.3637646389323461, + "learning_rate": 2.121709376247394e-08, + "loss": 0.2531, + "step": 33496 + }, + { + "epoch": 0.97, + "grad_norm": 1.0155049829277385, + "learning_rate": 2.1173890297174203e-08, + "loss": 0.6197, + "step": 33497 + }, + { + "epoch": 0.97, + "grad_norm": 1.3629022703235143, + "learning_rate": 2.1130730770069772e-08, + "loss": 0.2662, + "step": 33498 + }, + { + "epoch": 0.97, + "grad_norm": 1.222349195306221, + "learning_rate": 2.108761518154201e-08, + "loss": 0.2657, + "step": 33499 + }, + { + "epoch": 0.97, + "grad_norm": 1.471024966341507, + "learning_rate": 2.1044543531971162e-08, + "loss": 0.2582, + "step": 33500 + }, + { + "epoch": 0.97, + "grad_norm": 1.6662205036416, + "learning_rate": 2.100151582173693e-08, + "loss": 0.2632, + "step": 33501 + }, + { + "epoch": 0.97, + "grad_norm": 1.3350311670863713, + "learning_rate": 2.0958532051220114e-08, + "loss": 0.2843, + "step": 33502 + }, + { + "epoch": 0.97, + "grad_norm": 1.7942820949329712, + "learning_rate": 2.0915592220799863e-08, + "loss": 0.2537, + "step": 33503 + }, + { + "epoch": 0.97, + "grad_norm": 1.379993955688055, + "learning_rate": 2.0872696330854203e-08, + "loss": 0.2696, + "step": 33504 + }, + { + "epoch": 0.97, + "grad_norm": 1.3122937569071944, + "learning_rate": 2.0829844381762277e-08, + "loss": 0.2748, + "step": 33505 + }, + { + "epoch": 0.97, + "grad_norm": 1.569703426514235, + "learning_rate": 2.0787036373902668e-08, + "loss": 0.2737, + "step": 33506 + }, + { + "epoch": 0.97, + "grad_norm": 1.3006681685990509, + "learning_rate": 2.0744272307652304e-08, + "loss": 0.2697, + "step": 33507 + }, + { + "epoch": 0.97, + "grad_norm": 1.6735748497659464, + "learning_rate": 2.0701552183389206e-08, + "loss": 0.2804, + "step": 33508 + }, + { + "epoch": 0.97, + "grad_norm": 1.394342221210976, + "learning_rate": 2.0658876001489746e-08, + "loss": 0.2692, + "step": 33509 + }, + { + "epoch": 0.97, + "grad_norm": 1.2859495958391036, + "learning_rate": 2.0616243762331402e-08, + "loss": 0.2495, + "step": 33510 + }, + { + "epoch": 0.97, + "grad_norm": 1.29906414594344, + "learning_rate": 2.0573655466289423e-08, + "loss": 0.269, + "step": 33511 + }, + { + "epoch": 0.97, + "grad_norm": 1.4095200856942318, + "learning_rate": 2.0531111113740176e-08, + "loss": 0.2661, + "step": 33512 + }, + { + "epoch": 0.97, + "grad_norm": 1.7849077545220526, + "learning_rate": 2.0488610705059476e-08, + "loss": 0.3178, + "step": 33513 + }, + { + "epoch": 0.97, + "grad_norm": 1.507519587904786, + "learning_rate": 2.044615424062202e-08, + "loss": 0.2567, + "step": 33514 + }, + { + "epoch": 0.97, + "grad_norm": 1.5265175359072831, + "learning_rate": 2.0403741720802504e-08, + "loss": 0.2577, + "step": 33515 + }, + { + "epoch": 0.97, + "grad_norm": 2.020405933117028, + "learning_rate": 2.0361373145975083e-08, + "loss": 0.2602, + "step": 33516 + }, + { + "epoch": 0.97, + "grad_norm": 1.2862698760769296, + "learning_rate": 2.0319048516513895e-08, + "loss": 0.2479, + "step": 33517 + }, + { + "epoch": 0.97, + "grad_norm": 1.3908102757619443, + "learning_rate": 2.0276767832792532e-08, + "loss": 0.2471, + "step": 33518 + }, + { + "epoch": 0.97, + "grad_norm": 1.6030963793080897, + "learning_rate": 2.0234531095183473e-08, + "loss": 0.2552, + "step": 33519 + }, + { + "epoch": 0.97, + "grad_norm": 0.9648215089590854, + "learning_rate": 2.0192338304059756e-08, + "loss": 0.5238, + "step": 33520 + }, + { + "epoch": 0.97, + "grad_norm": 1.3125626838700797, + "learning_rate": 2.015018945979441e-08, + "loss": 0.273, + "step": 33521 + }, + { + "epoch": 0.97, + "grad_norm": 1.2609171449463044, + "learning_rate": 2.010808456275881e-08, + "loss": 0.3021, + "step": 33522 + }, + { + "epoch": 0.97, + "grad_norm": 1.3838717170636587, + "learning_rate": 2.006602361332488e-08, + "loss": 0.2684, + "step": 33523 + }, + { + "epoch": 0.97, + "grad_norm": 1.4988744220314447, + "learning_rate": 2.002400661186288e-08, + "loss": 0.2633, + "step": 33524 + }, + { + "epoch": 0.97, + "grad_norm": 1.2788199460405965, + "learning_rate": 1.9982033558744728e-08, + "loss": 0.2842, + "step": 33525 + }, + { + "epoch": 0.97, + "grad_norm": 1.704011098043439, + "learning_rate": 1.9940104454340137e-08, + "loss": 0.2612, + "step": 33526 + }, + { + "epoch": 0.97, + "grad_norm": 1.3336427901434103, + "learning_rate": 1.989821929901936e-08, + "loss": 0.2636, + "step": 33527 + }, + { + "epoch": 0.97, + "grad_norm": 1.3474336595093248, + "learning_rate": 1.9856378093152108e-08, + "loss": 0.2488, + "step": 33528 + }, + { + "epoch": 0.97, + "grad_norm": 1.3324662307686392, + "learning_rate": 1.9814580837107522e-08, + "loss": 0.2523, + "step": 33529 + }, + { + "epoch": 0.97, + "grad_norm": 1.3247211622223038, + "learning_rate": 1.9772827531254755e-08, + "loss": 0.2608, + "step": 33530 + }, + { + "epoch": 0.97, + "grad_norm": 1.3358635095509581, + "learning_rate": 1.973111817596185e-08, + "loss": 0.2928, + "step": 33531 + }, + { + "epoch": 0.97, + "grad_norm": 1.3805598056573944, + "learning_rate": 1.9689452771597394e-08, + "loss": 0.2842, + "step": 33532 + }, + { + "epoch": 0.97, + "grad_norm": 1.198066532326632, + "learning_rate": 1.9647831318528322e-08, + "loss": 0.2787, + "step": 33533 + }, + { + "epoch": 0.97, + "grad_norm": 1.2507861482278226, + "learning_rate": 1.9606253817122667e-08, + "loss": 0.249, + "step": 33534 + }, + { + "epoch": 0.97, + "grad_norm": 1.4923700172141483, + "learning_rate": 1.9564720267746807e-08, + "loss": 0.24, + "step": 33535 + }, + { + "epoch": 0.97, + "grad_norm": 1.3653245446824938, + "learning_rate": 1.9523230670767667e-08, + "loss": 0.2877, + "step": 33536 + }, + { + "epoch": 0.97, + "grad_norm": 1.4240264586736588, + "learning_rate": 1.948178502655107e-08, + "loss": 0.2563, + "step": 33537 + }, + { + "epoch": 0.97, + "grad_norm": 1.228910494558089, + "learning_rate": 1.9440383335463386e-08, + "loss": 0.2559, + "step": 33538 + }, + { + "epoch": 0.97, + "grad_norm": 2.8810298120142916, + "learning_rate": 1.9399025597869324e-08, + "loss": 0.2775, + "step": 33539 + }, + { + "epoch": 0.97, + "grad_norm": 1.2355541443274904, + "learning_rate": 1.935771181413415e-08, + "loss": 0.2687, + "step": 33540 + }, + { + "epoch": 0.97, + "grad_norm": 1.6599863872465526, + "learning_rate": 1.9316441984622015e-08, + "loss": 0.2888, + "step": 33541 + }, + { + "epoch": 0.97, + "grad_norm": 1.4950739585983883, + "learning_rate": 1.927521610969818e-08, + "loss": 0.2567, + "step": 33542 + }, + { + "epoch": 0.97, + "grad_norm": 1.4980986326681287, + "learning_rate": 1.9234034189725692e-08, + "loss": 0.2643, + "step": 33543 + }, + { + "epoch": 0.97, + "grad_norm": 1.3329843479839507, + "learning_rate": 1.9192896225068147e-08, + "loss": 0.2604, + "step": 33544 + }, + { + "epoch": 0.97, + "grad_norm": 1.3555811638152362, + "learning_rate": 1.9151802216088032e-08, + "loss": 0.2506, + "step": 33545 + }, + { + "epoch": 0.97, + "grad_norm": 1.5979995749230789, + "learning_rate": 1.911075216314895e-08, + "loss": 0.2659, + "step": 33546 + }, + { + "epoch": 0.97, + "grad_norm": 1.235002088562574, + "learning_rate": 1.9069746066612828e-08, + "loss": 0.2422, + "step": 33547 + }, + { + "epoch": 0.97, + "grad_norm": 1.4783627176096106, + "learning_rate": 1.9028783926841045e-08, + "loss": 0.2637, + "step": 33548 + }, + { + "epoch": 0.97, + "grad_norm": 1.3317702912134743, + "learning_rate": 1.8987865744195532e-08, + "loss": 0.2885, + "step": 33549 + }, + { + "epoch": 0.97, + "grad_norm": 1.9763815114676517, + "learning_rate": 1.894699151903767e-08, + "loss": 0.2527, + "step": 33550 + }, + { + "epoch": 0.97, + "grad_norm": 1.258986597810186, + "learning_rate": 1.890616125172773e-08, + "loss": 0.2567, + "step": 33551 + }, + { + "epoch": 0.97, + "grad_norm": 1.4329731344963306, + "learning_rate": 1.8865374942626525e-08, + "loss": 0.2668, + "step": 33552 + }, + { + "epoch": 0.97, + "grad_norm": 1.402454214264255, + "learning_rate": 1.8824632592093216e-08, + "loss": 0.2594, + "step": 33553 + }, + { + "epoch": 0.97, + "grad_norm": 1.4724607296074388, + "learning_rate": 1.8783934200488073e-08, + "loss": 0.2723, + "step": 33554 + }, + { + "epoch": 0.97, + "grad_norm": 1.2609766700917682, + "learning_rate": 1.8743279768169697e-08, + "loss": 0.265, + "step": 33555 + }, + { + "epoch": 0.97, + "grad_norm": 1.4443048841669286, + "learning_rate": 1.8702669295497243e-08, + "loss": 0.2745, + "step": 33556 + }, + { + "epoch": 0.97, + "grad_norm": 2.00771088189835, + "learning_rate": 1.866210278282876e-08, + "loss": 0.3062, + "step": 33557 + }, + { + "epoch": 0.97, + "grad_norm": 1.3486641145416665, + "learning_rate": 1.862158023052285e-08, + "loss": 0.2621, + "step": 33558 + }, + { + "epoch": 0.97, + "grad_norm": 1.538480763250288, + "learning_rate": 1.858110163893645e-08, + "loss": 0.2731, + "step": 33559 + }, + { + "epoch": 0.97, + "grad_norm": 2.0589923354593256, + "learning_rate": 1.8540667008427048e-08, + "loss": 0.2652, + "step": 33560 + }, + { + "epoch": 0.97, + "grad_norm": 1.510365532243211, + "learning_rate": 1.8500276339352142e-08, + "loss": 0.2776, + "step": 33561 + }, + { + "epoch": 0.97, + "grad_norm": 1.3947467992012106, + "learning_rate": 1.8459929632067e-08, + "loss": 0.3079, + "step": 33562 + }, + { + "epoch": 0.97, + "grad_norm": 1.4205942637389901, + "learning_rate": 1.8419626886928e-08, + "loss": 0.2844, + "step": 33563 + }, + { + "epoch": 0.97, + "grad_norm": 1.4851793519563987, + "learning_rate": 1.8379368104291527e-08, + "loss": 0.2442, + "step": 33564 + }, + { + "epoch": 0.97, + "grad_norm": 1.660343135312786, + "learning_rate": 1.83391532845123e-08, + "loss": 0.2688, + "step": 33565 + }, + { + "epoch": 0.97, + "grad_norm": 1.3171824016320401, + "learning_rate": 1.8298982427945587e-08, + "loss": 0.2852, + "step": 33566 + }, + { + "epoch": 0.97, + "grad_norm": 1.3557344729173468, + "learning_rate": 1.8258855534944996e-08, + "loss": 0.2605, + "step": 33567 + }, + { + "epoch": 0.97, + "grad_norm": 1.386843154700344, + "learning_rate": 1.82187726058658e-08, + "loss": 0.2455, + "step": 33568 + }, + { + "epoch": 0.97, + "grad_norm": 1.2366948001758475, + "learning_rate": 1.8178733641061042e-08, + "loss": 0.251, + "step": 33569 + }, + { + "epoch": 0.97, + "grad_norm": 1.377106651285101, + "learning_rate": 1.8138738640884335e-08, + "loss": 0.2698, + "step": 33570 + }, + { + "epoch": 0.97, + "grad_norm": 1.823609869620656, + "learning_rate": 1.8098787605688174e-08, + "loss": 0.2668, + "step": 33571 + }, + { + "epoch": 0.97, + "grad_norm": 1.3826756515413678, + "learning_rate": 1.8058880535825606e-08, + "loss": 0.2658, + "step": 33572 + }, + { + "epoch": 0.97, + "grad_norm": 1.3429679353781931, + "learning_rate": 1.8019017431649134e-08, + "loss": 0.2748, + "step": 33573 + }, + { + "epoch": 0.97, + "grad_norm": 1.4414937180326675, + "learning_rate": 1.797919829350958e-08, + "loss": 0.2635, + "step": 33574 + }, + { + "epoch": 0.97, + "grad_norm": 1.2490139196852466, + "learning_rate": 1.7939423121758894e-08, + "loss": 0.2566, + "step": 33575 + }, + { + "epoch": 0.97, + "grad_norm": 1.5362508728493314, + "learning_rate": 1.7899691916747897e-08, + "loss": 0.2905, + "step": 33576 + }, + { + "epoch": 0.97, + "grad_norm": 1.4194909360923629, + "learning_rate": 1.786000467882798e-08, + "loss": 0.2906, + "step": 33577 + }, + { + "epoch": 0.97, + "grad_norm": 1.3294730888164494, + "learning_rate": 1.7820361408348307e-08, + "loss": 0.2815, + "step": 33578 + }, + { + "epoch": 0.97, + "grad_norm": 1.2525330738676683, + "learning_rate": 1.778076210565971e-08, + "loss": 0.261, + "step": 33579 + }, + { + "epoch": 0.97, + "grad_norm": 1.3618490507706114, + "learning_rate": 1.7741206771110796e-08, + "loss": 0.2733, + "step": 33580 + }, + { + "epoch": 0.97, + "grad_norm": 1.682642022988034, + "learning_rate": 1.7701695405050733e-08, + "loss": 0.2583, + "step": 33581 + }, + { + "epoch": 0.97, + "grad_norm": 1.3267481151043399, + "learning_rate": 1.7662228007828685e-08, + "loss": 0.2535, + "step": 33582 + }, + { + "epoch": 0.97, + "grad_norm": 1.3543284777506832, + "learning_rate": 1.7622804579793264e-08, + "loss": 0.2839, + "step": 33583 + }, + { + "epoch": 0.97, + "grad_norm": 1.4329109286569282, + "learning_rate": 1.7583425121291408e-08, + "loss": 0.2935, + "step": 33584 + }, + { + "epoch": 0.97, + "grad_norm": 1.3425710595201652, + "learning_rate": 1.754408963267118e-08, + "loss": 0.2793, + "step": 33585 + }, + { + "epoch": 0.97, + "grad_norm": 1.398924589402094, + "learning_rate": 1.750479811427952e-08, + "loss": 0.2642, + "step": 33586 + }, + { + "epoch": 0.97, + "grad_norm": 1.4102565502891473, + "learning_rate": 1.7465550566463373e-08, + "loss": 0.2536, + "step": 33587 + }, + { + "epoch": 0.97, + "grad_norm": 1.3000612501447772, + "learning_rate": 1.742634698956913e-08, + "loss": 0.2824, + "step": 33588 + }, + { + "epoch": 0.97, + "grad_norm": 1.2620580022896943, + "learning_rate": 1.738718738394207e-08, + "loss": 0.2639, + "step": 33589 + }, + { + "epoch": 0.97, + "grad_norm": 1.2394404406205182, + "learning_rate": 1.734807174992914e-08, + "loss": 0.2464, + "step": 33590 + }, + { + "epoch": 0.97, + "grad_norm": 18.45742587146181, + "learning_rate": 1.7309000087874506e-08, + "loss": 0.2724, + "step": 33591 + }, + { + "epoch": 0.97, + "grad_norm": 1.856980360570333, + "learning_rate": 1.7269972398122892e-08, + "loss": 0.2704, + "step": 33592 + }, + { + "epoch": 0.97, + "grad_norm": 1.5314534212602569, + "learning_rate": 1.723098868101958e-08, + "loss": 0.2786, + "step": 33593 + }, + { + "epoch": 0.97, + "grad_norm": 1.3706303475869732, + "learning_rate": 1.7192048936907624e-08, + "loss": 0.2829, + "step": 33594 + }, + { + "epoch": 0.97, + "grad_norm": 1.8156336967558024, + "learning_rate": 1.7153153166131196e-08, + "loss": 0.2708, + "step": 33595 + }, + { + "epoch": 0.97, + "grad_norm": 1.4286571680624547, + "learning_rate": 1.7114301369033915e-08, + "loss": 0.2766, + "step": 33596 + }, + { + "epoch": 0.97, + "grad_norm": 1.4746985035358278, + "learning_rate": 1.707549354595772e-08, + "loss": 0.2564, + "step": 33597 + }, + { + "epoch": 0.97, + "grad_norm": 1.4554004395324471, + "learning_rate": 1.703672969724568e-08, + "loss": 0.2646, + "step": 33598 + }, + { + "epoch": 0.97, + "grad_norm": 1.6663897021740701, + "learning_rate": 1.6998009823239736e-08, + "loss": 0.2665, + "step": 33599 + }, + { + "epoch": 0.97, + "grad_norm": 1.353100561610395, + "learning_rate": 1.695933392428184e-08, + "loss": 0.2561, + "step": 33600 + }, + { + "epoch": 0.97, + "grad_norm": 1.9698994896581388, + "learning_rate": 1.692070200071283e-08, + "loss": 0.3075, + "step": 33601 + }, + { + "epoch": 0.97, + "grad_norm": 1.3387722609093886, + "learning_rate": 1.6882114052874098e-08, + "loss": 0.266, + "step": 33602 + }, + { + "epoch": 0.97, + "grad_norm": 1.268341437334605, + "learning_rate": 1.684357008110593e-08, + "loss": 0.2609, + "step": 33603 + }, + { + "epoch": 0.97, + "grad_norm": 1.3940232823215948, + "learning_rate": 1.6805070085748053e-08, + "loss": 0.2477, + "step": 33604 + }, + { + "epoch": 0.97, + "grad_norm": 1.2866880309004947, + "learning_rate": 1.6766614067141307e-08, + "loss": 0.2565, + "step": 33605 + }, + { + "epoch": 0.97, + "grad_norm": 1.743470770477161, + "learning_rate": 1.6728202025624306e-08, + "loss": 0.2807, + "step": 33606 + }, + { + "epoch": 0.97, + "grad_norm": 1.2236061099454159, + "learning_rate": 1.6689833961536227e-08, + "loss": 0.265, + "step": 33607 + }, + { + "epoch": 0.97, + "grad_norm": 1.4681049470898535, + "learning_rate": 1.6651509875215687e-08, + "loss": 0.2834, + "step": 33608 + }, + { + "epoch": 0.97, + "grad_norm": 1.4929946322581675, + "learning_rate": 1.661322976700075e-08, + "loss": 0.269, + "step": 33609 + }, + { + "epoch": 0.97, + "grad_norm": 1.6080331304869948, + "learning_rate": 1.6574993637229476e-08, + "loss": 0.2678, + "step": 33610 + }, + { + "epoch": 0.97, + "grad_norm": 1.6083341239413957, + "learning_rate": 1.6536801486239373e-08, + "loss": 0.2915, + "step": 33611 + }, + { + "epoch": 0.97, + "grad_norm": 1.5034765738422, + "learning_rate": 1.649865331436684e-08, + "loss": 0.2871, + "step": 33612 + }, + { + "epoch": 0.97, + "grad_norm": 1.3096662959148695, + "learning_rate": 1.6460549121948832e-08, + "loss": 0.2677, + "step": 33613 + }, + { + "epoch": 0.97, + "grad_norm": 1.3248372690438859, + "learning_rate": 1.6422488909322297e-08, + "loss": 0.276, + "step": 33614 + }, + { + "epoch": 0.97, + "grad_norm": 1.79086684430568, + "learning_rate": 1.6384472676822528e-08, + "loss": 0.2692, + "step": 33615 + }, + { + "epoch": 0.98, + "grad_norm": 2.3608077808588637, + "learning_rate": 1.6346500424784806e-08, + "loss": 0.2711, + "step": 33616 + }, + { + "epoch": 0.98, + "grad_norm": 1.4471142659875358, + "learning_rate": 1.6308572153544422e-08, + "loss": 0.2617, + "step": 33617 + }, + { + "epoch": 0.98, + "grad_norm": 1.4002155864556654, + "learning_rate": 1.6270687863436663e-08, + "loss": 0.2681, + "step": 33618 + }, + { + "epoch": 0.98, + "grad_norm": 1.459247609943742, + "learning_rate": 1.623284755479515e-08, + "loss": 0.2603, + "step": 33619 + }, + { + "epoch": 0.98, + "grad_norm": 1.3070251528977104, + "learning_rate": 1.6195051227954063e-08, + "loss": 0.2698, + "step": 33620 + }, + { + "epoch": 0.98, + "grad_norm": 1.2439186034088863, + "learning_rate": 1.6157298883246464e-08, + "loss": 0.2901, + "step": 33621 + }, + { + "epoch": 0.98, + "grad_norm": 1.3238005013514837, + "learning_rate": 1.6119590521006534e-08, + "loss": 0.2683, + "step": 33622 + }, + { + "epoch": 0.98, + "grad_norm": 1.395647676484335, + "learning_rate": 1.608192614156623e-08, + "loss": 0.3016, + "step": 33623 + }, + { + "epoch": 0.98, + "grad_norm": 1.61337598807368, + "learning_rate": 1.6044305745258616e-08, + "loss": 0.2711, + "step": 33624 + }, + { + "epoch": 0.98, + "grad_norm": 1.388839124815755, + "learning_rate": 1.6006729332415094e-08, + "loss": 0.252, + "step": 33625 + }, + { + "epoch": 0.98, + "grad_norm": 1.2944391483896778, + "learning_rate": 1.596919690336707e-08, + "loss": 0.2772, + "step": 33626 + }, + { + "epoch": 0.98, + "grad_norm": 1.4779542590325723, + "learning_rate": 1.5931708458446494e-08, + "loss": 0.2874, + "step": 33627 + }, + { + "epoch": 0.98, + "grad_norm": 2.3631131454485366, + "learning_rate": 1.5894263997984215e-08, + "loss": 0.2479, + "step": 33628 + }, + { + "epoch": 0.98, + "grad_norm": 1.643320956191123, + "learning_rate": 1.5856863522309973e-08, + "loss": 0.2512, + "step": 33629 + }, + { + "epoch": 0.98, + "grad_norm": 1.3579015910958259, + "learning_rate": 1.5819507031754055e-08, + "loss": 0.25, + "step": 33630 + }, + { + "epoch": 0.98, + "grad_norm": 1.4852254509934306, + "learning_rate": 1.57821945266462e-08, + "loss": 0.2604, + "step": 33631 + }, + { + "epoch": 0.98, + "grad_norm": 1.2911862314174574, + "learning_rate": 1.574492600731614e-08, + "loss": 0.2926, + "step": 33632 + }, + { + "epoch": 0.98, + "grad_norm": 1.4220622001352239, + "learning_rate": 1.5707701474092508e-08, + "loss": 0.2802, + "step": 33633 + }, + { + "epoch": 0.98, + "grad_norm": 1.4827082265672187, + "learning_rate": 1.567052092730337e-08, + "loss": 0.2899, + "step": 33634 + }, + { + "epoch": 0.98, + "grad_norm": 1.3099354033585608, + "learning_rate": 1.563338436727735e-08, + "loss": 0.2807, + "step": 33635 + }, + { + "epoch": 0.98, + "grad_norm": 1.2864183032152274, + "learning_rate": 1.5596291794341965e-08, + "loss": 0.2794, + "step": 33636 + }, + { + "epoch": 0.98, + "grad_norm": 1.3543453864248998, + "learning_rate": 1.5559243208824183e-08, + "loss": 0.2745, + "step": 33637 + }, + { + "epoch": 0.98, + "grad_norm": 1.455416493395538, + "learning_rate": 1.5522238611052064e-08, + "loss": 0.2732, + "step": 33638 + }, + { + "epoch": 0.98, + "grad_norm": 1.3231127258264248, + "learning_rate": 1.548527800135091e-08, + "loss": 0.2623, + "step": 33639 + }, + { + "epoch": 0.98, + "grad_norm": 1.5185187178887567, + "learning_rate": 1.5448361380047682e-08, + "loss": 0.2568, + "step": 33640 + }, + { + "epoch": 0.98, + "grad_norm": 1.2890729920156017, + "learning_rate": 1.541148874746823e-08, + "loss": 0.2763, + "step": 33641 + }, + { + "epoch": 0.98, + "grad_norm": 1.2423958463527565, + "learning_rate": 1.5374660103937842e-08, + "loss": 0.2638, + "step": 33642 + }, + { + "epoch": 0.98, + "grad_norm": 1.3374646636529608, + "learning_rate": 1.5337875449780713e-08, + "loss": 0.2684, + "step": 33643 + }, + { + "epoch": 0.98, + "grad_norm": 1.5580784423452645, + "learning_rate": 1.530113478532269e-08, + "loss": 0.2994, + "step": 33644 + }, + { + "epoch": 0.98, + "grad_norm": 1.3455789525180946, + "learning_rate": 1.5264438110886848e-08, + "loss": 0.2762, + "step": 33645 + }, + { + "epoch": 0.98, + "grad_norm": 1.5043089008037753, + "learning_rate": 1.5227785426797926e-08, + "loss": 0.2403, + "step": 33646 + }, + { + "epoch": 0.98, + "grad_norm": 1.3318817796356792, + "learning_rate": 1.5191176733379552e-08, + "loss": 0.2678, + "step": 33647 + }, + { + "epoch": 0.98, + "grad_norm": 1.426298911721203, + "learning_rate": 1.51546120309537e-08, + "loss": 0.2755, + "step": 33648 + }, + { + "epoch": 0.98, + "grad_norm": 1.247475993585967, + "learning_rate": 1.5118091319843985e-08, + "loss": 0.2622, + "step": 33649 + }, + { + "epoch": 0.98, + "grad_norm": 1.2140103453935116, + "learning_rate": 1.5081614600372385e-08, + "loss": 0.2745, + "step": 33650 + }, + { + "epoch": 0.98, + "grad_norm": 1.3630813446543595, + "learning_rate": 1.504518187286086e-08, + "loss": 0.2586, + "step": 33651 + }, + { + "epoch": 0.98, + "grad_norm": 1.6362536469373277, + "learning_rate": 1.5008793137630816e-08, + "loss": 0.2601, + "step": 33652 + }, + { + "epoch": 0.98, + "grad_norm": 1.4224082145792207, + "learning_rate": 1.4972448395003668e-08, + "loss": 0.2655, + "step": 33653 + }, + { + "epoch": 0.98, + "grad_norm": 1.448229421087482, + "learning_rate": 1.4936147645299715e-08, + "loss": 0.2918, + "step": 33654 + }, + { + "epoch": 0.98, + "grad_norm": 1.4604575086867462, + "learning_rate": 1.4899890888839808e-08, + "loss": 0.2654, + "step": 33655 + }, + { + "epoch": 0.98, + "grad_norm": 1.3435318456958165, + "learning_rate": 1.4863678125943693e-08, + "loss": 0.2997, + "step": 33656 + }, + { + "epoch": 0.98, + "grad_norm": 1.3930179020423772, + "learning_rate": 1.4827509356930558e-08, + "loss": 0.2598, + "step": 33657 + }, + { + "epoch": 0.98, + "grad_norm": 1.4087553038441147, + "learning_rate": 1.479138458212015e-08, + "loss": 0.3002, + "step": 33658 + }, + { + "epoch": 0.98, + "grad_norm": 1.2261244243402594, + "learning_rate": 1.47553038018311e-08, + "loss": 0.2663, + "step": 33659 + }, + { + "epoch": 0.98, + "grad_norm": 1.8691951300432368, + "learning_rate": 1.4719267016382044e-08, + "loss": 0.2677, + "step": 33660 + }, + { + "epoch": 0.98, + "grad_norm": 1.4800110631989507, + "learning_rate": 1.4683274226090505e-08, + "loss": 0.2652, + "step": 33661 + }, + { + "epoch": 0.98, + "grad_norm": 1.7789416350990654, + "learning_rate": 1.4647325431274006e-08, + "loss": 0.2725, + "step": 33662 + }, + { + "epoch": 0.98, + "grad_norm": 1.2521225439013728, + "learning_rate": 1.4611420632250628e-08, + "loss": 0.2468, + "step": 33663 + }, + { + "epoch": 0.98, + "grad_norm": 3.436982174463467, + "learning_rate": 1.4575559829336782e-08, + "loss": 0.287, + "step": 33664 + }, + { + "epoch": 0.98, + "grad_norm": 1.5951861297462346, + "learning_rate": 1.4539743022848329e-08, + "loss": 0.2547, + "step": 33665 + }, + { + "epoch": 0.98, + "grad_norm": 1.3022947849162563, + "learning_rate": 1.4503970213102792e-08, + "loss": 0.2754, + "step": 33666 + }, + { + "epoch": 0.98, + "grad_norm": 1.4062677853709458, + "learning_rate": 1.4468241400414363e-08, + "loss": 0.2647, + "step": 33667 + }, + { + "epoch": 0.98, + "grad_norm": 1.3897984819713836, + "learning_rate": 1.4432556585099455e-08, + "loss": 0.2732, + "step": 33668 + }, + { + "epoch": 0.98, + "grad_norm": 1.4816938703987814, + "learning_rate": 1.4396915767472263e-08, + "loss": 0.2629, + "step": 33669 + }, + { + "epoch": 0.98, + "grad_norm": 1.3488734375664142, + "learning_rate": 1.4361318947848092e-08, + "loss": 0.2732, + "step": 33670 + }, + { + "epoch": 0.98, + "grad_norm": 1.364162480368608, + "learning_rate": 1.4325766126540019e-08, + "loss": 0.2828, + "step": 33671 + }, + { + "epoch": 0.98, + "grad_norm": 1.5554163457551304, + "learning_rate": 1.4290257303862798e-08, + "loss": 0.2796, + "step": 33672 + }, + { + "epoch": 0.98, + "grad_norm": 1.7700205062345205, + "learning_rate": 1.4254792480128954e-08, + "loss": 0.2655, + "step": 33673 + }, + { + "epoch": 0.98, + "grad_norm": 1.3120987785160687, + "learning_rate": 1.4219371655652125e-08, + "loss": 0.2473, + "step": 33674 + }, + { + "epoch": 0.98, + "grad_norm": 1.3375713360464883, + "learning_rate": 1.4183994830744285e-08, + "loss": 0.2649, + "step": 33675 + }, + { + "epoch": 0.98, + "grad_norm": 1.2934087409140598, + "learning_rate": 1.4148662005718516e-08, + "loss": 0.2658, + "step": 33676 + }, + { + "epoch": 0.98, + "grad_norm": 1.4146868232777767, + "learning_rate": 1.4113373180886237e-08, + "loss": 0.2774, + "step": 33677 + }, + { + "epoch": 0.98, + "grad_norm": 1.4353863034566485, + "learning_rate": 1.4078128356558307e-08, + "loss": 0.2937, + "step": 33678 + }, + { + "epoch": 0.98, + "grad_norm": 4.851800535928512, + "learning_rate": 1.4042927533046147e-08, + "loss": 0.2857, + "step": 33679 + }, + { + "epoch": 0.98, + "grad_norm": 1.4088051240703365, + "learning_rate": 1.4007770710660618e-08, + "loss": 0.2766, + "step": 33680 + }, + { + "epoch": 0.98, + "grad_norm": 1.4642972843369255, + "learning_rate": 1.3972657889712026e-08, + "loss": 0.2778, + "step": 33681 + }, + { + "epoch": 0.98, + "grad_norm": 1.2120232942688982, + "learning_rate": 1.3937589070509572e-08, + "loss": 0.2594, + "step": 33682 + }, + { + "epoch": 0.98, + "grad_norm": 1.3296983329657652, + "learning_rate": 1.390256425336356e-08, + "loss": 0.253, + "step": 33683 + }, + { + "epoch": 0.98, + "grad_norm": 1.2773425235475733, + "learning_rate": 1.3867583438582632e-08, + "loss": 0.2511, + "step": 33684 + }, + { + "epoch": 0.98, + "grad_norm": 2.2698762450724423, + "learning_rate": 1.3832646626475433e-08, + "loss": 0.2522, + "step": 33685 + }, + { + "epoch": 0.98, + "grad_norm": 1.554079202580296, + "learning_rate": 1.3797753817350601e-08, + "loss": 0.2773, + "step": 33686 + }, + { + "epoch": 0.98, + "grad_norm": 1.3601933529086743, + "learning_rate": 1.3762905011516225e-08, + "loss": 0.264, + "step": 33687 + }, + { + "epoch": 0.98, + "grad_norm": 1.4695079500587807, + "learning_rate": 1.3728100209279282e-08, + "loss": 0.27, + "step": 33688 + }, + { + "epoch": 0.98, + "grad_norm": 1.612399834917973, + "learning_rate": 1.3693339410947303e-08, + "loss": 0.2645, + "step": 33689 + }, + { + "epoch": 0.98, + "grad_norm": 1.4863527354759827, + "learning_rate": 1.365862261682671e-08, + "loss": 0.2685, + "step": 33690 + }, + { + "epoch": 0.98, + "grad_norm": 1.3741004268967865, + "learning_rate": 1.3623949827223926e-08, + "loss": 0.3028, + "step": 33691 + }, + { + "epoch": 0.98, + "grad_norm": 1.8965728049131245, + "learning_rate": 1.3589321042445368e-08, + "loss": 0.2624, + "step": 33692 + }, + { + "epoch": 0.98, + "grad_norm": 1.309499200721284, + "learning_rate": 1.3554736262796352e-08, + "loss": 0.2881, + "step": 33693 + }, + { + "epoch": 0.98, + "grad_norm": 1.3330253746952485, + "learning_rate": 1.3520195488581633e-08, + "loss": 0.2567, + "step": 33694 + }, + { + "epoch": 0.98, + "grad_norm": 1.4732426164349328, + "learning_rate": 1.3485698720107077e-08, + "loss": 0.2864, + "step": 33695 + }, + { + "epoch": 0.98, + "grad_norm": 1.9172288681330205, + "learning_rate": 1.345124595767633e-08, + "loss": 0.2572, + "step": 33696 + }, + { + "epoch": 0.98, + "grad_norm": 1.5267757401773419, + "learning_rate": 1.3416837201593591e-08, + "loss": 0.3054, + "step": 33697 + }, + { + "epoch": 0.98, + "grad_norm": 1.6543539414350024, + "learning_rate": 1.3382472452163065e-08, + "loss": 0.2865, + "step": 33698 + }, + { + "epoch": 0.98, + "grad_norm": 1.271088738035845, + "learning_rate": 1.3348151709687284e-08, + "loss": 0.2807, + "step": 33699 + }, + { + "epoch": 0.98, + "grad_norm": 1.29101894803478, + "learning_rate": 1.3313874974468788e-08, + "loss": 0.2732, + "step": 33700 + }, + { + "epoch": 0.98, + "grad_norm": 1.3058918872361702, + "learning_rate": 1.327964224681122e-08, + "loss": 0.251, + "step": 33701 + }, + { + "epoch": 0.98, + "grad_norm": 1.3808107665610108, + "learning_rate": 1.3245453527016006e-08, + "loss": 0.2606, + "step": 33702 + }, + { + "epoch": 0.98, + "grad_norm": 1.5266141109534208, + "learning_rate": 1.3211308815385127e-08, + "loss": 0.2676, + "step": 33703 + }, + { + "epoch": 0.98, + "grad_norm": 1.4359295464521156, + "learning_rate": 1.3177208112220008e-08, + "loss": 0.2784, + "step": 33704 + }, + { + "epoch": 0.98, + "grad_norm": 1.044213607104474, + "learning_rate": 1.3143151417820965e-08, + "loss": 0.6253, + "step": 33705 + }, + { + "epoch": 0.98, + "grad_norm": 1.3841705820044576, + "learning_rate": 1.3109138732488868e-08, + "loss": 0.2779, + "step": 33706 + }, + { + "epoch": 0.98, + "grad_norm": 1.9278167943169415, + "learning_rate": 1.3075170056524033e-08, + "loss": 0.3027, + "step": 33707 + }, + { + "epoch": 0.98, + "grad_norm": 1.3011058860842564, + "learning_rate": 1.304124539022622e-08, + "loss": 0.2883, + "step": 33708 + }, + { + "epoch": 0.98, + "grad_norm": 1.3133376316478567, + "learning_rate": 1.3007364733894635e-08, + "loss": 0.2617, + "step": 33709 + }, + { + "epoch": 0.98, + "grad_norm": 1.3959945586440294, + "learning_rate": 1.297352808782848e-08, + "loss": 0.2604, + "step": 33710 + }, + { + "epoch": 0.98, + "grad_norm": 1.3881008245376874, + "learning_rate": 1.2939735452326407e-08, + "loss": 0.2586, + "step": 33711 + }, + { + "epoch": 0.98, + "grad_norm": 1.2840751694850752, + "learning_rate": 1.2905986827685957e-08, + "loss": 0.2779, + "step": 33712 + }, + { + "epoch": 0.98, + "grad_norm": 1.2557333053373056, + "learning_rate": 1.2872282214205778e-08, + "loss": 0.251, + "step": 33713 + }, + { + "epoch": 0.98, + "grad_norm": 1.5705590506045644, + "learning_rate": 1.2838621612183411e-08, + "loss": 0.3066, + "step": 33714 + }, + { + "epoch": 0.98, + "grad_norm": 0.9377812339708921, + "learning_rate": 1.2805005021915285e-08, + "loss": 0.5165, + "step": 33715 + }, + { + "epoch": 0.98, + "grad_norm": 1.5725810670074745, + "learning_rate": 1.2771432443698385e-08, + "loss": 0.2694, + "step": 33716 + }, + { + "epoch": 0.98, + "grad_norm": 1.4814096462874748, + "learning_rate": 1.2737903877828584e-08, + "loss": 0.2749, + "step": 33717 + }, + { + "epoch": 0.98, + "grad_norm": 4.322942964728281, + "learning_rate": 1.2704419324602312e-08, + "loss": 0.2901, + "step": 33718 + }, + { + "epoch": 0.98, + "grad_norm": 1.3175767708649968, + "learning_rate": 1.2670978784314892e-08, + "loss": 0.28, + "step": 33719 + }, + { + "epoch": 0.98, + "grad_norm": 1.488447610312495, + "learning_rate": 1.2637582257261083e-08, + "loss": 0.2704, + "step": 33720 + }, + { + "epoch": 0.98, + "grad_norm": 0.9682642291937663, + "learning_rate": 1.2604229743736207e-08, + "loss": 0.5773, + "step": 33721 + }, + { + "epoch": 0.98, + "grad_norm": 1.368622196661547, + "learning_rate": 1.2570921244034474e-08, + "loss": 0.2668, + "step": 33722 + }, + { + "epoch": 0.98, + "grad_norm": 2.5729855508527186, + "learning_rate": 1.2537656758449534e-08, + "loss": 0.2714, + "step": 33723 + }, + { + "epoch": 0.98, + "grad_norm": 1.4016087656780976, + "learning_rate": 1.2504436287275046e-08, + "loss": 0.2507, + "step": 33724 + }, + { + "epoch": 0.98, + "grad_norm": 1.500974314589333, + "learning_rate": 1.247125983080466e-08, + "loss": 0.2569, + "step": 33725 + }, + { + "epoch": 0.98, + "grad_norm": 1.2154747933618237, + "learning_rate": 1.2438127389330367e-08, + "loss": 0.2696, + "step": 33726 + }, + { + "epoch": 0.98, + "grad_norm": 1.6376914310775985, + "learning_rate": 1.2405038963144711e-08, + "loss": 0.2748, + "step": 33727 + }, + { + "epoch": 0.98, + "grad_norm": 1.4626386379595946, + "learning_rate": 1.2371994552540234e-08, + "loss": 0.2878, + "step": 33728 + }, + { + "epoch": 0.98, + "grad_norm": 1.3490178992445097, + "learning_rate": 1.2338994157808371e-08, + "loss": 0.2439, + "step": 33729 + }, + { + "epoch": 0.98, + "grad_norm": 1.3780718936859682, + "learning_rate": 1.2306037779239998e-08, + "loss": 0.2534, + "step": 33730 + }, + { + "epoch": 0.98, + "grad_norm": 1.4259741948773852, + "learning_rate": 1.2273125417125997e-08, + "loss": 0.2775, + "step": 33731 + }, + { + "epoch": 0.98, + "grad_norm": 1.3904036114186127, + "learning_rate": 1.2240257071757245e-08, + "loss": 0.2719, + "step": 33732 + }, + { + "epoch": 0.98, + "grad_norm": 1.601978720032597, + "learning_rate": 1.2207432743423508e-08, + "loss": 0.2744, + "step": 33733 + }, + { + "epoch": 0.98, + "grad_norm": 1.3922199808278446, + "learning_rate": 1.2174652432414558e-08, + "loss": 0.2717, + "step": 33734 + }, + { + "epoch": 0.98, + "grad_norm": 0.8983501095371605, + "learning_rate": 1.2141916139019605e-08, + "loss": 0.5358, + "step": 33735 + }, + { + "epoch": 0.98, + "grad_norm": 1.4577734983295831, + "learning_rate": 1.2109223863527864e-08, + "loss": 0.2752, + "step": 33736 + }, + { + "epoch": 0.98, + "grad_norm": 1.2484988223178197, + "learning_rate": 1.2076575606226882e-08, + "loss": 0.2748, + "step": 33737 + }, + { + "epoch": 0.98, + "grad_norm": 1.4488438021035033, + "learning_rate": 1.204397136740587e-08, + "loss": 0.272, + "step": 33738 + }, + { + "epoch": 0.98, + "grad_norm": 1.6254380523901628, + "learning_rate": 1.2011411147351826e-08, + "loss": 0.2723, + "step": 33739 + }, + { + "epoch": 0.98, + "grad_norm": 1.3933719561966365, + "learning_rate": 1.1978894946352848e-08, + "loss": 0.258, + "step": 33740 + }, + { + "epoch": 0.98, + "grad_norm": 5.401167695802202, + "learning_rate": 1.1946422764695376e-08, + "loss": 0.2944, + "step": 33741 + }, + { + "epoch": 0.98, + "grad_norm": 1.3483413558282487, + "learning_rate": 1.1913994602665846e-08, + "loss": 0.258, + "step": 33742 + }, + { + "epoch": 0.98, + "grad_norm": 1.7502094024018537, + "learning_rate": 1.1881610460550696e-08, + "loss": 0.2989, + "step": 33743 + }, + { + "epoch": 0.98, + "grad_norm": 1.3422808625949132, + "learning_rate": 1.184927033863581e-08, + "loss": 0.2914, + "step": 33744 + }, + { + "epoch": 0.98, + "grad_norm": 1.2950588271315957, + "learning_rate": 1.1816974237206514e-08, + "loss": 0.2449, + "step": 33745 + }, + { + "epoch": 0.98, + "grad_norm": 1.444882717812843, + "learning_rate": 1.1784722156547579e-08, + "loss": 0.278, + "step": 33746 + }, + { + "epoch": 0.98, + "grad_norm": 4.749464333051369, + "learning_rate": 1.175251409694378e-08, + "loss": 0.2727, + "step": 33747 + }, + { + "epoch": 0.98, + "grad_norm": 1.22075913875352, + "learning_rate": 1.172035005867933e-08, + "loss": 0.2487, + "step": 33748 + }, + { + "epoch": 0.98, + "grad_norm": 1.3113553190712144, + "learning_rate": 1.1688230042038451e-08, + "loss": 0.2539, + "step": 33749 + }, + { + "epoch": 0.98, + "grad_norm": 1.3952119581053872, + "learning_rate": 1.1656154047303691e-08, + "loss": 0.2735, + "step": 33750 + }, + { + "epoch": 0.98, + "grad_norm": 2.6037967716320396, + "learning_rate": 1.162412207475927e-08, + "loss": 0.2931, + "step": 33751 + }, + { + "epoch": 0.98, + "grad_norm": 3.2836241367669485, + "learning_rate": 1.1592134124687182e-08, + "loss": 0.2762, + "step": 33752 + }, + { + "epoch": 0.98, + "grad_norm": 1.5462033419310357, + "learning_rate": 1.1560190197369981e-08, + "loss": 0.2917, + "step": 33753 + }, + { + "epoch": 0.98, + "grad_norm": 1.441805015536533, + "learning_rate": 1.1528290293089107e-08, + "loss": 0.2762, + "step": 33754 + }, + { + "epoch": 0.98, + "grad_norm": 1.317065706837969, + "learning_rate": 1.1496434412126556e-08, + "loss": 0.2735, + "step": 33755 + }, + { + "epoch": 0.98, + "grad_norm": 1.063391847425162, + "learning_rate": 1.1464622554763215e-08, + "loss": 0.5608, + "step": 33756 + }, + { + "epoch": 0.98, + "grad_norm": 1.3483886391496311, + "learning_rate": 1.1432854721279973e-08, + "loss": 0.2778, + "step": 33757 + }, + { + "epoch": 0.98, + "grad_norm": 1.3213563820840077, + "learning_rate": 1.1401130911957158e-08, + "loss": 0.2623, + "step": 33758 + }, + { + "epoch": 0.98, + "grad_norm": 1.3408996097327848, + "learning_rate": 1.1369451127075104e-08, + "loss": 0.2634, + "step": 33759 + }, + { + "epoch": 0.98, + "grad_norm": 1.3910881583524515, + "learning_rate": 1.1337815366912475e-08, + "loss": 0.2535, + "step": 33760 + }, + { + "epoch": 0.98, + "grad_norm": 1.3325586370852385, + "learning_rate": 1.1306223631749047e-08, + "loss": 0.2775, + "step": 33761 + }, + { + "epoch": 0.98, + "grad_norm": 1.84288615323071, + "learning_rate": 1.1274675921863486e-08, + "loss": 0.274, + "step": 33762 + }, + { + "epoch": 0.98, + "grad_norm": 1.3251787346891366, + "learning_rate": 1.124317223753446e-08, + "loss": 0.2453, + "step": 33763 + }, + { + "epoch": 0.98, + "grad_norm": 1.2643145717234088, + "learning_rate": 1.1211712579039524e-08, + "loss": 0.2557, + "step": 33764 + }, + { + "epoch": 0.98, + "grad_norm": 1.6811204287880497, + "learning_rate": 1.1180296946656788e-08, + "loss": 0.267, + "step": 33765 + }, + { + "epoch": 0.98, + "grad_norm": 1.1604303705998091, + "learning_rate": 1.1148925340663253e-08, + "loss": 0.2678, + "step": 33766 + }, + { + "epoch": 0.98, + "grad_norm": 1.5204875426901054, + "learning_rate": 1.1117597761335919e-08, + "loss": 0.2664, + "step": 33767 + }, + { + "epoch": 0.98, + "grad_norm": 1.2444542127846405, + "learning_rate": 1.1086314208950677e-08, + "loss": 0.2604, + "step": 33768 + }, + { + "epoch": 0.98, + "grad_norm": 1.3679104483314186, + "learning_rate": 1.1055074683784528e-08, + "loss": 0.3012, + "step": 33769 + }, + { + "epoch": 0.98, + "grad_norm": 1.7113199371842835, + "learning_rate": 1.1023879186112807e-08, + "loss": 0.2776, + "step": 33770 + }, + { + "epoch": 0.98, + "grad_norm": 1.4751086029783136, + "learning_rate": 1.0992727716210294e-08, + "loss": 0.2806, + "step": 33771 + }, + { + "epoch": 0.98, + "grad_norm": 1.4161244472229049, + "learning_rate": 1.0961620274352324e-08, + "loss": 0.2591, + "step": 33772 + }, + { + "epoch": 0.98, + "grad_norm": 1.323640188556527, + "learning_rate": 1.0930556860813124e-08, + "loss": 0.2753, + "step": 33773 + }, + { + "epoch": 0.98, + "grad_norm": 1.3196129726598933, + "learning_rate": 1.0899537475867472e-08, + "loss": 0.2659, + "step": 33774 + }, + { + "epoch": 0.98, + "grad_norm": 1.4507328093861127, + "learning_rate": 1.086856211978904e-08, + "loss": 0.2565, + "step": 33775 + }, + { + "epoch": 0.98, + "grad_norm": 1.2086463475759852, + "learning_rate": 1.0837630792850384e-08, + "loss": 0.2626, + "step": 33776 + }, + { + "epoch": 0.98, + "grad_norm": 3.10732225462441, + "learning_rate": 1.0806743495325179e-08, + "loss": 0.2807, + "step": 33777 + }, + { + "epoch": 0.98, + "grad_norm": 1.3310816377337988, + "learning_rate": 1.077590022748598e-08, + "loss": 0.2679, + "step": 33778 + }, + { + "epoch": 0.98, + "grad_norm": 1.3475132527503528, + "learning_rate": 1.0745100989604795e-08, + "loss": 0.2665, + "step": 33779 + }, + { + "epoch": 0.98, + "grad_norm": 1.211887850964014, + "learning_rate": 1.0714345781953073e-08, + "loss": 0.2632, + "step": 33780 + }, + { + "epoch": 0.98, + "grad_norm": 1.470519469348734, + "learning_rate": 1.0683634604802817e-08, + "loss": 0.2597, + "step": 33781 + }, + { + "epoch": 0.98, + "grad_norm": 1.308870789840587, + "learning_rate": 1.0652967458424924e-08, + "loss": 0.2593, + "step": 33782 + }, + { + "epoch": 0.98, + "grad_norm": 1.3959624637280397, + "learning_rate": 1.062234434308973e-08, + "loss": 0.2627, + "step": 33783 + }, + { + "epoch": 0.98, + "grad_norm": 2.6919927946308353, + "learning_rate": 1.0591765259067577e-08, + "loss": 0.2556, + "step": 33784 + }, + { + "epoch": 0.98, + "grad_norm": 1.3012788825025416, + "learning_rate": 1.0561230206628804e-08, + "loss": 0.2696, + "step": 33785 + }, + { + "epoch": 0.98, + "grad_norm": 1.5006055402897636, + "learning_rate": 1.0530739186042637e-08, + "loss": 0.251, + "step": 33786 + }, + { + "epoch": 0.98, + "grad_norm": 1.4748667815256464, + "learning_rate": 1.0500292197577756e-08, + "loss": 0.2748, + "step": 33787 + }, + { + "epoch": 0.98, + "grad_norm": 1.3123947167160392, + "learning_rate": 1.046988924150283e-08, + "loss": 0.2659, + "step": 33788 + }, + { + "epoch": 0.98, + "grad_norm": 1.3608329824701264, + "learning_rate": 1.0439530318087087e-08, + "loss": 0.2605, + "step": 33789 + }, + { + "epoch": 0.98, + "grad_norm": 1.3276206901079923, + "learning_rate": 1.0409215427597541e-08, + "loss": 0.2382, + "step": 33790 + }, + { + "epoch": 0.98, + "grad_norm": 1.5949122148676682, + "learning_rate": 1.0378944570302307e-08, + "loss": 0.3183, + "step": 33791 + }, + { + "epoch": 0.98, + "grad_norm": 1.3227235851687147, + "learning_rate": 1.034871774646784e-08, + "loss": 0.2653, + "step": 33792 + }, + { + "epoch": 0.98, + "grad_norm": 1.2150716476031191, + "learning_rate": 1.0318534956361703e-08, + "loss": 0.25, + "step": 33793 + }, + { + "epoch": 0.98, + "grad_norm": 1.3444284825954897, + "learning_rate": 1.028839620024924e-08, + "loss": 0.2723, + "step": 33794 + }, + { + "epoch": 0.98, + "grad_norm": 1.7319842182025837, + "learning_rate": 1.0258301478397459e-08, + "loss": 0.2532, + "step": 33795 + }, + { + "epoch": 0.98, + "grad_norm": 1.3760006321737586, + "learning_rate": 1.0228250791071703e-08, + "loss": 0.2592, + "step": 33796 + }, + { + "epoch": 0.98, + "grad_norm": 1.4034653303211453, + "learning_rate": 1.0198244138536761e-08, + "loss": 0.2636, + "step": 33797 + }, + { + "epoch": 0.98, + "grad_norm": 1.4011908498796903, + "learning_rate": 1.0168281521057422e-08, + "loss": 0.2511, + "step": 33798 + }, + { + "epoch": 0.98, + "grad_norm": 1.4472528386325447, + "learning_rate": 1.0138362938899026e-08, + "loss": 0.2673, + "step": 33799 + }, + { + "epoch": 0.98, + "grad_norm": 1.3810883878203464, + "learning_rate": 1.0108488392324145e-08, + "loss": 0.2711, + "step": 33800 + }, + { + "epoch": 0.98, + "grad_norm": 1.272331949412209, + "learning_rate": 1.0078657881597565e-08, + "loss": 0.3004, + "step": 33801 + }, + { + "epoch": 0.98, + "grad_norm": 1.4466606899070098, + "learning_rate": 1.0048871406982408e-08, + "loss": 0.2475, + "step": 33802 + }, + { + "epoch": 0.98, + "grad_norm": 1.698693577479442, + "learning_rate": 1.0019128968741243e-08, + "loss": 0.2667, + "step": 33803 + }, + { + "epoch": 0.98, + "grad_norm": 1.278324012497819, + "learning_rate": 9.989430567136082e-09, + "loss": 0.2469, + "step": 33804 + }, + { + "epoch": 0.98, + "grad_norm": 6.931779418663854, + "learning_rate": 9.959776202430049e-09, + "loss": 0.2611, + "step": 33805 + }, + { + "epoch": 0.98, + "grad_norm": 1.3860732776656148, + "learning_rate": 9.930165874884601e-09, + "loss": 0.2558, + "step": 33806 + }, + { + "epoch": 0.98, + "grad_norm": 1.3476156866445999, + "learning_rate": 9.900599584760084e-09, + "loss": 0.2811, + "step": 33807 + }, + { + "epoch": 0.98, + "grad_norm": 1.826720287202411, + "learning_rate": 9.871077332318513e-09, + "loss": 0.2845, + "step": 33808 + }, + { + "epoch": 0.98, + "grad_norm": 1.3820941481498892, + "learning_rate": 9.841599117820232e-09, + "loss": 0.2828, + "step": 33809 + }, + { + "epoch": 0.98, + "grad_norm": 1.2656122079041912, + "learning_rate": 9.812164941524482e-09, + "loss": 0.2561, + "step": 33810 + }, + { + "epoch": 0.98, + "grad_norm": 1.4013924790866477, + "learning_rate": 9.782774803692718e-09, + "loss": 0.2899, + "step": 33811 + }, + { + "epoch": 0.98, + "grad_norm": 1.2737298076327779, + "learning_rate": 9.753428704582513e-09, + "loss": 0.3086, + "step": 33812 + }, + { + "epoch": 0.98, + "grad_norm": 1.3370763509923624, + "learning_rate": 9.724126644454212e-09, + "loss": 0.2537, + "step": 33813 + }, + { + "epoch": 0.98, + "grad_norm": 1.2763386793628329, + "learning_rate": 9.69486862356539e-09, + "loss": 0.251, + "step": 33814 + }, + { + "epoch": 0.98, + "grad_norm": 1.3709972625960574, + "learning_rate": 9.665654642175282e-09, + "loss": 0.2682, + "step": 33815 + }, + { + "epoch": 0.98, + "grad_norm": 1.3621857239060877, + "learning_rate": 9.636484700540349e-09, + "loss": 0.2541, + "step": 33816 + }, + { + "epoch": 0.98, + "grad_norm": 1.483382508862879, + "learning_rate": 9.607358798919831e-09, + "loss": 0.2674, + "step": 33817 + }, + { + "epoch": 0.98, + "grad_norm": 1.686175837208881, + "learning_rate": 9.578276937569075e-09, + "loss": 0.269, + "step": 33818 + }, + { + "epoch": 0.98, + "grad_norm": 1.4715565286206527, + "learning_rate": 9.549239116745102e-09, + "loss": 0.2958, + "step": 33819 + }, + { + "epoch": 0.98, + "grad_norm": 1.354844049080201, + "learning_rate": 9.520245336704926e-09, + "loss": 0.2629, + "step": 33820 + }, + { + "epoch": 0.98, + "grad_norm": 1.3646438123581535, + "learning_rate": 9.491295597703898e-09, + "loss": 0.2653, + "step": 33821 + }, + { + "epoch": 0.98, + "grad_norm": 1.2865215207259733, + "learning_rate": 9.46238989999737e-09, + "loss": 0.2633, + "step": 33822 + }, + { + "epoch": 0.98, + "grad_norm": 2.212945198041944, + "learning_rate": 9.433528243840695e-09, + "loss": 0.2829, + "step": 33823 + }, + { + "epoch": 0.98, + "grad_norm": 1.4376599264831318, + "learning_rate": 9.404710629488667e-09, + "loss": 0.2782, + "step": 33824 + }, + { + "epoch": 0.98, + "grad_norm": 1.4225560109559325, + "learning_rate": 9.375937057195528e-09, + "loss": 0.2776, + "step": 33825 + }, + { + "epoch": 0.98, + "grad_norm": 1.376672122159293, + "learning_rate": 9.347207527214408e-09, + "loss": 0.2709, + "step": 33826 + }, + { + "epoch": 0.98, + "grad_norm": 1.2356583748283452, + "learning_rate": 9.318522039800105e-09, + "loss": 0.2544, + "step": 33827 + }, + { + "epoch": 0.98, + "grad_norm": 1.4420000803835988, + "learning_rate": 9.289880595205192e-09, + "loss": 0.2761, + "step": 33828 + }, + { + "epoch": 0.98, + "grad_norm": 1.6297320573613252, + "learning_rate": 9.261283193682246e-09, + "loss": 0.2691, + "step": 33829 + }, + { + "epoch": 0.98, + "grad_norm": 1.3603125664072693, + "learning_rate": 9.232729835483845e-09, + "loss": 0.2692, + "step": 33830 + }, + { + "epoch": 0.98, + "grad_norm": 1.2838397576374394, + "learning_rate": 9.204220520862006e-09, + "loss": 0.2749, + "step": 33831 + }, + { + "epoch": 0.98, + "grad_norm": 1.247944862333383, + "learning_rate": 9.175755250068751e-09, + "loss": 0.2572, + "step": 33832 + }, + { + "epoch": 0.98, + "grad_norm": 1.357817927378498, + "learning_rate": 9.147334023354437e-09, + "loss": 0.2812, + "step": 33833 + }, + { + "epoch": 0.98, + "grad_norm": 2.4230559676746957, + "learning_rate": 9.118956840969973e-09, + "loss": 0.2764, + "step": 33834 + }, + { + "epoch": 0.98, + "grad_norm": 1.321545280998698, + "learning_rate": 9.090623703166824e-09, + "loss": 0.2765, + "step": 33835 + }, + { + "epoch": 0.98, + "grad_norm": 1.2778245519422287, + "learning_rate": 9.062334610193679e-09, + "loss": 0.2863, + "step": 33836 + }, + { + "epoch": 0.98, + "grad_norm": 1.9649319523923583, + "learning_rate": 9.034089562300897e-09, + "loss": 0.2588, + "step": 33837 + }, + { + "epoch": 0.98, + "grad_norm": 1.373777910923454, + "learning_rate": 9.005888559737719e-09, + "loss": 0.2778, + "step": 33838 + }, + { + "epoch": 0.98, + "grad_norm": 1.2580818129160112, + "learning_rate": 8.977731602752837e-09, + "loss": 0.27, + "step": 33839 + }, + { + "epoch": 0.98, + "grad_norm": 1.2755289111459518, + "learning_rate": 8.949618691594942e-09, + "loss": 0.2744, + "step": 33840 + }, + { + "epoch": 0.98, + "grad_norm": 1.3943617476562964, + "learning_rate": 8.921549826512165e-09, + "loss": 0.2769, + "step": 33841 + }, + { + "epoch": 0.98, + "grad_norm": 1.4365701140351326, + "learning_rate": 8.893525007751535e-09, + "loss": 0.2709, + "step": 33842 + }, + { + "epoch": 0.98, + "grad_norm": 1.5175610739308445, + "learning_rate": 8.86554423556174e-09, + "loss": 0.2666, + "step": 33843 + }, + { + "epoch": 0.98, + "grad_norm": 1.4336826521571722, + "learning_rate": 8.83760751018814e-09, + "loss": 0.3166, + "step": 33844 + }, + { + "epoch": 0.98, + "grad_norm": 1.5516305575734772, + "learning_rate": 8.809714831878312e-09, + "loss": 0.2751, + "step": 33845 + }, + { + "epoch": 0.98, + "grad_norm": 1.4692262152438504, + "learning_rate": 8.781866200878175e-09, + "loss": 0.2958, + "step": 33846 + }, + { + "epoch": 0.98, + "grad_norm": 2.495182033687855, + "learning_rate": 8.75406161743364e-09, + "loss": 0.3158, + "step": 33847 + }, + { + "epoch": 0.98, + "grad_norm": 1.3368981858962148, + "learning_rate": 8.726301081789513e-09, + "loss": 0.2581, + "step": 33848 + }, + { + "epoch": 0.98, + "grad_norm": 1.482577219011274, + "learning_rate": 8.698584594191151e-09, + "loss": 0.2673, + "step": 33849 + }, + { + "epoch": 0.98, + "grad_norm": 1.3108291276404225, + "learning_rate": 8.67091215488336e-09, + "loss": 0.2651, + "step": 33850 + }, + { + "epoch": 0.98, + "grad_norm": 1.5062951129349316, + "learning_rate": 8.643283764109834e-09, + "loss": 0.2826, + "step": 33851 + }, + { + "epoch": 0.98, + "grad_norm": 1.5336577499356443, + "learning_rate": 8.615699422115376e-09, + "loss": 0.2685, + "step": 33852 + }, + { + "epoch": 0.98, + "grad_norm": 1.8478345203237407, + "learning_rate": 8.588159129142015e-09, + "loss": 0.2606, + "step": 33853 + }, + { + "epoch": 0.98, + "grad_norm": 0.9362722525365054, + "learning_rate": 8.560662885434001e-09, + "loss": 0.5564, + "step": 33854 + }, + { + "epoch": 0.98, + "grad_norm": 1.529411965349238, + "learning_rate": 8.533210691233362e-09, + "loss": 0.2623, + "step": 33855 + }, + { + "epoch": 0.98, + "grad_norm": 1.2508838131807865, + "learning_rate": 8.505802546782128e-09, + "loss": 0.2791, + "step": 33856 + }, + { + "epoch": 0.98, + "grad_norm": 1.272986117604278, + "learning_rate": 8.478438452322879e-09, + "loss": 0.257, + "step": 33857 + }, + { + "epoch": 0.98, + "grad_norm": 1.5125478404749377, + "learning_rate": 8.451118408097093e-09, + "loss": 0.2667, + "step": 33858 + }, + { + "epoch": 0.98, + "grad_norm": 1.4319279709787793, + "learning_rate": 8.42384241434513e-09, + "loss": 0.2501, + "step": 33859 + }, + { + "epoch": 0.98, + "grad_norm": 0.9110311412266952, + "learning_rate": 8.396610471308463e-09, + "loss": 0.5962, + "step": 33860 + }, + { + "epoch": 0.98, + "grad_norm": 1.4131942104499482, + "learning_rate": 8.369422579226905e-09, + "loss": 0.2606, + "step": 33861 + }, + { + "epoch": 0.98, + "grad_norm": 1.2764125297116504, + "learning_rate": 8.342278738340259e-09, + "loss": 0.2683, + "step": 33862 + }, + { + "epoch": 0.98, + "grad_norm": 1.5214748222933758, + "learning_rate": 8.315178948888892e-09, + "loss": 0.2571, + "step": 33863 + }, + { + "epoch": 0.98, + "grad_norm": 5.177886018982325, + "learning_rate": 8.288123211110944e-09, + "loss": 0.2585, + "step": 33864 + }, + { + "epoch": 0.98, + "grad_norm": 1.342593280503825, + "learning_rate": 8.261111525246223e-09, + "loss": 0.2774, + "step": 33865 + }, + { + "epoch": 0.98, + "grad_norm": 1.4524627346332295, + "learning_rate": 8.23414389153232e-09, + "loss": 0.2545, + "step": 33866 + }, + { + "epoch": 0.98, + "grad_norm": 1.332527980704995, + "learning_rate": 8.207220310207375e-09, + "loss": 0.2552, + "step": 33867 + }, + { + "epoch": 0.98, + "grad_norm": 1.3981902517763733, + "learning_rate": 8.180340781508977e-09, + "loss": 0.2681, + "step": 33868 + }, + { + "epoch": 0.98, + "grad_norm": 1.439714631436008, + "learning_rate": 8.153505305674713e-09, + "loss": 0.2527, + "step": 33869 + }, + { + "epoch": 0.98, + "grad_norm": 1.28733187814766, + "learning_rate": 8.126713882941061e-09, + "loss": 0.2738, + "step": 33870 + }, + { + "epoch": 0.98, + "grad_norm": 1.4765994077679845, + "learning_rate": 8.0999665135445e-09, + "loss": 0.2472, + "step": 33871 + }, + { + "epoch": 0.98, + "grad_norm": 1.209441319546658, + "learning_rate": 8.073263197721503e-09, + "loss": 0.2602, + "step": 33872 + }, + { + "epoch": 0.98, + "grad_norm": 1.3264821110198701, + "learning_rate": 8.046603935706888e-09, + "loss": 0.2838, + "step": 33873 + }, + { + "epoch": 0.98, + "grad_norm": 1.396069197729552, + "learning_rate": 8.019988727736572e-09, + "loss": 0.2763, + "step": 33874 + }, + { + "epoch": 0.98, + "grad_norm": 0.9926871190095984, + "learning_rate": 7.993417574044814e-09, + "loss": 0.56, + "step": 33875 + }, + { + "epoch": 0.98, + "grad_norm": 2.2329177196667422, + "learning_rate": 7.966890474866983e-09, + "loss": 0.2623, + "step": 33876 + }, + { + "epoch": 0.98, + "grad_norm": 2.5830204080361696, + "learning_rate": 7.94040743043678e-09, + "loss": 0.2546, + "step": 33877 + }, + { + "epoch": 0.98, + "grad_norm": 1.5498521436518653, + "learning_rate": 7.913968440987352e-09, + "loss": 0.2571, + "step": 33878 + }, + { + "epoch": 0.98, + "grad_norm": 1.4787672907639993, + "learning_rate": 7.887573506752954e-09, + "loss": 0.2612, + "step": 33879 + }, + { + "epoch": 0.98, + "grad_norm": 1.3309437587666029, + "learning_rate": 7.86122262796618e-09, + "loss": 0.2705, + "step": 33880 + }, + { + "epoch": 0.98, + "grad_norm": 1.3351399854136685, + "learning_rate": 7.834915804859067e-09, + "loss": 0.2587, + "step": 33881 + }, + { + "epoch": 0.98, + "grad_norm": 3.4618054176798694, + "learning_rate": 7.808653037664204e-09, + "loss": 0.2774, + "step": 33882 + }, + { + "epoch": 0.98, + "grad_norm": 1.5713956792960098, + "learning_rate": 7.78243432661363e-09, + "loss": 0.2783, + "step": 33883 + }, + { + "epoch": 0.98, + "grad_norm": 1.2566574132936623, + "learning_rate": 7.756259671938826e-09, + "loss": 0.2711, + "step": 33884 + }, + { + "epoch": 0.98, + "grad_norm": 1.2595707073450801, + "learning_rate": 7.730129073869608e-09, + "loss": 0.2481, + "step": 33885 + }, + { + "epoch": 0.98, + "grad_norm": 1.7773964936087212, + "learning_rate": 7.704042532638012e-09, + "loss": 0.2588, + "step": 33886 + }, + { + "epoch": 0.98, + "grad_norm": 1.6100359137354625, + "learning_rate": 7.6780000484733e-09, + "loss": 0.2736, + "step": 33887 + }, + { + "epoch": 0.98, + "grad_norm": 1.8773657518965439, + "learning_rate": 7.652001621605842e-09, + "loss": 0.275, + "step": 33888 + }, + { + "epoch": 0.98, + "grad_norm": 1.7207951916790851, + "learning_rate": 7.626047252264901e-09, + "loss": 0.3097, + "step": 33889 + }, + { + "epoch": 0.98, + "grad_norm": 1.3972937633427907, + "learning_rate": 7.600136940679182e-09, + "loss": 0.2837, + "step": 33890 + }, + { + "epoch": 0.98, + "grad_norm": 1.296425425489794, + "learning_rate": 7.57427068707739e-09, + "loss": 0.258, + "step": 33891 + }, + { + "epoch": 0.98, + "grad_norm": 1.5588123539822976, + "learning_rate": 7.548448491688787e-09, + "loss": 0.2891, + "step": 33892 + }, + { + "epoch": 0.98, + "grad_norm": 1.3602944541713295, + "learning_rate": 7.522670354739858e-09, + "loss": 0.2475, + "step": 33893 + }, + { + "epoch": 0.98, + "grad_norm": 1.3373731800807294, + "learning_rate": 7.496936276458755e-09, + "loss": 0.2511, + "step": 33894 + }, + { + "epoch": 0.98, + "grad_norm": 2.7582498287735917, + "learning_rate": 7.471246257073072e-09, + "loss": 0.2921, + "step": 33895 + }, + { + "epoch": 0.98, + "grad_norm": 1.3656771803923016, + "learning_rate": 7.4456002968087414e-09, + "loss": 0.3078, + "step": 33896 + }, + { + "epoch": 0.98, + "grad_norm": 1.3265878955741228, + "learning_rate": 7.4199983958928025e-09, + "loss": 0.2882, + "step": 33897 + }, + { + "epoch": 0.98, + "grad_norm": 1.0350779986767724, + "learning_rate": 7.3944405545500754e-09, + "loss": 0.5777, + "step": 33898 + }, + { + "epoch": 0.98, + "grad_norm": 2.1659889042935307, + "learning_rate": 7.3689267730070455e-09, + "loss": 0.2663, + "step": 33899 + }, + { + "epoch": 0.98, + "grad_norm": 1.4282011320212533, + "learning_rate": 7.343457051488534e-09, + "loss": 0.2656, + "step": 33900 + }, + { + "epoch": 0.98, + "grad_norm": 1.4697196338500629, + "learning_rate": 7.318031390219915e-09, + "loss": 0.2879, + "step": 33901 + }, + { + "epoch": 0.98, + "grad_norm": 1.3950905621618903, + "learning_rate": 7.292649789424344e-09, + "loss": 0.266, + "step": 33902 + }, + { + "epoch": 0.98, + "grad_norm": 1.6536469328910204, + "learning_rate": 7.267312249327196e-09, + "loss": 0.2508, + "step": 33903 + }, + { + "epoch": 0.98, + "grad_norm": 1.455335302205731, + "learning_rate": 7.242018770151072e-09, + "loss": 0.2574, + "step": 33904 + }, + { + "epoch": 0.98, + "grad_norm": 1.3434508546842754, + "learning_rate": 7.216769352119124e-09, + "loss": 0.2745, + "step": 33905 + }, + { + "epoch": 0.98, + "grad_norm": 2.842824362419282, + "learning_rate": 7.191563995455064e-09, + "loss": 0.2765, + "step": 33906 + }, + { + "epoch": 0.98, + "grad_norm": 1.481752906747551, + "learning_rate": 7.166402700380937e-09, + "loss": 0.2688, + "step": 33907 + }, + { + "epoch": 0.98, + "grad_norm": 1.698694097121223, + "learning_rate": 7.141285467118231e-09, + "loss": 0.2819, + "step": 33908 + }, + { + "epoch": 0.98, + "grad_norm": 1.461589993273553, + "learning_rate": 7.1162122958895465e-09, + "loss": 0.2831, + "step": 33909 + }, + { + "epoch": 0.98, + "grad_norm": 1.2421320054603506, + "learning_rate": 7.091183186915263e-09, + "loss": 0.2622, + "step": 33910 + }, + { + "epoch": 0.98, + "grad_norm": 1.5925696125348652, + "learning_rate": 7.066198140416869e-09, + "loss": 0.2626, + "step": 33911 + }, + { + "epoch": 0.98, + "grad_norm": 1.3906122901185243, + "learning_rate": 7.041257156614745e-09, + "loss": 0.278, + "step": 33912 + }, + { + "epoch": 0.98, + "grad_norm": 1.3153802964683934, + "learning_rate": 7.0163602357287145e-09, + "loss": 0.2945, + "step": 33913 + }, + { + "epoch": 0.98, + "grad_norm": 1.5140126681838262, + "learning_rate": 6.991507377979157e-09, + "loss": 0.2613, + "step": 33914 + }, + { + "epoch": 0.98, + "grad_norm": 1.3160731678609816, + "learning_rate": 6.9666985835842305e-09, + "loss": 0.2578, + "step": 33915 + }, + { + "epoch": 0.98, + "grad_norm": 1.3751178270638769, + "learning_rate": 6.941933852764316e-09, + "loss": 0.2575, + "step": 33916 + }, + { + "epoch": 0.98, + "grad_norm": 1.5472621657382153, + "learning_rate": 6.917213185737015e-09, + "loss": 0.2832, + "step": 33917 + }, + { + "epoch": 0.98, + "grad_norm": 1.3658439102727546, + "learning_rate": 6.892536582720489e-09, + "loss": 0.2615, + "step": 33918 + }, + { + "epoch": 0.98, + "grad_norm": 1.5674497484918777, + "learning_rate": 6.867904043932339e-09, + "loss": 0.2803, + "step": 33919 + }, + { + "epoch": 0.98, + "grad_norm": 1.345662534578188, + "learning_rate": 6.843315569590725e-09, + "loss": 0.2567, + "step": 33920 + }, + { + "epoch": 0.98, + "grad_norm": 1.3764632959932914, + "learning_rate": 6.818771159912696e-09, + "loss": 0.2823, + "step": 33921 + }, + { + "epoch": 0.98, + "grad_norm": 2.7333355419782244, + "learning_rate": 6.794270815113635e-09, + "loss": 0.2904, + "step": 33922 + }, + { + "epoch": 0.98, + "grad_norm": 1.3654940282522852, + "learning_rate": 6.7698145354111454e-09, + "loss": 0.2728, + "step": 33923 + }, + { + "epoch": 0.98, + "grad_norm": 1.2914896785623828, + "learning_rate": 6.745402321020056e-09, + "loss": 0.2628, + "step": 33924 + }, + { + "epoch": 0.98, + "grad_norm": 1.326800873444549, + "learning_rate": 6.7210341721563044e-09, + "loss": 0.2564, + "step": 33925 + }, + { + "epoch": 0.98, + "grad_norm": 1.4524528179643923, + "learning_rate": 6.696710089034719e-09, + "loss": 0.2461, + "step": 33926 + }, + { + "epoch": 0.98, + "grad_norm": 1.2386032942832454, + "learning_rate": 6.672430071870128e-09, + "loss": 0.2565, + "step": 33927 + }, + { + "epoch": 0.98, + "grad_norm": 1.6692123573248943, + "learning_rate": 6.648194120876805e-09, + "loss": 0.2988, + "step": 33928 + }, + { + "epoch": 0.98, + "grad_norm": 1.70707884243057, + "learning_rate": 6.624002236268468e-09, + "loss": 0.2752, + "step": 33929 + }, + { + "epoch": 0.98, + "grad_norm": 1.831078576202809, + "learning_rate": 6.599854418258833e-09, + "loss": 0.2846, + "step": 33930 + }, + { + "epoch": 0.98, + "grad_norm": 1.5540389900182259, + "learning_rate": 6.5757506670610646e-09, + "loss": 0.2756, + "step": 33931 + }, + { + "epoch": 0.98, + "grad_norm": 1.5624076518576415, + "learning_rate": 6.551690982887215e-09, + "loss": 0.2645, + "step": 33932 + }, + { + "epoch": 0.98, + "grad_norm": 1.462701044743116, + "learning_rate": 6.527675365950447e-09, + "loss": 0.2561, + "step": 33933 + }, + { + "epoch": 0.98, + "grad_norm": 2.1396199938186, + "learning_rate": 6.503703816462259e-09, + "loss": 0.2965, + "step": 33934 + }, + { + "epoch": 0.98, + "grad_norm": 1.5912905608163297, + "learning_rate": 6.479776334634147e-09, + "loss": 0.2762, + "step": 33935 + }, + { + "epoch": 0.98, + "grad_norm": 1.3437315071005453, + "learning_rate": 6.4558929206770534e-09, + "loss": 0.2862, + "step": 33936 + }, + { + "epoch": 0.98, + "grad_norm": 1.4965020945391898, + "learning_rate": 6.432053574802477e-09, + "loss": 0.2889, + "step": 33937 + }, + { + "epoch": 0.98, + "grad_norm": 1.3117076581354181, + "learning_rate": 6.408258297220804e-09, + "loss": 0.2791, + "step": 33938 + }, + { + "epoch": 0.98, + "grad_norm": 1.8857208220864132, + "learning_rate": 6.384507088141312e-09, + "loss": 0.2878, + "step": 33939 + }, + { + "epoch": 0.98, + "grad_norm": 1.3330859962553219, + "learning_rate": 6.3607999477738326e-09, + "loss": 0.2545, + "step": 33940 + }, + { + "epoch": 0.98, + "grad_norm": 1.6301639948670823, + "learning_rate": 6.337136876327643e-09, + "loss": 0.2659, + "step": 33941 + }, + { + "epoch": 0.98, + "grad_norm": 1.2552567524280627, + "learning_rate": 6.3135178740114655e-09, + "loss": 0.2618, + "step": 33942 + }, + { + "epoch": 0.98, + "grad_norm": 1.2042118249074465, + "learning_rate": 6.289942941034577e-09, + "loss": 0.26, + "step": 33943 + }, + { + "epoch": 0.98, + "grad_norm": 0.8756484262069038, + "learning_rate": 6.266412077604034e-09, + "loss": 0.5895, + "step": 33944 + }, + { + "epoch": 0.98, + "grad_norm": 1.3707617326750825, + "learning_rate": 6.242925283927448e-09, + "loss": 0.2681, + "step": 33945 + }, + { + "epoch": 0.98, + "grad_norm": 1.559383856831486, + "learning_rate": 6.219482560212986e-09, + "loss": 0.285, + "step": 33946 + }, + { + "epoch": 0.98, + "grad_norm": 1.3787187369856364, + "learning_rate": 6.196083906666594e-09, + "loss": 0.2653, + "step": 33947 + }, + { + "epoch": 0.98, + "grad_norm": 1.4156242456443042, + "learning_rate": 6.17272932349533e-09, + "loss": 0.2685, + "step": 33948 + }, + { + "epoch": 0.98, + "grad_norm": 1.4054890366122585, + "learning_rate": 6.1494188109051386e-09, + "loss": 0.2743, + "step": 33949 + }, + { + "epoch": 0.98, + "grad_norm": 1.5450806094118121, + "learning_rate": 6.126152369101412e-09, + "loss": 0.2616, + "step": 33950 + }, + { + "epoch": 0.98, + "grad_norm": 1.3038495363239995, + "learning_rate": 6.102929998290097e-09, + "loss": 0.2734, + "step": 33951 + }, + { + "epoch": 0.98, + "grad_norm": 1.244069813854207, + "learning_rate": 6.079751698676029e-09, + "loss": 0.2525, + "step": 33952 + }, + { + "epoch": 0.98, + "grad_norm": 1.3247822044026183, + "learning_rate": 6.0566174704629335e-09, + "loss": 0.2569, + "step": 33953 + }, + { + "epoch": 0.98, + "grad_norm": 1.278762717407022, + "learning_rate": 6.033527313856202e-09, + "loss": 0.2508, + "step": 33954 + }, + { + "epoch": 0.98, + "grad_norm": 0.9524584415726086, + "learning_rate": 6.010481229058451e-09, + "loss": 0.5994, + "step": 33955 + }, + { + "epoch": 0.98, + "grad_norm": 1.504487817920571, + "learning_rate": 5.987479216273961e-09, + "loss": 0.2554, + "step": 33956 + }, + { + "epoch": 0.98, + "grad_norm": 0.9710279605542631, + "learning_rate": 5.9645212757053485e-09, + "loss": 0.5696, + "step": 33957 + }, + { + "epoch": 0.98, + "grad_norm": 1.6510837148703137, + "learning_rate": 5.941607407555228e-09, + "loss": 0.251, + "step": 33958 + }, + { + "epoch": 0.98, + "grad_norm": 1.2533263308218743, + "learning_rate": 5.91873761202566e-09, + "loss": 0.264, + "step": 33959 + }, + { + "epoch": 0.99, + "grad_norm": 1.4272689821545732, + "learning_rate": 5.8959118893187064e-09, + "loss": 0.2713, + "step": 33960 + }, + { + "epoch": 0.99, + "grad_norm": 1.3052886873162746, + "learning_rate": 5.873130239635871e-09, + "loss": 0.2655, + "step": 33961 + }, + { + "epoch": 0.99, + "grad_norm": 1.3313940491707204, + "learning_rate": 5.850392663178106e-09, + "loss": 0.2672, + "step": 33962 + }, + { + "epoch": 0.99, + "grad_norm": 1.5089898928925445, + "learning_rate": 5.8276991601458055e-09, + "loss": 0.2723, + "step": 33963 + }, + { + "epoch": 0.99, + "grad_norm": 1.3799653641853753, + "learning_rate": 5.805049730739365e-09, + "loss": 0.2471, + "step": 33964 + }, + { + "epoch": 0.99, + "grad_norm": 1.515577611563658, + "learning_rate": 5.78244437515918e-09, + "loss": 0.2736, + "step": 33965 + }, + { + "epoch": 0.99, + "grad_norm": 1.1950523354979488, + "learning_rate": 5.75988309360398e-09, + "loss": 0.2619, + "step": 33966 + }, + { + "epoch": 0.99, + "grad_norm": 1.391052046097344, + "learning_rate": 5.737365886273605e-09, + "loss": 0.3455, + "step": 33967 + }, + { + "epoch": 0.99, + "grad_norm": 1.4393494132144815, + "learning_rate": 5.7148927533656754e-09, + "loss": 0.2715, + "step": 33968 + }, + { + "epoch": 0.99, + "grad_norm": 1.3507162782587698, + "learning_rate": 5.69246369508003e-09, + "loss": 0.2859, + "step": 33969 + }, + { + "epoch": 0.99, + "grad_norm": 1.17746318713204, + "learning_rate": 5.670078711613181e-09, + "loss": 0.2605, + "step": 33970 + }, + { + "epoch": 0.99, + "grad_norm": 0.9623754173559942, + "learning_rate": 5.647737803163855e-09, + "loss": 0.5426, + "step": 33971 + }, + { + "epoch": 0.99, + "grad_norm": 1.320494353070162, + "learning_rate": 5.625440969928009e-09, + "loss": 0.2461, + "step": 33972 + }, + { + "epoch": 0.99, + "grad_norm": 1.310770235426545, + "learning_rate": 5.603188212103261e-09, + "loss": 0.2844, + "step": 33973 + }, + { + "epoch": 0.99, + "grad_norm": 1.377525407714827, + "learning_rate": 5.580979529886121e-09, + "loss": 0.2705, + "step": 33974 + }, + { + "epoch": 0.99, + "grad_norm": 1.3200479917122734, + "learning_rate": 5.558814923471989e-09, + "loss": 0.2651, + "step": 33975 + }, + { + "epoch": 0.99, + "grad_norm": 1.2372559175647693, + "learning_rate": 5.536694393056818e-09, + "loss": 0.2618, + "step": 33976 + }, + { + "epoch": 0.99, + "grad_norm": 0.9837349782652417, + "learning_rate": 5.514617938835454e-09, + "loss": 0.5535, + "step": 33977 + }, + { + "epoch": 0.99, + "grad_norm": 1.4267727183088879, + "learning_rate": 5.492585561003294e-09, + "loss": 0.2875, + "step": 33978 + }, + { + "epoch": 0.99, + "grad_norm": 1.3445431472912774, + "learning_rate": 5.470597259754074e-09, + "loss": 0.2854, + "step": 33979 + }, + { + "epoch": 0.99, + "grad_norm": 1.406255071938925, + "learning_rate": 5.448653035282636e-09, + "loss": 0.286, + "step": 33980 + }, + { + "epoch": 0.99, + "grad_norm": 1.4697737737402103, + "learning_rate": 5.42675288778216e-09, + "loss": 0.2585, + "step": 33981 + }, + { + "epoch": 0.99, + "grad_norm": 1.4436975163228227, + "learning_rate": 5.404896817445826e-09, + "loss": 0.2769, + "step": 33982 + }, + { + "epoch": 0.99, + "grad_norm": 1.2683993571196672, + "learning_rate": 5.383084824466811e-09, + "loss": 0.2601, + "step": 33983 + }, + { + "epoch": 0.99, + "grad_norm": 1.2294673004861583, + "learning_rate": 5.361316909037739e-09, + "loss": 0.2618, + "step": 33984 + }, + { + "epoch": 0.99, + "grad_norm": 1.4852089962916502, + "learning_rate": 5.339593071350124e-09, + "loss": 0.2732, + "step": 33985 + }, + { + "epoch": 0.99, + "grad_norm": 1.399637144940843, + "learning_rate": 5.317913311596035e-09, + "loss": 0.2605, + "step": 33986 + }, + { + "epoch": 0.99, + "grad_norm": 1.3781629716128843, + "learning_rate": 5.296277629966984e-09, + "loss": 0.2777, + "step": 33987 + }, + { + "epoch": 0.99, + "grad_norm": 1.268957169267332, + "learning_rate": 5.274686026653375e-09, + "loss": 0.2626, + "step": 33988 + }, + { + "epoch": 0.99, + "grad_norm": 1.4722550292668852, + "learning_rate": 5.2531385018461665e-09, + "loss": 0.2805, + "step": 33989 + }, + { + "epoch": 0.99, + "grad_norm": 1.3526018766620482, + "learning_rate": 5.231635055735762e-09, + "loss": 0.2667, + "step": 33990 + }, + { + "epoch": 0.99, + "grad_norm": 1.3963495366366783, + "learning_rate": 5.210175688511454e-09, + "loss": 0.2926, + "step": 33991 + }, + { + "epoch": 0.99, + "grad_norm": 1.4042763807366434, + "learning_rate": 5.188760400362536e-09, + "loss": 0.2655, + "step": 33992 + }, + { + "epoch": 0.99, + "grad_norm": 1.3207929267200795, + "learning_rate": 5.1673891914783e-09, + "loss": 0.2932, + "step": 33993 + }, + { + "epoch": 0.99, + "grad_norm": 1.4070348419795127, + "learning_rate": 5.14606206204693e-09, + "loss": 0.2664, + "step": 33994 + }, + { + "epoch": 0.99, + "grad_norm": 0.9599604251553568, + "learning_rate": 5.124779012257164e-09, + "loss": 0.5566, + "step": 33995 + }, + { + "epoch": 0.99, + "grad_norm": 0.9743477821828432, + "learning_rate": 5.103540042297184e-09, + "loss": 0.6185, + "step": 33996 + }, + { + "epoch": 0.99, + "grad_norm": 1.5713302805698253, + "learning_rate": 5.082345152352952e-09, + "loss": 0.2658, + "step": 33997 + }, + { + "epoch": 0.99, + "grad_norm": 3.948077366181899, + "learning_rate": 5.061194342613207e-09, + "loss": 0.2611, + "step": 33998 + }, + { + "epoch": 0.99, + "grad_norm": 1.300425406119208, + "learning_rate": 5.040087613263356e-09, + "loss": 0.279, + "step": 33999 + }, + { + "epoch": 0.99, + "grad_norm": 1.3702178055760743, + "learning_rate": 5.019024964490471e-09, + "loss": 0.2443, + "step": 34000 + }, + { + "epoch": 0.99, + "grad_norm": 1.655354743503771, + "learning_rate": 4.99800639647996e-09, + "loss": 0.264, + "step": 34001 + }, + { + "epoch": 0.99, + "grad_norm": 1.7969787034436375, + "learning_rate": 4.9770319094177845e-09, + "loss": 0.2652, + "step": 34002 + }, + { + "epoch": 0.99, + "grad_norm": 1.462698342117087, + "learning_rate": 4.956101503488242e-09, + "loss": 0.2583, + "step": 34003 + }, + { + "epoch": 0.99, + "grad_norm": 1.756155962155966, + "learning_rate": 4.93521517887674e-09, + "loss": 0.2763, + "step": 34004 + }, + { + "epoch": 0.99, + "grad_norm": 0.8987309665316319, + "learning_rate": 4.91437293576702e-09, + "loss": 0.5449, + "step": 34005 + }, + { + "epoch": 0.99, + "grad_norm": 1.4444782649530559, + "learning_rate": 4.8935747743439345e-09, + "loss": 0.2728, + "step": 34006 + }, + { + "epoch": 0.99, + "grad_norm": 0.9279547335738123, + "learning_rate": 4.8728206947901145e-09, + "loss": 0.5324, + "step": 34007 + }, + { + "epoch": 0.99, + "grad_norm": 1.282881970038278, + "learning_rate": 4.8521106972893026e-09, + "loss": 0.2585, + "step": 34008 + }, + { + "epoch": 0.99, + "grad_norm": 1.5245463573323854, + "learning_rate": 4.831444782023575e-09, + "loss": 0.2614, + "step": 34009 + }, + { + "epoch": 0.99, + "grad_norm": 1.6322311960978437, + "learning_rate": 4.81082294917612e-09, + "loss": 0.2759, + "step": 34010 + }, + { + "epoch": 0.99, + "grad_norm": 1.4752661266755345, + "learning_rate": 4.790245198928456e-09, + "loss": 0.2598, + "step": 34011 + }, + { + "epoch": 0.99, + "grad_norm": 1.277669647296553, + "learning_rate": 4.769711531462662e-09, + "loss": 0.2623, + "step": 34012 + }, + { + "epoch": 0.99, + "grad_norm": 1.3491242782027297, + "learning_rate": 4.749221946959148e-09, + "loss": 0.2736, + "step": 34013 + }, + { + "epoch": 0.99, + "grad_norm": 1.3202271050289713, + "learning_rate": 4.728776445599437e-09, + "loss": 0.2553, + "step": 34014 + }, + { + "epoch": 0.99, + "grad_norm": 1.609716523299422, + "learning_rate": 4.708375027563383e-09, + "loss": 0.27, + "step": 34015 + }, + { + "epoch": 0.99, + "grad_norm": 1.5014008424558618, + "learning_rate": 4.688017693031399e-09, + "loss": 0.2814, + "step": 34016 + }, + { + "epoch": 0.99, + "grad_norm": 1.397222227192447, + "learning_rate": 4.667704442183341e-09, + "loss": 0.2437, + "step": 34017 + }, + { + "epoch": 0.99, + "grad_norm": 1.2329608288997873, + "learning_rate": 4.647435275197954e-09, + "loss": 0.2456, + "step": 34018 + }, + { + "epoch": 0.99, + "grad_norm": 1.3430231159598187, + "learning_rate": 4.627210192254539e-09, + "loss": 0.2532, + "step": 34019 + }, + { + "epoch": 0.99, + "grad_norm": 1.346702980660386, + "learning_rate": 4.607029193531287e-09, + "loss": 0.2695, + "step": 34020 + }, + { + "epoch": 0.99, + "grad_norm": 1.3226006083526545, + "learning_rate": 4.5868922792063896e-09, + "loss": 0.2702, + "step": 34021 + }, + { + "epoch": 0.99, + "grad_norm": 1.4307225501718628, + "learning_rate": 4.566799449457482e-09, + "loss": 0.2687, + "step": 34022 + }, + { + "epoch": 0.99, + "grad_norm": 1.4456815482574692, + "learning_rate": 4.546750704462199e-09, + "loss": 0.2844, + "step": 34023 + }, + { + "epoch": 0.99, + "grad_norm": 1.254248689362467, + "learning_rate": 4.526746044397068e-09, + "loss": 0.2948, + "step": 34024 + }, + { + "epoch": 0.99, + "grad_norm": 1.3691794023967923, + "learning_rate": 4.506785469439168e-09, + "loss": 0.2624, + "step": 34025 + }, + { + "epoch": 0.99, + "grad_norm": 2.8723389204885197, + "learning_rate": 4.486868979763914e-09, + "loss": 0.2743, + "step": 34026 + }, + { + "epoch": 0.99, + "grad_norm": 1.794521339964008, + "learning_rate": 4.466996575547833e-09, + "loss": 0.2707, + "step": 34027 + }, + { + "epoch": 0.99, + "grad_norm": 1.6428792220845703, + "learning_rate": 4.447168256965229e-09, + "loss": 0.2935, + "step": 34028 + }, + { + "epoch": 0.99, + "grad_norm": 1.3439709469758745, + "learning_rate": 4.427384024192627e-09, + "loss": 0.2813, + "step": 34029 + }, + { + "epoch": 0.99, + "grad_norm": 1.5487502011990295, + "learning_rate": 4.407643877403223e-09, + "loss": 0.29, + "step": 34030 + }, + { + "epoch": 0.99, + "grad_norm": 1.2922557095950544, + "learning_rate": 4.387947816771876e-09, + "loss": 0.2721, + "step": 34031 + }, + { + "epoch": 0.99, + "grad_norm": 1.3453088071857944, + "learning_rate": 4.368295842472336e-09, + "loss": 0.2532, + "step": 34032 + }, + { + "epoch": 0.99, + "grad_norm": 0.9854886468550708, + "learning_rate": 4.3486879546783544e-09, + "loss": 0.5644, + "step": 34033 + }, + { + "epoch": 0.99, + "grad_norm": 1.3934391507688855, + "learning_rate": 4.329124153562569e-09, + "loss": 0.2611, + "step": 34034 + }, + { + "epoch": 0.99, + "grad_norm": 1.3682088949443083, + "learning_rate": 4.309604439297066e-09, + "loss": 0.2602, + "step": 34035 + }, + { + "epoch": 0.99, + "grad_norm": 1.3663695717282498, + "learning_rate": 4.290128812055039e-09, + "loss": 0.2684, + "step": 34036 + }, + { + "epoch": 0.99, + "grad_norm": 1.4289316655859037, + "learning_rate": 4.270697272008573e-09, + "loss": 0.2787, + "step": 34037 + }, + { + "epoch": 0.99, + "grad_norm": 1.423968696925162, + "learning_rate": 4.2513098193280865e-09, + "loss": 0.2632, + "step": 34038 + }, + { + "epoch": 0.99, + "grad_norm": 1.4332483304758037, + "learning_rate": 4.23196645418511e-09, + "loss": 0.3041, + "step": 34039 + }, + { + "epoch": 0.99, + "grad_norm": 1.5632285308334732, + "learning_rate": 4.212667176750617e-09, + "loss": 0.2656, + "step": 34040 + }, + { + "epoch": 0.99, + "grad_norm": 1.2536721878301513, + "learning_rate": 4.193411987194474e-09, + "loss": 0.2455, + "step": 34041 + }, + { + "epoch": 0.99, + "grad_norm": 1.4721881439804507, + "learning_rate": 4.174200885687651e-09, + "loss": 0.3053, + "step": 34042 + }, + { + "epoch": 0.99, + "grad_norm": 1.2531431213648396, + "learning_rate": 4.155033872398351e-09, + "loss": 0.2524, + "step": 34043 + }, + { + "epoch": 0.99, + "grad_norm": 1.157701593486305, + "learning_rate": 4.135910947496436e-09, + "loss": 0.2614, + "step": 34044 + }, + { + "epoch": 0.99, + "grad_norm": 1.298035511488913, + "learning_rate": 4.116832111150659e-09, + "loss": 0.2834, + "step": 34045 + }, + { + "epoch": 0.99, + "grad_norm": 1.4099000036617118, + "learning_rate": 4.097797363529221e-09, + "loss": 0.2572, + "step": 34046 + }, + { + "epoch": 0.99, + "grad_norm": 2.6227712386949915, + "learning_rate": 4.078806704799765e-09, + "loss": 0.264, + "step": 34047 + }, + { + "epoch": 0.99, + "grad_norm": 1.6229081448697749, + "learning_rate": 4.0598601351304885e-09, + "loss": 0.2702, + "step": 34048 + }, + { + "epoch": 0.99, + "grad_norm": 1.3369250866996627, + "learning_rate": 4.040957654688482e-09, + "loss": 0.2709, + "step": 34049 + }, + { + "epoch": 0.99, + "grad_norm": 1.523796687513207, + "learning_rate": 4.0220992636402775e-09, + "loss": 0.2717, + "step": 34050 + }, + { + "epoch": 0.99, + "grad_norm": 1.8311320515361909, + "learning_rate": 4.003284962152409e-09, + "loss": 0.2844, + "step": 34051 + }, + { + "epoch": 0.99, + "grad_norm": 1.4515631279426104, + "learning_rate": 3.984514750391411e-09, + "loss": 0.2648, + "step": 34052 + }, + { + "epoch": 0.99, + "grad_norm": 1.2410407914630668, + "learning_rate": 3.965788628522149e-09, + "loss": 0.2738, + "step": 34053 + }, + { + "epoch": 0.99, + "grad_norm": 1.2645602968517042, + "learning_rate": 3.9471065967100485e-09, + "loss": 0.2489, + "step": 34054 + }, + { + "epoch": 0.99, + "grad_norm": 1.6811415087704058, + "learning_rate": 3.928468655120532e-09, + "loss": 0.27, + "step": 34055 + }, + { + "epoch": 0.99, + "grad_norm": 1.4015163266802608, + "learning_rate": 3.9098748039173575e-09, + "loss": 0.2563, + "step": 34056 + }, + { + "epoch": 0.99, + "grad_norm": 1.2707583309168422, + "learning_rate": 3.8913250432653925e-09, + "loss": 0.2681, + "step": 34057 + }, + { + "epoch": 0.99, + "grad_norm": 1.4731218195369635, + "learning_rate": 3.87281937332784e-09, + "loss": 0.2691, + "step": 34058 + }, + { + "epoch": 0.99, + "grad_norm": 1.5019640380173545, + "learning_rate": 3.854357794267905e-09, + "loss": 0.2782, + "step": 34059 + }, + { + "epoch": 0.99, + "grad_norm": 1.4638338556874093, + "learning_rate": 3.8359403062482316e-09, + "loss": 0.2447, + "step": 34060 + }, + { + "epoch": 0.99, + "grad_norm": 1.2081109502828316, + "learning_rate": 3.8175669094325795e-09, + "loss": 0.2754, + "step": 34061 + }, + { + "epoch": 0.99, + "grad_norm": 1.2692436415366006, + "learning_rate": 3.799237603981931e-09, + "loss": 0.2655, + "step": 34062 + }, + { + "epoch": 0.99, + "grad_norm": 1.9116203241789114, + "learning_rate": 3.780952390058379e-09, + "loss": 0.2506, + "step": 34063 + }, + { + "epoch": 0.99, + "grad_norm": 1.4989774880306326, + "learning_rate": 3.76271126782346e-09, + "loss": 0.2604, + "step": 34064 + }, + { + "epoch": 0.99, + "grad_norm": 1.259722873209704, + "learning_rate": 3.744514237438157e-09, + "loss": 0.2735, + "step": 34065 + }, + { + "epoch": 0.99, + "grad_norm": 1.2561431893560322, + "learning_rate": 3.726361299062897e-09, + "loss": 0.2584, + "step": 34066 + }, + { + "epoch": 0.99, + "grad_norm": 1.424949721466481, + "learning_rate": 3.708252452858108e-09, + "loss": 0.2658, + "step": 34067 + }, + { + "epoch": 0.99, + "grad_norm": 1.4149799988289977, + "learning_rate": 3.6901876989831052e-09, + "loss": 0.2923, + "step": 34068 + }, + { + "epoch": 0.99, + "grad_norm": 1.2130139353478424, + "learning_rate": 3.6721670375977627e-09, + "loss": 0.257, + "step": 34069 + }, + { + "epoch": 0.99, + "grad_norm": 1.3277597030120942, + "learning_rate": 3.6541904688608408e-09, + "loss": 0.29, + "step": 34070 + }, + { + "epoch": 0.99, + "grad_norm": 1.3051898178326085, + "learning_rate": 3.6362579929311025e-09, + "loss": 0.2685, + "step": 34071 + }, + { + "epoch": 0.99, + "grad_norm": 1.4085826075548578, + "learning_rate": 3.6183696099673093e-09, + "loss": 0.286, + "step": 34072 + }, + { + "epoch": 0.99, + "grad_norm": 1.281294359217695, + "learning_rate": 3.6005253201260028e-09, + "loss": 0.2675, + "step": 34073 + }, + { + "epoch": 0.99, + "grad_norm": 1.2551902572511915, + "learning_rate": 3.5827251235659445e-09, + "loss": 0.2431, + "step": 34074 + }, + { + "epoch": 0.99, + "grad_norm": 1.2493100067116165, + "learning_rate": 3.5649690204442313e-09, + "loss": 0.2497, + "step": 34075 + }, + { + "epoch": 0.99, + "grad_norm": 2.0838819343901562, + "learning_rate": 3.547257010916294e-09, + "loss": 0.2653, + "step": 34076 + }, + { + "epoch": 0.99, + "grad_norm": 1.5386380041532512, + "learning_rate": 3.529589095139785e-09, + "loss": 0.2871, + "step": 34077 + }, + { + "epoch": 0.99, + "grad_norm": 1.4523241625284264, + "learning_rate": 3.51196527326958e-09, + "loss": 0.2675, + "step": 34078 + }, + { + "epoch": 0.99, + "grad_norm": 1.4593447463109215, + "learning_rate": 3.49438554546222e-09, + "loss": 0.2762, + "step": 34079 + }, + { + "epoch": 0.99, + "grad_norm": 1.3129965863179451, + "learning_rate": 3.4768499118720266e-09, + "loss": 0.2576, + "step": 34080 + }, + { + "epoch": 0.99, + "grad_norm": 1.2058670306189434, + "learning_rate": 3.459358372653876e-09, + "loss": 0.2547, + "step": 34081 + }, + { + "epoch": 0.99, + "grad_norm": 1.3269201767038232, + "learning_rate": 3.4419109279626438e-09, + "loss": 0.2754, + "step": 34082 + }, + { + "epoch": 0.99, + "grad_norm": 1.5368250703592286, + "learning_rate": 3.4245075779520966e-09, + "loss": 0.2651, + "step": 34083 + }, + { + "epoch": 0.99, + "grad_norm": 1.5543528482490294, + "learning_rate": 3.407148322775444e-09, + "loss": 0.2751, + "step": 34084 + }, + { + "epoch": 0.99, + "grad_norm": 1.4093141189142506, + "learning_rate": 3.389833162585898e-09, + "loss": 0.283, + "step": 34085 + }, + { + "epoch": 0.99, + "grad_norm": 1.5198208522501528, + "learning_rate": 3.372562097536669e-09, + "loss": 0.2618, + "step": 34086 + }, + { + "epoch": 0.99, + "grad_norm": 1.3189951606364692, + "learning_rate": 3.3553351277804123e-09, + "loss": 0.2733, + "step": 34087 + }, + { + "epoch": 0.99, + "grad_norm": 1.230984871551471, + "learning_rate": 3.338152253468674e-09, + "loss": 0.3, + "step": 34088 + }, + { + "epoch": 0.99, + "grad_norm": 1.4247571520312359, + "learning_rate": 3.3210134747529987e-09, + "loss": 0.2888, + "step": 34089 + }, + { + "epoch": 0.99, + "grad_norm": 1.2741790951065346, + "learning_rate": 3.303918791784377e-09, + "loss": 0.2615, + "step": 34090 + }, + { + "epoch": 0.99, + "grad_norm": 1.5103702369842906, + "learning_rate": 3.28686820471491e-09, + "loss": 0.3018, + "step": 34091 + }, + { + "epoch": 0.99, + "grad_norm": 1.4239213255833527, + "learning_rate": 3.2698617136939228e-09, + "loss": 0.2543, + "step": 34092 + }, + { + "epoch": 0.99, + "grad_norm": 1.346766506143649, + "learning_rate": 3.2528993188712943e-09, + "loss": 0.2741, + "step": 34093 + }, + { + "epoch": 0.99, + "grad_norm": 1.7491254637652156, + "learning_rate": 3.235981020398016e-09, + "loss": 0.2666, + "step": 34094 + }, + { + "epoch": 0.99, + "grad_norm": 1.5481546391239194, + "learning_rate": 3.219106818422302e-09, + "loss": 0.2553, + "step": 34095 + }, + { + "epoch": 0.99, + "grad_norm": 1.3114032681481393, + "learning_rate": 3.2022767130934775e-09, + "loss": 0.2509, + "step": 34096 + }, + { + "epoch": 0.99, + "grad_norm": 1.251478610497849, + "learning_rate": 3.185490704559757e-09, + "loss": 0.2615, + "step": 34097 + }, + { + "epoch": 0.99, + "grad_norm": 0.9905119028330437, + "learning_rate": 3.1687487929699114e-09, + "loss": 0.6059, + "step": 34098 + }, + { + "epoch": 0.99, + "grad_norm": 1.30007285625842, + "learning_rate": 3.1520509784710442e-09, + "loss": 0.2751, + "step": 34099 + }, + { + "epoch": 0.99, + "grad_norm": 1.38731608468362, + "learning_rate": 3.1353972612108153e-09, + "loss": 0.2736, + "step": 34100 + }, + { + "epoch": 0.99, + "grad_norm": 1.2794823189968518, + "learning_rate": 3.1187876413363293e-09, + "loss": 0.2493, + "step": 34101 + }, + { + "epoch": 0.99, + "grad_norm": 1.4441089259875426, + "learning_rate": 3.102222118993581e-09, + "loss": 0.2627, + "step": 34102 + }, + { + "epoch": 0.99, + "grad_norm": 1.5505375033983815, + "learning_rate": 3.085700694329674e-09, + "loss": 0.2889, + "step": 34103 + }, + { + "epoch": 0.99, + "grad_norm": 1.2265463687191305, + "learning_rate": 3.0692233674894934e-09, + "loss": 0.2651, + "step": 34104 + }, + { + "epoch": 0.99, + "grad_norm": 1.2052555847324835, + "learning_rate": 3.0527901386190327e-09, + "loss": 0.2525, + "step": 34105 + }, + { + "epoch": 0.99, + "grad_norm": 1.4317145306795078, + "learning_rate": 3.0364010078631767e-09, + "loss": 0.2813, + "step": 34106 + }, + { + "epoch": 0.99, + "grad_norm": 1.431296705621486, + "learning_rate": 3.020055975366254e-09, + "loss": 0.2646, + "step": 34107 + }, + { + "epoch": 0.99, + "grad_norm": 1.2563160171198005, + "learning_rate": 3.0037550412725936e-09, + "loss": 0.2468, + "step": 34108 + }, + { + "epoch": 0.99, + "grad_norm": 1.430249254052305, + "learning_rate": 2.98749820572708e-09, + "loss": 0.2771, + "step": 34109 + }, + { + "epoch": 0.99, + "grad_norm": 1.3356323132586367, + "learning_rate": 2.9712854688712656e-09, + "loss": 0.268, + "step": 34110 + }, + { + "epoch": 0.99, + "grad_norm": 1.2717026363577, + "learning_rate": 2.9551168308500354e-09, + "loss": 0.2533, + "step": 34111 + }, + { + "epoch": 0.99, + "grad_norm": 1.2791135987233788, + "learning_rate": 2.938992291804943e-09, + "loss": 0.2674, + "step": 34112 + }, + { + "epoch": 0.99, + "grad_norm": 1.3475569630662187, + "learning_rate": 2.9229118518792067e-09, + "loss": 0.2634, + "step": 34113 + }, + { + "epoch": 0.99, + "grad_norm": 1.3946253771825565, + "learning_rate": 2.9068755112138245e-09, + "loss": 0.2777, + "step": 34114 + }, + { + "epoch": 0.99, + "grad_norm": 1.5170605315655412, + "learning_rate": 2.8908832699503507e-09, + "loss": 0.276, + "step": 34115 + }, + { + "epoch": 0.99, + "grad_norm": 1.3426219229162442, + "learning_rate": 2.8749351282308935e-09, + "loss": 0.261, + "step": 34116 + }, + { + "epoch": 0.99, + "grad_norm": 1.3616880104317417, + "learning_rate": 2.859031086195341e-09, + "loss": 0.2638, + "step": 34117 + }, + { + "epoch": 0.99, + "grad_norm": 1.292366959534579, + "learning_rate": 2.843171143984136e-09, + "loss": 0.2601, + "step": 34118 + }, + { + "epoch": 0.99, + "grad_norm": 1.3275961888796166, + "learning_rate": 2.827355301737722e-09, + "loss": 0.2743, + "step": 34119 + }, + { + "epoch": 0.99, + "grad_norm": 1.3702072210115068, + "learning_rate": 2.811583559594877e-09, + "loss": 0.2685, + "step": 34120 + }, + { + "epoch": 0.99, + "grad_norm": 1.5682712159751995, + "learning_rate": 2.7958559176954892e-09, + "loss": 0.2693, + "step": 34121 + }, + { + "epoch": 0.99, + "grad_norm": 1.3087359532279885, + "learning_rate": 2.780172376177781e-09, + "loss": 0.2701, + "step": 34122 + }, + { + "epoch": 0.99, + "grad_norm": 1.5098205983619557, + "learning_rate": 2.7645329351810857e-09, + "loss": 0.2683, + "step": 34123 + }, + { + "epoch": 0.99, + "grad_norm": 1.3841498699524182, + "learning_rate": 2.7489375948425155e-09, + "loss": 0.2853, + "step": 34124 + }, + { + "epoch": 0.99, + "grad_norm": 1.3222774010047857, + "learning_rate": 2.7333863553002937e-09, + "loss": 0.2543, + "step": 34125 + }, + { + "epoch": 0.99, + "grad_norm": 1.3894290934225608, + "learning_rate": 2.7178792166909772e-09, + "loss": 0.2644, + "step": 34126 + }, + { + "epoch": 0.99, + "grad_norm": 1.3166186359445695, + "learning_rate": 2.702416179151679e-09, + "loss": 0.2948, + "step": 34127 + }, + { + "epoch": 0.99, + "grad_norm": 1.3367578266241105, + "learning_rate": 2.6869972428195113e-09, + "loss": 0.243, + "step": 34128 + }, + { + "epoch": 0.99, + "grad_norm": 1.521412099530919, + "learning_rate": 2.671622407829921e-09, + "loss": 0.2564, + "step": 34129 + }, + { + "epoch": 0.99, + "grad_norm": 2.086403355761503, + "learning_rate": 2.6562916743183563e-09, + "loss": 0.2834, + "step": 34130 + }, + { + "epoch": 0.99, + "grad_norm": 1.315740370651133, + "learning_rate": 2.641005042420819e-09, + "loss": 0.2711, + "step": 34131 + }, + { + "epoch": 0.99, + "grad_norm": 1.3894657939994484, + "learning_rate": 2.6257625122716456e-09, + "loss": 0.2813, + "step": 34132 + }, + { + "epoch": 0.99, + "grad_norm": 1.3557473509907598, + "learning_rate": 2.610564084005729e-09, + "loss": 0.2829, + "step": 34133 + }, + { + "epoch": 0.99, + "grad_norm": 1.3273341491670188, + "learning_rate": 2.5954097577568504e-09, + "loss": 0.263, + "step": 34134 + }, + { + "epoch": 0.99, + "grad_norm": 1.4483881953194242, + "learning_rate": 2.580299533658792e-09, + "loss": 0.2767, + "step": 34135 + }, + { + "epoch": 0.99, + "grad_norm": 1.7360812172325517, + "learning_rate": 2.565233411845336e-09, + "loss": 0.2812, + "step": 34136 + }, + { + "epoch": 0.99, + "grad_norm": 1.4084228099787433, + "learning_rate": 2.5502113924485984e-09, + "loss": 0.2959, + "step": 34137 + }, + { + "epoch": 0.99, + "grad_norm": 3.1731438966269554, + "learning_rate": 2.5352334756023612e-09, + "loss": 0.272, + "step": 34138 + }, + { + "epoch": 0.99, + "grad_norm": 1.4160103572085865, + "learning_rate": 2.5202996614376306e-09, + "loss": 0.3338, + "step": 34139 + }, + { + "epoch": 0.99, + "grad_norm": 4.005939538955372, + "learning_rate": 2.505409950087079e-09, + "loss": 0.2552, + "step": 34140 + }, + { + "epoch": 0.99, + "grad_norm": 2.82428028132818, + "learning_rate": 2.4905643416817116e-09, + "loss": 0.2662, + "step": 34141 + }, + { + "epoch": 0.99, + "grad_norm": 1.5729096150824582, + "learning_rate": 2.4757628363519804e-09, + "loss": 0.2598, + "step": 34142 + }, + { + "epoch": 0.99, + "grad_norm": 1.7935047875097514, + "learning_rate": 2.4610054342294465e-09, + "loss": 0.2967, + "step": 34143 + }, + { + "epoch": 0.99, + "grad_norm": 1.71195393695299, + "learning_rate": 2.446292135444561e-09, + "loss": 0.2582, + "step": 34144 + }, + { + "epoch": 0.99, + "grad_norm": 1.3335668069252598, + "learning_rate": 2.4316229401261105e-09, + "loss": 0.2518, + "step": 34145 + }, + { + "epoch": 0.99, + "grad_norm": 1.376319255091428, + "learning_rate": 2.416997848403435e-09, + "loss": 0.2562, + "step": 34146 + }, + { + "epoch": 0.99, + "grad_norm": 1.2884126220252343, + "learning_rate": 2.4024168604069862e-09, + "loss": 0.2691, + "step": 34147 + }, + { + "epoch": 0.99, + "grad_norm": 1.4319313175723616, + "learning_rate": 2.38787997626444e-09, + "loss": 0.2802, + "step": 34148 + }, + { + "epoch": 0.99, + "grad_norm": 1.350987039878776, + "learning_rate": 2.3733871961040266e-09, + "loss": 0.2885, + "step": 34149 + }, + { + "epoch": 0.99, + "grad_norm": 1.997115844418948, + "learning_rate": 2.3589385200545324e-09, + "loss": 0.2676, + "step": 34150 + }, + { + "epoch": 0.99, + "grad_norm": 1.3872563473607282, + "learning_rate": 2.344533948241967e-09, + "loss": 0.2526, + "step": 34151 + }, + { + "epoch": 0.99, + "grad_norm": 1.349674697321352, + "learning_rate": 2.3301734807951173e-09, + "loss": 0.2584, + "step": 34152 + }, + { + "epoch": 0.99, + "grad_norm": 1.3311005470296908, + "learning_rate": 2.3158571178394372e-09, + "loss": 0.292, + "step": 34153 + }, + { + "epoch": 0.99, + "grad_norm": 1.3039842477911883, + "learning_rate": 2.3015848595020484e-09, + "loss": 0.2594, + "step": 34154 + }, + { + "epoch": 0.99, + "grad_norm": 1.4813420250817064, + "learning_rate": 2.2873567059084056e-09, + "loss": 0.3073, + "step": 34155 + }, + { + "epoch": 0.99, + "grad_norm": 1.470898395608239, + "learning_rate": 2.2731726571845193e-09, + "loss": 0.2642, + "step": 34156 + }, + { + "epoch": 0.99, + "grad_norm": 1.3484092797352618, + "learning_rate": 2.259032713455289e-09, + "loss": 0.2736, + "step": 34157 + }, + { + "epoch": 0.99, + "grad_norm": 1.3215222146332282, + "learning_rate": 2.2449368748456156e-09, + "loss": 0.2476, + "step": 34158 + }, + { + "epoch": 0.99, + "grad_norm": 1.3123105961746437, + "learning_rate": 2.2308851414792887e-09, + "loss": 0.2589, + "step": 34159 + }, + { + "epoch": 0.99, + "grad_norm": 1.2466546843843158, + "learning_rate": 2.216877513481208e-09, + "loss": 0.2627, + "step": 34160 + }, + { + "epoch": 0.99, + "grad_norm": 1.3515840678697209, + "learning_rate": 2.202913990974609e-09, + "loss": 0.2709, + "step": 34161 + }, + { + "epoch": 0.99, + "grad_norm": 2.0352518948095866, + "learning_rate": 2.1889945740827255e-09, + "loss": 0.2523, + "step": 34162 + }, + { + "epoch": 0.99, + "grad_norm": 0.9830485495639887, + "learning_rate": 2.1751192629287933e-09, + "loss": 0.5711, + "step": 34163 + }, + { + "epoch": 0.99, + "grad_norm": 1.4389659154856458, + "learning_rate": 2.161288057634381e-09, + "loss": 0.2686, + "step": 34164 + }, + { + "epoch": 0.99, + "grad_norm": 0.8972312719184369, + "learning_rate": 2.1475009583221684e-09, + "loss": 0.536, + "step": 34165 + }, + { + "epoch": 0.99, + "grad_norm": 1.4030864994483734, + "learning_rate": 2.133757965113725e-09, + "loss": 0.2744, + "step": 34166 + }, + { + "epoch": 0.99, + "grad_norm": 1.468745320736207, + "learning_rate": 2.1200590781300654e-09, + "loss": 0.289, + "step": 34167 + }, + { + "epoch": 0.99, + "grad_norm": 1.3947702885770468, + "learning_rate": 2.106404297492759e-09, + "loss": 0.2563, + "step": 34168 + }, + { + "epoch": 0.99, + "grad_norm": 1.2140784993924627, + "learning_rate": 2.0927936233217094e-09, + "loss": 0.2531, + "step": 34169 + }, + { + "epoch": 0.99, + "grad_norm": 1.308752518218476, + "learning_rate": 2.079227055737376e-09, + "loss": 0.2639, + "step": 34170 + }, + { + "epoch": 0.99, + "grad_norm": 1.2955825179951066, + "learning_rate": 2.0657045948591083e-09, + "loss": 0.2655, + "step": 34171 + }, + { + "epoch": 0.99, + "grad_norm": 1.3377082641430438, + "learning_rate": 2.0522262408062544e-09, + "loss": 0.262, + "step": 34172 + }, + { + "epoch": 0.99, + "grad_norm": 0.9968108939311024, + "learning_rate": 2.0387919936981638e-09, + "loss": 0.6008, + "step": 34173 + }, + { + "epoch": 0.99, + "grad_norm": 1.2999694607920111, + "learning_rate": 2.025401853653075e-09, + "loss": 0.259, + "step": 34174 + }, + { + "epoch": 0.99, + "grad_norm": 1.354987344813813, + "learning_rate": 2.0120558207892273e-09, + "loss": 0.273, + "step": 34175 + }, + { + "epoch": 0.99, + "grad_norm": 1.2798918289913719, + "learning_rate": 1.9987538952243037e-09, + "loss": 0.2968, + "step": 34176 + }, + { + "epoch": 0.99, + "grad_norm": 1.3132020978060768, + "learning_rate": 1.985496077076543e-09, + "loss": 0.2559, + "step": 34177 + }, + { + "epoch": 0.99, + "grad_norm": 1.382097707076547, + "learning_rate": 1.9722823664614088e-09, + "loss": 0.2835, + "step": 34178 + }, + { + "epoch": 0.99, + "grad_norm": 1.2295184179613121, + "learning_rate": 1.9591127634965844e-09, + "loss": 0.2585, + "step": 34179 + }, + { + "epoch": 0.99, + "grad_norm": 1.4009242380457927, + "learning_rate": 1.9459872682980886e-09, + "loss": 0.2671, + "step": 34180 + }, + { + "epoch": 0.99, + "grad_norm": 1.480419904010724, + "learning_rate": 1.932905880981939e-09, + "loss": 0.2708, + "step": 34181 + }, + { + "epoch": 0.99, + "grad_norm": 1.2313503168133832, + "learning_rate": 1.919868601663044e-09, + "loss": 0.2523, + "step": 34182 + }, + { + "epoch": 0.99, + "grad_norm": 1.941027747212278, + "learning_rate": 1.906875430456867e-09, + "loss": 0.2798, + "step": 34183 + }, + { + "epoch": 0.99, + "grad_norm": 1.5248557329356416, + "learning_rate": 1.893926367477761e-09, + "loss": 0.2774, + "step": 34184 + }, + { + "epoch": 0.99, + "grad_norm": 1.3347104732811153, + "learning_rate": 1.8810214128406334e-09, + "loss": 0.2642, + "step": 34185 + }, + { + "epoch": 0.99, + "grad_norm": 0.957120929153844, + "learning_rate": 1.8681605666587277e-09, + "loss": 0.611, + "step": 34186 + }, + { + "epoch": 0.99, + "grad_norm": 1.2833927521712387, + "learning_rate": 1.8553438290452864e-09, + "loss": 0.2887, + "step": 34187 + }, + { + "epoch": 0.99, + "grad_norm": 1.3683958528787106, + "learning_rate": 1.8425712001146623e-09, + "loss": 0.2795, + "step": 34188 + }, + { + "epoch": 0.99, + "grad_norm": 1.3972578005748986, + "learning_rate": 1.829842679978433e-09, + "loss": 0.2671, + "step": 34189 + }, + { + "epoch": 0.99, + "grad_norm": 11.180402277349657, + "learning_rate": 1.817158268749286e-09, + "loss": 0.2774, + "step": 34190 + }, + { + "epoch": 0.99, + "grad_norm": 1.6288429048519781, + "learning_rate": 1.8045179665387992e-09, + "loss": 0.2811, + "step": 34191 + }, + { + "epoch": 0.99, + "grad_norm": 1.7584247531545094, + "learning_rate": 1.7919217734596595e-09, + "loss": 0.3076, + "step": 34192 + }, + { + "epoch": 0.99, + "grad_norm": 1.6168802875984318, + "learning_rate": 1.7793696896217794e-09, + "loss": 0.2705, + "step": 34193 + }, + { + "epoch": 0.99, + "grad_norm": 1.3503556375731725, + "learning_rate": 1.7668617151361811e-09, + "loss": 0.2944, + "step": 34194 + }, + { + "epoch": 0.99, + "grad_norm": 1.3374929388696322, + "learning_rate": 1.754397850113887e-09, + "loss": 0.2669, + "step": 34195 + }, + { + "epoch": 0.99, + "grad_norm": 1.3531007772945334, + "learning_rate": 1.741978094664254e-09, + "loss": 0.2612, + "step": 34196 + }, + { + "epoch": 0.99, + "grad_norm": 1.2666699078095636, + "learning_rate": 1.7296024488971942e-09, + "loss": 0.2644, + "step": 34197 + }, + { + "epoch": 0.99, + "grad_norm": 1.295588307873593, + "learning_rate": 1.7172709129220644e-09, + "loss": 0.2896, + "step": 34198 + }, + { + "epoch": 0.99, + "grad_norm": 1.4662524777639077, + "learning_rate": 1.7049834868471116e-09, + "loss": 0.2538, + "step": 34199 + }, + { + "epoch": 0.99, + "grad_norm": 1.3095539875160624, + "learning_rate": 1.6927401707811375e-09, + "loss": 0.2616, + "step": 34200 + }, + { + "epoch": 0.99, + "grad_norm": 1.2787619560134011, + "learning_rate": 1.680540964832389e-09, + "loss": 0.2544, + "step": 34201 + }, + { + "epoch": 0.99, + "grad_norm": 1.5203848398670115, + "learning_rate": 1.6683858691085575e-09, + "loss": 0.2862, + "step": 34202 + }, + { + "epoch": 0.99, + "grad_norm": 1.437716901432429, + "learning_rate": 1.6562748837162246e-09, + "loss": 0.2672, + "step": 34203 + }, + { + "epoch": 0.99, + "grad_norm": 1.2267200824861637, + "learning_rate": 1.6442080087625267e-09, + "loss": 0.2638, + "step": 34204 + }, + { + "epoch": 0.99, + "grad_norm": 2.4899579946481203, + "learning_rate": 1.6321852443546005e-09, + "loss": 0.2722, + "step": 34205 + }, + { + "epoch": 0.99, + "grad_norm": 1.485089254068316, + "learning_rate": 1.620206590597917e-09, + "loss": 0.2785, + "step": 34206 + }, + { + "epoch": 0.99, + "grad_norm": 1.3349552972264764, + "learning_rate": 1.6082720475985025e-09, + "loss": 0.2757, + "step": 34207 + }, + { + "epoch": 0.99, + "grad_norm": 1.3560812591984077, + "learning_rate": 1.5963816154612732e-09, + "loss": 0.2625, + "step": 34208 + }, + { + "epoch": 0.99, + "grad_norm": 1.6266018012720813, + "learning_rate": 1.5845352942917004e-09, + "loss": 0.271, + "step": 34209 + }, + { + "epoch": 0.99, + "grad_norm": 1.5087843178993317, + "learning_rate": 1.5727330841941445e-09, + "loss": 0.2738, + "step": 34210 + }, + { + "epoch": 0.99, + "grad_norm": 1.2884432904082155, + "learning_rate": 1.560974985272412e-09, + "loss": 0.2509, + "step": 34211 + }, + { + "epoch": 0.99, + "grad_norm": 2.7734410261869566, + "learning_rate": 1.5492609976303085e-09, + "loss": 0.2625, + "step": 34212 + }, + { + "epoch": 0.99, + "grad_norm": 1.4734630598788772, + "learning_rate": 1.5375911213716399e-09, + "loss": 0.2498, + "step": 34213 + }, + { + "epoch": 0.99, + "grad_norm": 1.3516988014203497, + "learning_rate": 1.5259653565996568e-09, + "loss": 0.2566, + "step": 34214 + }, + { + "epoch": 0.99, + "grad_norm": 1.564892331678899, + "learning_rate": 1.5143837034159447e-09, + "loss": 0.2946, + "step": 34215 + }, + { + "epoch": 0.99, + "grad_norm": 1.5694777286099375, + "learning_rate": 1.5028461619231994e-09, + "loss": 0.2692, + "step": 34216 + }, + { + "epoch": 0.99, + "grad_norm": 1.435298440900204, + "learning_rate": 1.491352732223561e-09, + "loss": 0.2548, + "step": 34217 + }, + { + "epoch": 0.99, + "grad_norm": 3.4636895060235267, + "learning_rate": 1.4799034144180603e-09, + "loss": 0.2604, + "step": 34218 + }, + { + "epoch": 0.99, + "grad_norm": 1.7744993990229614, + "learning_rate": 1.4684982086077272e-09, + "loss": 0.278, + "step": 34219 + }, + { + "epoch": 0.99, + "grad_norm": 1.4095980028071666, + "learning_rate": 1.457137114893592e-09, + "loss": 0.2727, + "step": 34220 + }, + { + "epoch": 0.99, + "grad_norm": 1.3832088874041695, + "learning_rate": 1.445820133375575e-09, + "loss": 0.2531, + "step": 34221 + }, + { + "epoch": 0.99, + "grad_norm": 1.3570460656681387, + "learning_rate": 1.4345472641535962e-09, + "loss": 0.2652, + "step": 34222 + }, + { + "epoch": 0.99, + "grad_norm": 1.4037941946909789, + "learning_rate": 1.4233185073270206e-09, + "loss": 0.2676, + "step": 34223 + }, + { + "epoch": 0.99, + "grad_norm": 1.2033907373224355, + "learning_rate": 1.4121338629957682e-09, + "loss": 0.2681, + "step": 34224 + }, + { + "epoch": 0.99, + "grad_norm": 1.5651844056250503, + "learning_rate": 1.4009933312569835e-09, + "loss": 0.296, + "step": 34225 + }, + { + "epoch": 0.99, + "grad_norm": 1.2327056785180206, + "learning_rate": 1.3898969122105864e-09, + "loss": 0.2817, + "step": 34226 + }, + { + "epoch": 0.99, + "grad_norm": 1.3290236096837786, + "learning_rate": 1.378844605953722e-09, + "loss": 0.2761, + "step": 34227 + }, + { + "epoch": 0.99, + "grad_norm": 1.4575794826035602, + "learning_rate": 1.367836412583534e-09, + "loss": 0.2529, + "step": 34228 + }, + { + "epoch": 0.99, + "grad_norm": 1.5417479309456616, + "learning_rate": 1.356872332198278e-09, + "loss": 0.2541, + "step": 34229 + }, + { + "epoch": 0.99, + "grad_norm": 0.8871175624501038, + "learning_rate": 1.3459523648934324e-09, + "loss": 0.5399, + "step": 34230 + }, + { + "epoch": 0.99, + "grad_norm": 1.3227648800519143, + "learning_rate": 1.3350765107666974e-09, + "loss": 0.2785, + "step": 34231 + }, + { + "epoch": 0.99, + "grad_norm": 1.301819711740371, + "learning_rate": 1.3242447699129969e-09, + "loss": 0.2561, + "step": 34232 + }, + { + "epoch": 0.99, + "grad_norm": 1.4442758531262785, + "learning_rate": 1.3134571424278098e-09, + "loss": 0.2635, + "step": 34233 + }, + { + "epoch": 0.99, + "grad_norm": 1.359636729602823, + "learning_rate": 1.3027136284077257e-09, + "loss": 0.2728, + "step": 34234 + }, + { + "epoch": 0.99, + "grad_norm": 1.3804781382652704, + "learning_rate": 1.2920142279460034e-09, + "loss": 0.2773, + "step": 34235 + }, + { + "epoch": 0.99, + "grad_norm": 1.308050678055632, + "learning_rate": 1.2813589411381222e-09, + "loss": 0.252, + "step": 34236 + }, + { + "epoch": 0.99, + "grad_norm": 1.3191782574980178, + "learning_rate": 1.2707477680773406e-09, + "loss": 0.2758, + "step": 34237 + }, + { + "epoch": 0.99, + "grad_norm": 1.4263776040060208, + "learning_rate": 1.2601807088580275e-09, + "loss": 0.2513, + "step": 34238 + }, + { + "epoch": 0.99, + "grad_norm": 1.7417376958419948, + "learning_rate": 1.2496577635728869e-09, + "loss": 0.2722, + "step": 34239 + }, + { + "epoch": 0.99, + "grad_norm": 1.3409742433914027, + "learning_rate": 1.239178932314622e-09, + "loss": 0.2625, + "step": 34240 + }, + { + "epoch": 0.99, + "grad_norm": 1.2186876011743888, + "learning_rate": 1.2287442151764917e-09, + "loss": 0.2692, + "step": 34241 + }, + { + "epoch": 0.99, + "grad_norm": 1.2490171594520287, + "learning_rate": 1.2183536122500895e-09, + "loss": 0.2532, + "step": 34242 + }, + { + "epoch": 0.99, + "grad_norm": 1.292539012308666, + "learning_rate": 1.2080071236270086e-09, + "loss": 0.2747, + "step": 34243 + }, + { + "epoch": 0.99, + "grad_norm": 1.9590877775984161, + "learning_rate": 1.1977047493988425e-09, + "loss": 0.2712, + "step": 34244 + }, + { + "epoch": 0.99, + "grad_norm": 1.6726087700911474, + "learning_rate": 1.1874464896566296e-09, + "loss": 0.2786, + "step": 34245 + }, + { + "epoch": 0.99, + "grad_norm": 1.7056171015480077, + "learning_rate": 1.1772323444902978e-09, + "loss": 0.283, + "step": 34246 + }, + { + "epoch": 0.99, + "grad_norm": 1.350596248670458, + "learning_rate": 1.1670623139903303e-09, + "loss": 0.284, + "step": 34247 + }, + { + "epoch": 0.99, + "grad_norm": 4.987515034575985, + "learning_rate": 1.1569363982466553e-09, + "loss": 0.3009, + "step": 34248 + }, + { + "epoch": 0.99, + "grad_norm": 1.298728632584643, + "learning_rate": 1.1468545973480904e-09, + "loss": 0.2675, + "step": 34249 + }, + { + "epoch": 0.99, + "grad_norm": 4.3465160840315304, + "learning_rate": 1.136816911384564e-09, + "loss": 0.2368, + "step": 34250 + }, + { + "epoch": 0.99, + "grad_norm": 1.701117050384174, + "learning_rate": 1.1268233404437835e-09, + "loss": 0.2708, + "step": 34251 + }, + { + "epoch": 0.99, + "grad_norm": 1.288032443386916, + "learning_rate": 1.1168738846145666e-09, + "loss": 0.2686, + "step": 34252 + }, + { + "epoch": 0.99, + "grad_norm": 1.4214183796815114, + "learning_rate": 1.106968543984066e-09, + "loss": 0.2703, + "step": 34253 + }, + { + "epoch": 0.99, + "grad_norm": 1.532337655455923, + "learning_rate": 1.0971073186399893e-09, + "loss": 0.2641, + "step": 34254 + }, + { + "epoch": 0.99, + "grad_norm": 1.203207050699272, + "learning_rate": 1.0872902086694893e-09, + "loss": 0.2583, + "step": 34255 + }, + { + "epoch": 0.99, + "grad_norm": 1.335517726496878, + "learning_rate": 1.0775172141591628e-09, + "loss": 0.2518, + "step": 34256 + }, + { + "epoch": 0.99, + "grad_norm": 1.7536095082747696, + "learning_rate": 1.0677883351950525e-09, + "loss": 0.2619, + "step": 34257 + }, + { + "epoch": 0.99, + "grad_norm": 1.3380359782314597, + "learning_rate": 1.0581035718632004e-09, + "loss": 0.2624, + "step": 34258 + }, + { + "epoch": 0.99, + "grad_norm": 1.414119286766135, + "learning_rate": 1.048462924249094e-09, + "loss": 0.2576, + "step": 34259 + }, + { + "epoch": 0.99, + "grad_norm": 1.50968148925985, + "learning_rate": 1.0388663924376653e-09, + "loss": 0.2981, + "step": 34260 + }, + { + "epoch": 0.99, + "grad_norm": 1.422180692317844, + "learning_rate": 1.029313976513846e-09, + "loss": 0.2589, + "step": 34261 + }, + { + "epoch": 0.99, + "grad_norm": 1.4192093778959545, + "learning_rate": 1.0198056765614584e-09, + "loss": 0.2674, + "step": 34262 + }, + { + "epoch": 0.99, + "grad_norm": 1.5614484916688693, + "learning_rate": 1.0103414926654343e-09, + "loss": 0.2756, + "step": 34263 + }, + { + "epoch": 0.99, + "grad_norm": 1.29436498843235, + "learning_rate": 1.0009214249079302e-09, + "loss": 0.2802, + "step": 34264 + }, + { + "epoch": 0.99, + "grad_norm": 1.2961550501356713, + "learning_rate": 9.915454733733232e-10, + "loss": 0.2595, + "step": 34265 + }, + { + "epoch": 0.99, + "grad_norm": 1.2820046789201711, + "learning_rate": 9.822136381432146e-10, + "loss": 0.2529, + "step": 34266 + }, + { + "epoch": 0.99, + "grad_norm": 1.3974414471502468, + "learning_rate": 9.729259193008712e-10, + "loss": 0.2914, + "step": 34267 + }, + { + "epoch": 0.99, + "grad_norm": 1.5437951085795316, + "learning_rate": 9.636823169278941e-10, + "loss": 0.2817, + "step": 34268 + }, + { + "epoch": 0.99, + "grad_norm": 12.57379858457222, + "learning_rate": 9.54482831105885e-10, + "loss": 0.2472, + "step": 34269 + }, + { + "epoch": 0.99, + "grad_norm": 3.431390802756403, + "learning_rate": 9.453274619158902e-10, + "loss": 0.2895, + "step": 34270 + }, + { + "epoch": 0.99, + "grad_norm": 1.3408193802434083, + "learning_rate": 9.362162094384008e-10, + "loss": 0.2593, + "step": 34271 + }, + { + "epoch": 0.99, + "grad_norm": 1.328701734807903, + "learning_rate": 9.271490737550182e-10, + "loss": 0.2707, + "step": 34272 + }, + { + "epoch": 0.99, + "grad_norm": 1.3498183199169795, + "learning_rate": 9.181260549445681e-10, + "loss": 0.2569, + "step": 34273 + }, + { + "epoch": 0.99, + "grad_norm": 1.4207350219813597, + "learning_rate": 9.091471530875417e-10, + "loss": 0.2566, + "step": 34274 + }, + { + "epoch": 0.99, + "grad_norm": 1.3860192975329422, + "learning_rate": 9.002123682622099e-10, + "loss": 0.2577, + "step": 34275 + }, + { + "epoch": 0.99, + "grad_norm": 1.3188719978154257, + "learning_rate": 8.913217005485086e-10, + "loss": 0.2611, + "step": 34276 + }, + { + "epoch": 0.99, + "grad_norm": 1.2891616375267876, + "learning_rate": 8.824751500241535e-10, + "loss": 0.2878, + "step": 34277 + }, + { + "epoch": 0.99, + "grad_norm": 1.3633973800000923, + "learning_rate": 8.736727167668602e-10, + "loss": 0.2679, + "step": 34278 + }, + { + "epoch": 0.99, + "grad_norm": 1.3539235258189255, + "learning_rate": 8.649144008554544e-10, + "loss": 0.2643, + "step": 34279 + }, + { + "epoch": 0.99, + "grad_norm": 1.3244687599044904, + "learning_rate": 8.562002023665417e-10, + "loss": 0.2615, + "step": 34280 + }, + { + "epoch": 0.99, + "grad_norm": 1.2630732193648078, + "learning_rate": 8.475301213772824e-10, + "loss": 0.2689, + "step": 34281 + }, + { + "epoch": 0.99, + "grad_norm": 1.6878210712208992, + "learning_rate": 8.389041579637269e-10, + "loss": 0.2575, + "step": 34282 + }, + { + "epoch": 0.99, + "grad_norm": 2.175630846687431, + "learning_rate": 8.303223122024806e-10, + "loss": 0.2968, + "step": 34283 + }, + { + "epoch": 0.99, + "grad_norm": 0.9563808246064704, + "learning_rate": 8.217845841690386e-10, + "loss": 0.5466, + "step": 34284 + }, + { + "epoch": 0.99, + "grad_norm": 1.2683303941432809, + "learning_rate": 8.132909739388961e-10, + "loss": 0.2532, + "step": 34285 + }, + { + "epoch": 0.99, + "grad_norm": 2.5005378466393253, + "learning_rate": 8.048414815864381e-10, + "loss": 0.2672, + "step": 34286 + }, + { + "epoch": 0.99, + "grad_norm": 1.3480126306058247, + "learning_rate": 7.964361071871595e-10, + "loss": 0.2707, + "step": 34287 + }, + { + "epoch": 0.99, + "grad_norm": 1.4170966684860538, + "learning_rate": 7.880748508148905e-10, + "loss": 0.2579, + "step": 34288 + }, + { + "epoch": 0.99, + "grad_norm": 1.4802241361707915, + "learning_rate": 7.797577125429057e-10, + "loss": 0.2608, + "step": 34289 + }, + { + "epoch": 0.99, + "grad_norm": 1.3394162568623895, + "learning_rate": 7.714846924455899e-10, + "loss": 0.2721, + "step": 34290 + }, + { + "epoch": 0.99, + "grad_norm": 1.303370414559833, + "learning_rate": 7.632557905951077e-10, + "loss": 0.2454, + "step": 34291 + }, + { + "epoch": 0.99, + "grad_norm": 1.2399208121148444, + "learning_rate": 7.550710070641786e-10, + "loss": 0.2676, + "step": 34292 + }, + { + "epoch": 0.99, + "grad_norm": 1.6163746466953535, + "learning_rate": 7.469303419255225e-10, + "loss": 0.2617, + "step": 34293 + }, + { + "epoch": 0.99, + "grad_norm": 1.2962577401263262, + "learning_rate": 7.388337952501934e-10, + "loss": 0.2725, + "step": 34294 + }, + { + "epoch": 0.99, + "grad_norm": 1.4518415396603614, + "learning_rate": 7.307813671103558e-10, + "loss": 0.2594, + "step": 34295 + }, + { + "epoch": 0.99, + "grad_norm": 1.4020310820241766, + "learning_rate": 7.227730575770642e-10, + "loss": 0.2506, + "step": 34296 + }, + { + "epoch": 0.99, + "grad_norm": 1.3413722520415121, + "learning_rate": 7.148088667202624e-10, + "loss": 0.2632, + "step": 34297 + }, + { + "epoch": 0.99, + "grad_norm": 1.4552697296175119, + "learning_rate": 7.068887946110048e-10, + "loss": 0.2467, + "step": 34298 + }, + { + "epoch": 0.99, + "grad_norm": 1.4527426367037162, + "learning_rate": 6.990128413192354e-10, + "loss": 0.2868, + "step": 34299 + }, + { + "epoch": 0.99, + "grad_norm": 1.418717493450318, + "learning_rate": 6.911810069137881e-10, + "loss": 0.2715, + "step": 34300 + }, + { + "epoch": 0.99, + "grad_norm": 1.6498555121334497, + "learning_rate": 6.833932914640517e-10, + "loss": 0.2829, + "step": 34301 + }, + { + "epoch": 0.99, + "grad_norm": 1.315625456415437, + "learning_rate": 6.756496950388602e-10, + "loss": 0.2787, + "step": 34302 + }, + { + "epoch": 0.99, + "grad_norm": 1.280870997910224, + "learning_rate": 6.679502177070474e-10, + "loss": 0.266, + "step": 34303 + }, + { + "epoch": 0.99, + "grad_norm": 1.5832802253878253, + "learning_rate": 6.602948595352266e-10, + "loss": 0.2752, + "step": 34304 + }, + { + "epoch": 1.0, + "grad_norm": 1.3056654215595767, + "learning_rate": 6.526836205922316e-10, + "loss": 0.2609, + "step": 34305 + }, + { + "epoch": 1.0, + "grad_norm": 1.5802248714554614, + "learning_rate": 6.451165009446758e-10, + "loss": 0.2738, + "step": 34306 + }, + { + "epoch": 1.0, + "grad_norm": 1.3623057743693192, + "learning_rate": 6.375935006591727e-10, + "loss": 0.2662, + "step": 34307 + }, + { + "epoch": 1.0, + "grad_norm": 2.435525749475048, + "learning_rate": 6.301146198028907e-10, + "loss": 0.2781, + "step": 34308 + }, + { + "epoch": 1.0, + "grad_norm": 1.821141652215764, + "learning_rate": 6.226798584407778e-10, + "loss": 0.2988, + "step": 34309 + }, + { + "epoch": 1.0, + "grad_norm": 1.2997319359301247, + "learning_rate": 6.152892166394475e-10, + "loss": 0.272, + "step": 34310 + }, + { + "epoch": 1.0, + "grad_norm": 1.3506745338161916, + "learning_rate": 6.079426944632926e-10, + "loss": 0.2572, + "step": 34311 + }, + { + "epoch": 1.0, + "grad_norm": 1.6346833498816897, + "learning_rate": 6.006402919778165e-10, + "loss": 0.2894, + "step": 34312 + }, + { + "epoch": 1.0, + "grad_norm": 1.2417090740713448, + "learning_rate": 5.933820092468567e-10, + "loss": 0.2567, + "step": 34313 + }, + { + "epoch": 1.0, + "grad_norm": 1.369110764045404, + "learning_rate": 5.861678463348064e-10, + "loss": 0.265, + "step": 34314 + }, + { + "epoch": 1.0, + "grad_norm": 1.3181366691380985, + "learning_rate": 5.789978033055033e-10, + "loss": 0.2696, + "step": 34315 + }, + { + "epoch": 1.0, + "grad_norm": 1.282912921095376, + "learning_rate": 5.718718802216749e-10, + "loss": 0.2616, + "step": 34316 + }, + { + "epoch": 1.0, + "grad_norm": 1.4971083329394792, + "learning_rate": 5.647900771471592e-10, + "loss": 0.2759, + "step": 34317 + }, + { + "epoch": 1.0, + "grad_norm": 1.3195725589158116, + "learning_rate": 5.577523941430185e-10, + "loss": 0.2878, + "step": 34318 + }, + { + "epoch": 1.0, + "grad_norm": 1.3520769137380468, + "learning_rate": 5.507588312725354e-10, + "loss": 0.2669, + "step": 34319 + }, + { + "epoch": 1.0, + "grad_norm": 1.8242482217306153, + "learning_rate": 5.438093885973273e-10, + "loss": 0.2766, + "step": 34320 + }, + { + "epoch": 1.0, + "grad_norm": 1.7073006096715833, + "learning_rate": 5.369040661779013e-10, + "loss": 0.2747, + "step": 34321 + }, + { + "epoch": 1.0, + "grad_norm": 1.6190303641469872, + "learning_rate": 5.300428640764299e-10, + "loss": 0.2626, + "step": 34322 + }, + { + "epoch": 1.0, + "grad_norm": 1.2372551945189905, + "learning_rate": 5.232257823523101e-10, + "loss": 0.2468, + "step": 34323 + }, + { + "epoch": 1.0, + "grad_norm": 1.3956418949171423, + "learning_rate": 5.164528210666042e-10, + "loss": 0.2597, + "step": 34324 + }, + { + "epoch": 1.0, + "grad_norm": 1.7118436981856915, + "learning_rate": 5.09723980278154e-10, + "loss": 0.293, + "step": 34325 + }, + { + "epoch": 1.0, + "grad_norm": 1.3600995053444846, + "learning_rate": 5.030392600469114e-10, + "loss": 0.2832, + "step": 34326 + }, + { + "epoch": 1.0, + "grad_norm": 1.409988497151393, + "learning_rate": 4.963986604322735e-10, + "loss": 0.2608, + "step": 34327 + }, + { + "epoch": 1.0, + "grad_norm": 1.3777381886609563, + "learning_rate": 4.898021814919718e-10, + "loss": 0.2573, + "step": 34328 + }, + { + "epoch": 1.0, + "grad_norm": 1.3087573143190017, + "learning_rate": 4.832498232848481e-10, + "loss": 0.2371, + "step": 34329 + }, + { + "epoch": 1.0, + "grad_norm": 0.9617046653499384, + "learning_rate": 4.767415858680791e-10, + "loss": 0.5414, + "step": 34330 + }, + { + "epoch": 1.0, + "grad_norm": 1.3985927339696247, + "learning_rate": 4.702774692999512e-10, + "loss": 0.2812, + "step": 34331 + }, + { + "epoch": 1.0, + "grad_norm": 1.0105158680521211, + "learning_rate": 4.63857473637086e-10, + "loss": 0.5528, + "step": 34332 + }, + { + "epoch": 1.0, + "grad_norm": 1.4439240207910335, + "learning_rate": 4.574815989361048e-10, + "loss": 0.2815, + "step": 34333 + }, + { + "epoch": 1.0, + "grad_norm": 1.363941617142091, + "learning_rate": 4.511498452530738e-10, + "loss": 0.2953, + "step": 34334 + }, + { + "epoch": 1.0, + "grad_norm": 1.4094815851024618, + "learning_rate": 4.448622126440594e-10, + "loss": 0.2762, + "step": 34335 + }, + { + "epoch": 1.0, + "grad_norm": 1.3597288888479873, + "learning_rate": 4.386187011651277e-10, + "loss": 0.2883, + "step": 34336 + }, + { + "epoch": 1.0, + "grad_norm": 1.3345478562683961, + "learning_rate": 4.324193108701247e-10, + "loss": 0.2693, + "step": 34337 + }, + { + "epoch": 1.0, + "grad_norm": 1.301275698702963, + "learning_rate": 4.262640418151165e-10, + "loss": 0.273, + "step": 34338 + }, + { + "epoch": 1.0, + "grad_norm": 1.4601822034798693, + "learning_rate": 4.2015289405339386e-10, + "loss": 0.263, + "step": 34339 + }, + { + "epoch": 1.0, + "grad_norm": 1.551350671722919, + "learning_rate": 4.1408586763935776e-10, + "loss": 0.2817, + "step": 34340 + }, + { + "epoch": 1.0, + "grad_norm": 1.2624631945955365, + "learning_rate": 4.08062962626854e-10, + "loss": 0.2362, + "step": 34341 + }, + { + "epoch": 1.0, + "grad_norm": 1.4838561188725563, + "learning_rate": 4.020841790686181e-10, + "loss": 0.2441, + "step": 34342 + }, + { + "epoch": 1.0, + "grad_norm": 1.2930501645569372, + "learning_rate": 3.9614951701683056e-10, + "loss": 0.2969, + "step": 34343 + }, + { + "epoch": 1.0, + "grad_norm": 1.9968449910489805, + "learning_rate": 3.902589765247822e-10, + "loss": 0.2466, + "step": 34344 + }, + { + "epoch": 1.0, + "grad_norm": 1.4576045314351207, + "learning_rate": 3.8441255764465335e-10, + "loss": 0.2853, + "step": 34345 + }, + { + "epoch": 1.0, + "grad_norm": 1.7016752314023984, + "learning_rate": 3.7861026042695927e-10, + "loss": 0.2577, + "step": 34346 + }, + { + "epoch": 1.0, + "grad_norm": 1.4488073734603526, + "learning_rate": 3.7285208492388035e-10, + "loss": 0.2649, + "step": 34347 + }, + { + "epoch": 1.0, + "grad_norm": 1.3508575217135355, + "learning_rate": 3.671380311853767e-10, + "loss": 0.2589, + "step": 34348 + }, + { + "epoch": 1.0, + "grad_norm": 1.291914216823944, + "learning_rate": 3.614680992625186e-10, + "loss": 0.2846, + "step": 34349 + }, + { + "epoch": 1.0, + "grad_norm": 1.6175859498305762, + "learning_rate": 3.5584228920526596e-10, + "loss": 0.27, + "step": 34350 + }, + { + "epoch": 1.0, + "grad_norm": 1.686870347674965, + "learning_rate": 3.5026060106302384e-10, + "loss": 0.2826, + "step": 34351 + }, + { + "epoch": 1.0, + "grad_norm": 1.5385527567395954, + "learning_rate": 3.447230348857522e-10, + "loss": 0.2569, + "step": 34352 + }, + { + "epoch": 1.0, + "grad_norm": 1.4472185666177317, + "learning_rate": 3.3922959072119066e-10, + "loss": 0.2665, + "step": 34353 + }, + { + "epoch": 1.0, + "grad_norm": 1.322434904560908, + "learning_rate": 3.33780268618189e-10, + "loss": 0.253, + "step": 34354 + }, + { + "epoch": 1.0, + "grad_norm": 1.582357707991831, + "learning_rate": 3.283750686255971e-10, + "loss": 0.2672, + "step": 34355 + }, + { + "epoch": 1.0, + "grad_norm": 1.370077738399118, + "learning_rate": 3.2301399079059937e-10, + "loss": 0.2825, + "step": 34356 + }, + { + "epoch": 1.0, + "grad_norm": 1.3682919977793062, + "learning_rate": 3.176970351598252e-10, + "loss": 0.2711, + "step": 34357 + }, + { + "epoch": 1.0, + "grad_norm": 1.2986002798111886, + "learning_rate": 3.124242017815693e-10, + "loss": 0.2655, + "step": 34358 + }, + { + "epoch": 1.0, + "grad_norm": 1.3297866648344556, + "learning_rate": 3.071954907007957e-10, + "loss": 0.2719, + "step": 34359 + }, + { + "epoch": 1.0, + "grad_norm": 1.414177168734986, + "learning_rate": 3.02010901965244e-10, + "loss": 0.259, + "step": 34360 + }, + { + "epoch": 1.0, + "grad_norm": 1.3761797453863924, + "learning_rate": 2.9687043561987814e-10, + "loss": 0.2586, + "step": 34361 + }, + { + "epoch": 1.0, + "grad_norm": 2.359585917967585, + "learning_rate": 2.917740917096623e-10, + "loss": 0.2559, + "step": 34362 + }, + { + "epoch": 1.0, + "grad_norm": 1.340109904706074, + "learning_rate": 2.8672187028011554e-10, + "loss": 0.2557, + "step": 34363 + }, + { + "epoch": 1.0, + "grad_norm": 1.3517934680541372, + "learning_rate": 2.817137713756468e-10, + "loss": 0.2654, + "step": 34364 + }, + { + "epoch": 1.0, + "grad_norm": 1.0392937585279576, + "learning_rate": 2.76749795040665e-10, + "loss": 0.5926, + "step": 34365 + }, + { + "epoch": 1.0, + "grad_norm": 1.4421434136076183, + "learning_rate": 2.7182994131902393e-10, + "loss": 0.255, + "step": 34366 + }, + { + "epoch": 1.0, + "grad_norm": 1.4109587013192555, + "learning_rate": 2.6695421025402234e-10, + "loss": 0.2487, + "step": 34367 + }, + { + "epoch": 1.0, + "grad_norm": 1.5062171822203272, + "learning_rate": 2.6212260188784864e-10, + "loss": 0.2703, + "step": 34368 + }, + { + "epoch": 1.0, + "grad_norm": 1.2416809291994957, + "learning_rate": 2.573351162649118e-10, + "loss": 0.2763, + "step": 34369 + }, + { + "epoch": 1.0, + "grad_norm": 1.4643397060197996, + "learning_rate": 2.5259175342573495e-10, + "loss": 0.2544, + "step": 34370 + }, + { + "epoch": 1.0, + "grad_norm": 1.5036220783918468, + "learning_rate": 2.4789251341306164e-10, + "loss": 0.2732, + "step": 34371 + }, + { + "epoch": 1.0, + "grad_norm": 1.3570027023743432, + "learning_rate": 2.4323739626852526e-10, + "loss": 0.255, + "step": 34372 + }, + { + "epoch": 1.0, + "grad_norm": 1.4102767864797445, + "learning_rate": 2.38626402032649e-10, + "loss": 0.2727, + "step": 34373 + }, + { + "epoch": 1.0, + "grad_norm": 1.4116041456887511, + "learning_rate": 2.3405953074595586e-10, + "loss": 0.2635, + "step": 34374 + }, + { + "epoch": 1.0, + "grad_norm": 1.4081464880516528, + "learning_rate": 2.2953678244952425e-10, + "loss": 0.2798, + "step": 34375 + }, + { + "epoch": 1.0, + "grad_norm": 1.2356350800584657, + "learning_rate": 2.2505815718276702e-10, + "loss": 0.2851, + "step": 34376 + }, + { + "epoch": 1.0, + "grad_norm": 1.369350014331245, + "learning_rate": 2.206236549856522e-10, + "loss": 0.2598, + "step": 34377 + }, + { + "epoch": 1.0, + "grad_norm": 1.3824868642988113, + "learning_rate": 2.162332758970376e-10, + "loss": 0.2734, + "step": 34378 + }, + { + "epoch": 1.0, + "grad_norm": 1.3144423491384996, + "learning_rate": 2.1188701995522587e-10, + "loss": 0.2707, + "step": 34379 + }, + { + "epoch": 1.0, + "grad_norm": 1.278625514389077, + "learning_rate": 2.0758488719907488e-10, + "loss": 0.2498, + "step": 34380 + }, + { + "epoch": 1.0, + "grad_norm": 1.4123658603617244, + "learning_rate": 2.033268776663322e-10, + "loss": 0.2928, + "step": 34381 + }, + { + "epoch": 1.0, + "grad_norm": 1.2936952521859895, + "learning_rate": 1.991129913953005e-10, + "loss": 0.2708, + "step": 34382 + }, + { + "epoch": 1.0, + "grad_norm": 1.6402989297455384, + "learning_rate": 1.9494322842206203e-10, + "loss": 0.3076, + "step": 34383 + }, + { + "epoch": 1.0, + "grad_norm": 1.3865751934411115, + "learning_rate": 1.9081758878380928e-10, + "loss": 0.2793, + "step": 34384 + }, + { + "epoch": 1.0, + "grad_norm": 1.4912481242250761, + "learning_rate": 1.8673607251717963e-10, + "loss": 0.2754, + "step": 34385 + }, + { + "epoch": 1.0, + "grad_norm": 1.4648961028286536, + "learning_rate": 1.8269867965825528e-10, + "loss": 0.2737, + "step": 34386 + }, + { + "epoch": 1.0, + "grad_norm": 1.2629353064214253, + "learning_rate": 1.7870541024256337e-10, + "loss": 0.2592, + "step": 34387 + }, + { + "epoch": 1.0, + "grad_norm": 1.2816862152326502, + "learning_rate": 1.7475626430507599e-10, + "loss": 0.2584, + "step": 34388 + }, + { + "epoch": 1.0, + "grad_norm": 1.5064491874311818, + "learning_rate": 1.7085124188076508e-10, + "loss": 0.263, + "step": 34389 + }, + { + "epoch": 1.0, + "grad_norm": 1.396636663818769, + "learning_rate": 1.6699034300404758e-10, + "loss": 0.2476, + "step": 34390 + }, + { + "epoch": 1.0, + "grad_norm": 0.9129178824045228, + "learning_rate": 1.6317356770934045e-10, + "loss": 0.5142, + "step": 34391 + }, + { + "epoch": 1.0, + "grad_norm": 2.1581961177233575, + "learning_rate": 1.5940091602995035e-10, + "loss": 0.2752, + "step": 34392 + }, + { + "epoch": 1.0, + "grad_norm": 1.4631642172218873, + "learning_rate": 1.5567238799973905e-10, + "loss": 0.2781, + "step": 34393 + }, + { + "epoch": 1.0, + "grad_norm": 1.3544904061465928, + "learning_rate": 1.5198798365034795e-10, + "loss": 0.2451, + "step": 34394 + }, + { + "epoch": 1.0, + "grad_norm": 1.280610831681891, + "learning_rate": 1.4834770301563884e-10, + "loss": 0.2658, + "step": 34395 + }, + { + "epoch": 1.0, + "grad_norm": 1.402414492482225, + "learning_rate": 1.4475154612725307e-10, + "loss": 0.2788, + "step": 34396 + }, + { + "epoch": 1.0, + "grad_norm": 1.391960525581136, + "learning_rate": 1.41199513016832e-10, + "loss": 0.2698, + "step": 34397 + }, + { + "epoch": 1.0, + "grad_norm": 1.3515032030943253, + "learning_rate": 1.3769160371601697e-10, + "loss": 0.2794, + "step": 34398 + }, + { + "epoch": 1.0, + "grad_norm": 1.3948634754221358, + "learning_rate": 1.3422781825533916e-10, + "loss": 0.2596, + "step": 34399 + }, + { + "epoch": 1.0, + "grad_norm": 1.346674121863401, + "learning_rate": 1.3080815666532965e-10, + "loss": 0.3103, + "step": 34400 + }, + { + "epoch": 1.0, + "grad_norm": 3.032850410752809, + "learning_rate": 1.2743261897651959e-10, + "loss": 0.2704, + "step": 34401 + }, + { + "epoch": 1.0, + "grad_norm": 1.2751380346946044, + "learning_rate": 1.2410120521888503e-10, + "loss": 0.2551, + "step": 34402 + }, + { + "epoch": 1.0, + "grad_norm": 1.3313817051628796, + "learning_rate": 1.2081391542129173e-10, + "loss": 0.2873, + "step": 34403 + }, + { + "epoch": 1.0, + "grad_norm": 1.2938023345378071, + "learning_rate": 1.175707496131606e-10, + "loss": 0.3035, + "step": 34404 + }, + { + "epoch": 1.0, + "grad_norm": 1.0157669564309708, + "learning_rate": 1.1437170782280238e-10, + "loss": 0.5739, + "step": 34405 + }, + { + "epoch": 1.0, + "grad_norm": 1.2857795289883223, + "learning_rate": 1.112167900785277e-10, + "loss": 0.2946, + "step": 34406 + }, + { + "epoch": 1.0, + "grad_norm": 1.319962092408908, + "learning_rate": 1.0810599640809216e-10, + "loss": 0.2598, + "step": 34407 + }, + { + "epoch": 1.0, + "grad_norm": 1.439506419894959, + "learning_rate": 1.0503932683925133e-10, + "loss": 0.2559, + "step": 34408 + }, + { + "epoch": 1.0, + "grad_norm": 1.4262232039737957, + "learning_rate": 1.0201678139920568e-10, + "loss": 0.2567, + "step": 34409 + }, + { + "epoch": 1.0, + "grad_norm": 1.5084351623148933, + "learning_rate": 9.903836011404544e-11, + "loss": 0.2745, + "step": 34410 + }, + { + "epoch": 1.0, + "grad_norm": 1.6561061207453256, + "learning_rate": 9.610406301041597e-11, + "loss": 0.275, + "step": 34411 + }, + { + "epoch": 1.0, + "grad_norm": 1.0012976054702507, + "learning_rate": 9.32138901138524e-11, + "loss": 0.5454, + "step": 34412 + }, + { + "epoch": 1.0, + "grad_norm": 1.332521441820664, + "learning_rate": 9.036784145044497e-11, + "loss": 0.2866, + "step": 34413 + }, + { + "epoch": 1.0, + "grad_norm": 1.5736593831620929, + "learning_rate": 8.756591704461859e-11, + "loss": 0.2696, + "step": 34414 + }, + { + "epoch": 1.0, + "grad_norm": 1.4049433859887916, + "learning_rate": 8.480811692190838e-11, + "loss": 0.2416, + "step": 34415 + }, + { + "epoch": 1.0, + "grad_norm": 1.9201272786393349, + "learning_rate": 8.209444110618414e-11, + "loss": 0.2954, + "step": 34416 + }, + { + "epoch": 1.0, + "grad_norm": 1.3860216420166527, + "learning_rate": 7.942488962131567e-11, + "loss": 0.2801, + "step": 34417 + }, + { + "epoch": 1.0, + "grad_norm": 1.031138681349205, + "learning_rate": 7.679946249117276e-11, + "loss": 0.5347, + "step": 34418 + }, + { + "epoch": 1.0, + "grad_norm": 1.2558183247770058, + "learning_rate": 7.421815973851499e-11, + "loss": 0.255, + "step": 34419 + }, + { + "epoch": 1.0, + "grad_norm": 1.4013207964208185, + "learning_rate": 7.168098138610191e-11, + "loss": 0.2884, + "step": 34420 + }, + { + "epoch": 1.0, + "grad_norm": 1.3450687361062015, + "learning_rate": 6.918792745724822e-11, + "loss": 0.2486, + "step": 34421 + }, + { + "epoch": 1.0, + "grad_norm": 1.4394009704783715, + "learning_rate": 6.673899797304817e-11, + "loss": 0.2869, + "step": 34422 + }, + { + "epoch": 1.0, + "grad_norm": 1.5114492073155597, + "learning_rate": 6.433419295515109e-11, + "loss": 0.2752, + "step": 34423 + }, + { + "epoch": 1.0, + "grad_norm": 1.2684069157105538, + "learning_rate": 6.197351242465122e-11, + "loss": 0.2552, + "step": 34424 + }, + { + "epoch": 1.0, + "grad_norm": 1.2212300587534488, + "learning_rate": 5.965695640319791e-11, + "loss": 0.2789, + "step": 34425 + }, + { + "epoch": 1.0, + "grad_norm": 1.4539402790276525, + "learning_rate": 5.7384524910775176e-11, + "loss": 0.2683, + "step": 34426 + }, + { + "epoch": 1.0, + "grad_norm": 1.4931670044770198, + "learning_rate": 5.5156217967367034e-11, + "loss": 0.2822, + "step": 34427 + }, + { + "epoch": 1.0, + "grad_norm": 1.4348152742257423, + "learning_rate": 5.297203559240238e-11, + "loss": 0.2538, + "step": 34428 + }, + { + "epoch": 1.0, + "grad_norm": 1.2940503707078432, + "learning_rate": 5.083197780586524e-11, + "loss": 0.2735, + "step": 34429 + }, + { + "epoch": 1.0, + "grad_norm": 1.3525325946507987, + "learning_rate": 4.873604462551918e-11, + "loss": 0.2687, + "step": 34430 + }, + { + "epoch": 1.0, + "grad_norm": 1.3792762253630275, + "learning_rate": 4.66842360713482e-11, + "loss": 0.2749, + "step": 34431 + }, + { + "epoch": 1.0, + "grad_norm": 1.551266343936395, + "learning_rate": 4.467655216000566e-11, + "loss": 0.2636, + "step": 34432 + }, + { + "epoch": 1.0, + "grad_norm": 1.600087623482149, + "learning_rate": 4.271299291036535e-11, + "loss": 0.2837, + "step": 34433 + }, + { + "epoch": 1.0, + "grad_norm": 1.462670616435877, + "learning_rate": 4.0793558338525496e-11, + "loss": 0.2971, + "step": 34434 + }, + { + "epoch": 1.0, + "grad_norm": 1.964135762422746, + "learning_rate": 3.8918248462804783e-11, + "loss": 0.2671, + "step": 34435 + }, + { + "epoch": 1.0, + "grad_norm": 1.2987062793487294, + "learning_rate": 3.708706329874634e-11, + "loss": 0.2504, + "step": 34436 + }, + { + "epoch": 1.0, + "grad_norm": 1.333478528959848, + "learning_rate": 3.53000028630035e-11, + "loss": 0.2743, + "step": 34437 + }, + { + "epoch": 1.0, + "grad_norm": 1.6981430996196147, + "learning_rate": 3.3557067171674504e-11, + "loss": 0.2631, + "step": 34438 + }, + { + "epoch": 1.0, + "grad_norm": 1.5509001173450976, + "learning_rate": 3.185825623919225e-11, + "loss": 0.2771, + "step": 34439 + }, + { + "epoch": 1.0, + "grad_norm": 1.4975300173195334, + "learning_rate": 3.0203570081099866e-11, + "loss": 0.2522, + "step": 34440 + }, + { + "epoch": 1.0, + "grad_norm": 1.5199504599535294, + "learning_rate": 2.859300871183024e-11, + "loss": 0.2769, + "step": 34441 + }, + { + "epoch": 1.0, + "grad_norm": 2.261035656056417, + "learning_rate": 2.7026572145816275e-11, + "loss": 0.2468, + "step": 34442 + }, + { + "epoch": 1.0, + "grad_norm": 1.3150605335737644, + "learning_rate": 2.5504260396380655e-11, + "loss": 0.2563, + "step": 34443 + }, + { + "epoch": 1.0, + "grad_norm": 1.2332444039224417, + "learning_rate": 2.402607347795627e-11, + "loss": 0.2435, + "step": 34444 + }, + { + "epoch": 1.0, + "grad_norm": 1.5222097099536964, + "learning_rate": 2.2592011402200466e-11, + "loss": 0.2763, + "step": 34445 + }, + { + "epoch": 1.0, + "grad_norm": 1.3211320265410884, + "learning_rate": 2.120207418354614e-11, + "loss": 0.287, + "step": 34446 + }, + { + "epoch": 1.0, + "grad_norm": 1.4547817930274611, + "learning_rate": 1.9856261832540413e-11, + "loss": 0.2581, + "step": 34447 + }, + { + "epoch": 1.0, + "grad_norm": 1.3858620080861117, + "learning_rate": 1.855457436195085e-11, + "loss": 0.2703, + "step": 34448 + }, + { + "epoch": 1.0, + "grad_norm": 1.338571179114177, + "learning_rate": 1.729701178287968e-11, + "loss": 0.273, + "step": 34449 + }, + { + "epoch": 1.0, + "grad_norm": 1.369415403655994, + "learning_rate": 1.6083574106984244e-11, + "loss": 0.2574, + "step": 34450 + }, + { + "epoch": 1.0, + "grad_norm": 1.4047009024566939, + "learning_rate": 1.4914261344256552e-11, + "loss": 0.3185, + "step": 34451 + }, + { + "epoch": 1.0, + "grad_norm": 1.35292551215894, + "learning_rate": 1.3789073505798833e-11, + "loss": 0.2533, + "step": 34452 + }, + { + "epoch": 1.0, + "grad_norm": 1.4179811559580542, + "learning_rate": 1.2708010600492871e-11, + "loss": 0.2648, + "step": 34453 + }, + { + "epoch": 1.0, + "grad_norm": 1.2586887915778904, + "learning_rate": 1.1671072639440895e-11, + "loss": 0.2697, + "step": 34454 + }, + { + "epoch": 1.0, + "grad_norm": 1.29258516060778, + "learning_rate": 1.0678259629859355e-11, + "loss": 0.2797, + "step": 34455 + }, + { + "epoch": 1.0, + "grad_norm": 1.4298283245123184, + "learning_rate": 9.729571582295372e-12, + "loss": 0.2661, + "step": 34456 + }, + { + "epoch": 1.0, + "grad_norm": 1.406116660635767, + "learning_rate": 8.825008503965393e-12, + "loss": 0.2636, + "step": 34457 + }, + { + "epoch": 1.0, + "grad_norm": 1.4139163945121804, + "learning_rate": 7.964570403196093e-12, + "loss": 0.2758, + "step": 34458 + }, + { + "epoch": 1.0, + "grad_norm": 1.3474147148690472, + "learning_rate": 7.1482572877590304e-12, + "loss": 0.2613, + "step": 34459 + }, + { + "epoch": 1.0, + "grad_norm": 1.2534012950600784, + "learning_rate": 6.376069164315546e-12, + "loss": 0.2533, + "step": 34460 + }, + { + "epoch": 1.0, + "grad_norm": 1.596590513603264, + "learning_rate": 5.6480060406372e-12, + "loss": 0.2844, + "step": 34461 + }, + { + "epoch": 1.0, + "grad_norm": 2.0726885157554045, + "learning_rate": 4.964067922275106e-12, + "loss": 0.2575, + "step": 34462 + }, + { + "epoch": 1.0, + "grad_norm": 1.6097907462643006, + "learning_rate": 4.324254815890605e-12, + "loss": 0.2764, + "step": 34463 + }, + { + "epoch": 1.0, + "grad_norm": 1.333715852953316, + "learning_rate": 3.728566726479699e-12, + "loss": 0.2798, + "step": 34464 + }, + { + "epoch": 1.0, + "grad_norm": 1.4168857330957758, + "learning_rate": 3.1770036595935026e-12, + "loss": 0.2841, + "step": 34465 + }, + { + "epoch": 1.0, + "grad_norm": 1.2243698068191387, + "learning_rate": 2.6695656202280207e-12, + "loss": 0.2621, + "step": 34466 + }, + { + "epoch": 1.0, + "grad_norm": 1.4316045562274866, + "learning_rate": 2.2062526128241446e-12, + "loss": 0.2833, + "step": 34467 + }, + { + "epoch": 1.0, + "grad_norm": 1.3879857663296395, + "learning_rate": 1.7870646412676551e-12, + "loss": 0.3127, + "step": 34468 + }, + { + "epoch": 1.0, + "grad_norm": 1.2020622616485839, + "learning_rate": 1.4120017094443328e-12, + "loss": 0.2541, + "step": 34469 + }, + { + "epoch": 1.0, + "grad_norm": 1.5910426531224553, + "learning_rate": 1.0810638206848468e-12, + "loss": 0.2963, + "step": 34470 + }, + { + "epoch": 1.0, + "grad_norm": 2.1166016756371695, + "learning_rate": 7.942509777647545e-13, + "loss": 0.2518, + "step": 34471 + }, + { + "epoch": 1.0, + "grad_norm": 1.5234773547467817, + "learning_rate": 5.515631834596135e-13, + "loss": 0.2721, + "step": 34472 + }, + { + "epoch": 1.0, + "grad_norm": 1.5953833224746208, + "learning_rate": 3.530004399898701e-13, + "loss": 0.2724, + "step": 34473 + }, + { + "epoch": 1.0, + "grad_norm": 1.275967561813762, + "learning_rate": 1.9856274846574709e-13, + "loss": 0.2505, + "step": 34474 + }, + { + "epoch": 1.0, + "grad_norm": 1.2728634660174734, + "learning_rate": 8.825011055257904e-14, + "loss": 0.2528, + "step": 34475 + }, + { + "epoch": 1.0, + "grad_norm": 1.73439123421526, + "learning_rate": 2.2062527915700514e-14, + "loss": 0.3071, + "step": 34476 + }, + { + "epoch": 1.0, + "grad_norm": 1.2960731363340376, + "learning_rate": 0.0, + "loss": 0.2713, + "step": 34477 + }, + { + "epoch": 1.0, + "step": 34477, + "total_flos": 5675725729562624.0, + "train_loss": 0.34586435998860876, + "train_runtime": 180586.9564, + "train_samples_per_second": 24.438, + "train_steps_per_second": 0.191 + } + ], + "logging_steps": 1.0, + "max_steps": 34477, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3000, + "total_flos": 5675725729562624.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}